618 files changed, 32292 insertions, 14682 deletions
diff --git a/Documentation/ABI/testing/configfs-rdma_cm b/Documentation/ABI/testing/configfs-rdma_cm
new file mode 100644
index 000000000000..5c389aaf5291
--- /dev/null
+++ b/Documentation/ABI/testing/configfs-rdma_cm
@@ -0,0 +1,22 @@
+What:		/config/rdma_cm
+Date:		November 29, 2015
+KernelVersion:	4.4.0
+Description:	Interface is used to configure RDMA-capable HCAs with respect
+		to RDMA-CM attributes.
+
+		Attributes are visible only when configfs is mounted. To mount
+		configfs in the /config directory use:
+		# mount -t configfs none /config/
+
+		In order to set parameters related to a specific HCA, a directory
+		for this HCA has to be created:
+		mkdir -p /config/rdma_cm/<hca>
+
+
+What:		/config/rdma_cm/<hca>/ports/<port-num>/default_roce_mode
+Date:		November 29, 2015
+KernelVersion:	4.4.0
+Description:	RDMA-CM based connections from HCA <hca> at port <port-num>
+		will be initiated with this RoCE type as default.
+		The possible RoCE types are either "IB/RoCE v1" or "RoCE v2".
+		This parameter has RW access.
diff --git a/Documentation/ABI/testing/sysfs-class-infiniband b/Documentation/ABI/testing/sysfs-class-infiniband
new file mode 100644
index 000000000000..a86abe66a316
--- /dev/null
+++ b/Documentation/ABI/testing/sysfs-class-infiniband
@@ -0,0 +1,16 @@
+What:		/sys/class/infiniband/<hca>/ports/<port-number>/gid_attrs/ndevs/<gid-index>
+Date:		November 29, 2015
+KernelVersion:	4.4.0
+Contact:	linux-rdma@vger.kernel.org
+Description:	The name of the net-device associated with the GID
+		at index <gid-index>.
+
+What:		/sys/class/infiniband/<hca>/ports/<port-number>/gid_attrs/types/<gid-index>
+Date:		November 29, 2015
+KernelVersion:	4.4.0
+Contact:	linux-rdma@vger.kernel.org
+Description:	The RoCE type of the GID at index <gid-index>. This is
+		either "IB/RoCE v1" for IB and RoCE v1 based GIDs
+		or "RoCE v2" for RoCE v2 based GIDs.
+
+
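A minimal C sketch of the flow these two new ABI files describe, assuming a hypothetical HCA named "mlx5_0", port 1 and GID index 0, with configfs already mounted on /config and the <hca> directory created as shown above: it writes the default RoCE mode through configfs, then reads one GID's RoCE type back through sysfs.

    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    #define HCA  "mlx5_0"   /* hypothetical HCA name; substitute your device */
    #define PORT 1

    int main(void)
    {
        char path[128], buf[32];
        ssize_t n;
        int fd;

        /* Select "RoCE v2" as the default for new RDMA-CM connections. */
        snprintf(path, sizeof(path),
                 "/config/rdma_cm/%s/ports/%d/default_roce_mode", HCA, PORT);
        fd = open(path, O_WRONLY);
        if (fd < 0 || write(fd, "RoCE v2", strlen("RoCE v2")) < 0)
            perror(path);
        if (fd >= 0)
            close(fd);

        /* Read back the RoCE type of the GID at index 0. */
        snprintf(path, sizeof(path),
                 "/sys/class/infiniband/%s/ports/%d/gid_attrs/types/0", HCA, PORT);
        fd = open(path, O_RDONLY);
        if (fd >= 0 && (n = read(fd, buf, sizeof(buf) - 1)) > 0) {
            buf[n] = '\0';
            printf("GID 0 type: %s", buf);
        }
        if (fd >= 0)
            close(fd);
        return 0;
    }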
diff --git a/Documentation/devicetree/bindings/input/gpio-keys.txt b/Documentation/devicetree/bindings/input/gpio-keys.txt
index cf1333d1dd52..21641236c095 100644
--- a/Documentation/devicetree/bindings/input/gpio-keys.txt
+++ b/Documentation/devicetree/bindings/input/gpio-keys.txt
@@ -6,6 +6,7 @@ Required properties:
 Optional properties:
 - autorepeat: Boolean, Enable auto repeat feature of Linux input subsystem.
+ - label: String, name of the input device.
 
 Each button (key) is represented as a sub-node of "gpio-keys":
 Subnode properties:
diff --git a/Documentation/devicetree/bindings/thermal/rockchip-thermal.txt b/Documentation/devicetree/bindings/thermal/rockchip-thermal.txt
index 0dfa60d88dd3..08efe6bc2193 100644
--- a/Documentation/devicetree/bindings/thermal/rockchip-thermal.txt
+++ b/Documentation/devicetree/bindings/thermal/rockchip-thermal.txt
@@ -2,8 +2,10 @@
 Required properties:
 - compatible : should be "rockchip,<name>-tsadc"
+   "rockchip,rk3228-tsadc": found on RK3228 SoCs
    "rockchip,rk3288-tsadc": found on RK3288 SoCs
    "rockchip,rk3368-tsadc": found on RK3368 SoCs
+   "rockchip,rk3399-tsadc": found on RK3399 SoCs
 - reg : physical base address of the controller and length of memory mapped region.
 - interrupts : The interrupt number to the cpu. The interrupt specifier format
   depends on the interrupt controller.
diff --git a/Documentation/infiniband/core_locking.txt b/Documentation/infiniband/core_locking.txt
index e1678542279a..4b1f36b6ada0 100644
--- a/Documentation/infiniband/core_locking.txt
+++ b/Documentation/infiniband/core_locking.txt
@@ -15,7 +15,6 @@ Sleeping and interrupt context
     modify_ah
     query_ah
     destroy_ah
-    bind_mw
     post_send
     post_recv
     poll_cq
@@ -31,7 +30,6 @@ Sleeping and interrupt context
     ib_modify_ah
     ib_query_ah
     ib_destroy_ah
-    ib_bind_mw
     ib_post_send
     ib_post_recv
     ib_req_notify_cq
diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt
index f4cbfe0ba108..edec3a3e648d 100644
--- a/Documentation/kernel-per-CPU-kthreads.txt
+++ b/Documentation/kernel-per-CPU-kthreads.txt
@@ -90,7 +90,7 @@ BLOCK_SOFTIRQ: Do all of the following:
 	from being initiated from tasks that might run on the CPU to
 	be de-jittered. (It is OK to force this CPU offline and then
 	bring it back online before you start your application.)
-BLOCK_IOPOLL_SOFTIRQ: Do all of the following:
+IRQ_POLL_SOFTIRQ: Do all of the following:
 1.	Force block-device interrupts onto some other CPU.
 2.	Initiate any block I/O and block-I/O polling on other CPUs.
 3.	Once your application has started, prevent CPU-hotplug operations
diff --git a/Documentation/sysctl/fs.txt b/Documentation/sysctl/fs.txt
index 88152f214f48..302b5ed616a6 100644
--- a/Documentation/sysctl/fs.txt
+++ b/Documentation/sysctl/fs.txt
@@ -32,6 +32,8 @@ Currently, these files are in /proc/sys/fs:
 - nr_open
 - overflowuid
 - overflowgid
+- pipe-user-pages-hard
+- pipe-user-pages-soft
 - protected_hardlinks
 - protected_symlinks
 - suid_dumpable
@@ -159,6 +161,27 @@ The default is 65534.
 
 ==============================================================
 
+pipe-user-pages-hard:
+
+Maximum total number of pages a non-privileged user may allocate for pipes.
+Once this limit is reached, no new pipes may be allocated until usage goes
+below the limit again. When set to 0, no limit is applied, which is the default
+setting.
+
+==============================================================
+
+pipe-user-pages-soft:
+
+Maximum total number of pages a non-privileged user may allocate for pipes
+before the pipe size gets limited to a single page. Once this limit is reached,
+new pipes will be limited to a single page in size for this user in order to
+limit total memory usage, and trying to increase them using fcntl() will be
+denied until usage goes below the limit again. The default value allows
+allocating up to 1024 pipes at their default size. When set to 0, no limit is
+applied.
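A minimal C sketch of what the soft limit means for user space, assuming Linux's fcntl() pipe-resizing interface (F_GETPIPE_SZ/F_SETPIPE_SZ, glibc with _GNU_SOURCE): it grows a pipe's buffer and, once the calling user's pipe pages exceed pipe-user-pages-soft, the unprivileged resize above one page is denied as described above.

    #define _GNU_SOURCE         /* for F_GETPIPE_SZ / F_SETPIPE_SZ */
    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    int main(void)
    {
        int fds[2];

        if (pipe(fds)) {
            perror("pipe");
            return 1;
        }
        /* Default capacity, typically 16 pages (65536 bytes). */
        printf("default pipe size: %d bytes\n", fcntl(fds[0], F_GETPIPE_SZ));

        /* Ask for 1 MiB; once this user is over pipe-user-pages-soft,
         * unprivileged growth beyond a single page fails with EPERM. */
        if (fcntl(fds[0], F_SETPIPE_SZ, 1 << 20) < 0)
            perror("F_SETPIPE_SZ");
        else
            printf("grown pipe size: %d bytes\n", fcntl(fds[0], F_GETPIPE_SZ));

        close(fds[0]);
        close(fds[1]);
        return 0;
    }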
+ +============================================================== + protected_hardlinks: A long-standing class of security issues is the hardlink-based diff --git a/MAINTAINERS b/MAINTAINERS index 4ad9d7c46cac..78f71f471bd6 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -3665,13 +3665,12 @@ F: drivers/scsi/dpt* F: drivers/scsi/dpt/ DRBD DRIVER -P: Philipp Reisner -P: Lars Ellenberg -M: drbd-dev@lists.linbit.com -L: drbd-user@lists.linbit.com +M: Philipp Reisner <philipp.reisner@linbit.com> +M: Lars Ellenberg <lars.ellenberg@linbit.com> +L: drbd-dev@lists.linbit.com W: http://www.drbd.org -T: git git://git.drbd.org/linux-2.6-drbd.git drbd -T: git git://git.drbd.org/drbd-8.3.git +T: git git://git.linbit.com/linux-drbd.git +T: git git://git.linbit.com/drbd-8.4.git S: Supported F: drivers/block/drbd/ F: lib/lru_cache.c @@ -7150,27 +7149,45 @@ W: https://linuxtv.org S: Odd Fixes F: drivers/media/radio/radio-miropcm20* -Mellanox MLX5 core VPI driver -M: Eli Cohen <eli@mellanox.com> +MELLANOX MLX4 core VPI driver +M: Yishai Hadas <yishaih@mellanox.com> L: netdev@vger.kernel.org L: linux-rdma@vger.kernel.org W: http://www.mellanox.com Q: http://patchwork.ozlabs.org/project/netdev/list/ +S: Supported +F: drivers/net/ethernet/mellanox/mlx4/ +F: include/linux/mlx4/ + +MELLANOX MLX4 IB driver +M: Yishai Hadas <yishaih@mellanox.com> +L: linux-rdma@vger.kernel.org +W: http://www.mellanox.com Q: http://patchwork.kernel.org/project/linux-rdma/list/ -T: git git://openfabrics.org/~eli/connect-ib.git +S: Supported +F: drivers/infiniband/hw/mlx4/ +F: include/linux/mlx4/ + +MELLANOX MLX5 core VPI driver +M: Matan Barak <matanb@mellanox.com> +M: Leon Romanovsky <leonro@mellanox.com> +L: netdev@vger.kernel.org +L: linux-rdma@vger.kernel.org +W: http://www.mellanox.com +Q: http://patchwork.ozlabs.org/project/netdev/list/ S: Supported F: drivers/net/ethernet/mellanox/mlx5/core/ F: include/linux/mlx5/ -Mellanox MLX5 IB driver -M: Eli Cohen <eli@mellanox.com> +MELLANOX MLX5 IB driver +M: Matan Barak <matanb@mellanox.com> +M: Leon Romanovsky <leonro@mellanox.com> L: linux-rdma@vger.kernel.org W: http://www.mellanox.com Q: http://patchwork.kernel.org/project/linux-rdma/list/ -T: git git://openfabrics.org/~eli/connect-ib.git S: Supported -F: include/linux/mlx5/ F: drivers/infiniband/hw/mlx5/ +F: include/linux/mlx5/ MELEXIS MLX90614 DRIVER M: Crt Mori <cmo@melexis.com> @@ -7701,6 +7718,12 @@ W: https://github.com/jonmason/ntb/wiki T: git git://github.com/jonmason/ntb.git F: drivers/ntb/hw/intel/ +NTB AMD DRIVER +M: Xiangliang Yu <Xiangliang.Yu@amd.com> +L: linux-ntb@googlegroups.com +S: Supported +F: drivers/ntb/hw/amd/ + NTFS FILESYSTEM M: Anton Altaparmakov <anton@tuxera.com> L: linux-ntfs-dev@lists.sourceforge.net @@ -10452,9 +10475,11 @@ S: Maintained F: drivers/net/ethernet/dlink/sundance.c SUPERH +M: Yoshinori Sato <ysato@users.sourceforge.jp> +M: Rich Felker <dalias@libc.org> L: linux-sh@vger.kernel.org Q: http://patchwork.kernel.org/project/linux-sh/list/ -S: Orphan +S: Maintained F: Documentation/sh/ F: arch/sh/ F: drivers/sh/ diff --git a/arch/arm/Kconfig.debug b/arch/arm/Kconfig.debug index 5c0e5cc8ed10..c6b6175d0203 100644 --- a/arch/arm/Kconfig.debug +++ b/arch/arm/Kconfig.debug @@ -153,10 +153,9 @@ choice mobile SoCs in the Kona family of chips (e.g. bcm28155, bcm11351, etc...) 
- config DEBUG_BCM63XX + config DEBUG_BCM63XX_UART bool "Kernel low-level debugging on BCM63XX UART" depends on ARCH_BCM_63XX - select DEBUG_UART_BCM63XX config DEBUG_BERLIN_UART bool "Marvell Berlin SoC Debug UART" @@ -1414,7 +1413,7 @@ config DEBUG_LL_INCLUDE default "debug/vf.S" if DEBUG_VF_UART default "debug/vt8500.S" if DEBUG_VT8500_UART0 default "debug/zynq.S" if DEBUG_ZYNQ_UART0 || DEBUG_ZYNQ_UART1 - default "debug/bcm63xx.S" if DEBUG_UART_BCM63XX + default "debug/bcm63xx.S" if DEBUG_BCM63XX_UART default "debug/digicolor.S" if DEBUG_DIGICOLOR_UA0 default "mach/debug-macro.S" @@ -1428,10 +1427,6 @@ config DEBUG_UART_8250 ARCH_IOP13XX || ARCH_IOP32X || ARCH_IOP33X || ARCH_IXP4XX || \ ARCH_RPC -# Compatibility options for BCM63xx -config DEBUG_UART_BCM63XX - def_bool ARCH_BCM_63XX - config DEBUG_UART_PHYS hex "Physical base address of debug UART" default 0x00100a00 if DEBUG_NETX_UART @@ -1529,7 +1524,7 @@ config DEBUG_UART_PHYS default 0xfffb0000 if DEBUG_OMAP1UART1 || DEBUG_OMAP7XXUART1 default 0xfffb0800 if DEBUG_OMAP1UART2 || DEBUG_OMAP7XXUART2 default 0xfffb9800 if DEBUG_OMAP1UART3 || DEBUG_OMAP7XXUART3 - default 0xfffe8600 if DEBUG_UART_BCM63XX + default 0xfffe8600 if DEBUG_BCM63XX_UART default 0xfffff700 if ARCH_IOP33X depends on ARCH_EP93XX || \ DEBUG_LL_UART_8250 || DEBUG_LL_UART_PL01X || \ @@ -1542,7 +1537,7 @@ config DEBUG_UART_PHYS DEBUG_RMOBILE_SCIFA0 || DEBUG_RMOBILE_SCIFA1 || \ DEBUG_RMOBILE_SCIFA4 || DEBUG_S3C24XX_UART || \ DEBUG_S3C64XX_UART || \ - DEBUG_UART_BCM63XX || DEBUG_ASM9260_UART || \ + DEBUG_BCM63XX_UART || DEBUG_ASM9260_UART || \ DEBUG_SIRFSOC_UART || DEBUG_DIGICOLOR_UA0 || \ DEBUG_AT91_UART @@ -1588,7 +1583,7 @@ config DEBUG_UART_VIRT default 0xfb10c000 if DEBUG_REALVIEW_PB1176_PORT default 0xfc40ab00 if DEBUG_BRCMSTB_UART default 0xfc705000 if DEBUG_ZTE_ZX - default 0xfcfe8600 if DEBUG_UART_BCM63XX + default 0xfcfe8600 if DEBUG_BCM63XX_UART default 0xfd000000 if DEBUG_SPEAR3XX || DEBUG_SPEAR13XX default 0xfd012000 if DEBUG_MVEBU_UART0_ALTERNATE && ARCH_MV78XX0 default 0xfd883000 if DEBUG_ALPINE_UART0 @@ -1638,7 +1633,7 @@ config DEBUG_UART_VIRT DEBUG_NETX_UART || \ DEBUG_QCOM_UARTDM || DEBUG_S3C24XX_UART || \ DEBUG_S3C64XX_UART || \ - DEBUG_UART_BCM63XX || DEBUG_ASM9260_UART || \ + DEBUG_BCM63XX_UART || DEBUG_ASM9260_UART || \ DEBUG_SIRFSOC_UART || DEBUG_DIGICOLOR_UA0 config DEBUG_UART_8250_SHIFT diff --git a/arch/arm/boot/dts/r8a7740-armadillo800eva.dts b/arch/arm/boot/dts/r8a7740-armadillo800eva.dts index 78a21f2828df..c548cabb102f 100644 --- a/arch/arm/boot/dts/r8a7740-armadillo800eva.dts +++ b/arch/arm/boot/dts/r8a7740-armadillo800eva.dts @@ -180,7 +180,7 @@ }; &extal1_clk { - clock-frequency = <25000000>; + clock-frequency = <24000000>; }; &extal2_clk { clock-frequency = <48000000>; diff --git a/arch/arm/mach-realview/Makefile b/arch/arm/mach-realview/Makefile index 8be6632407d8..dae8d86ef4cc 100644 --- a/arch/arm/mach-realview/Makefile +++ b/arch/arm/mach-realview/Makefile @@ -4,10 +4,9 @@ ccflags-$(CONFIG_ARCH_MULTIPLATFORM) := -I$(srctree)/$(src)/include \ -I$(srctree)/arch/arm/plat-versatile/include - +obj-y := core.o obj-$(CONFIG_REALVIEW_DT) += realview-dt.o obj-$(CONFIG_SMP) += platsmp-dt.o -obj-y := core.o ifdef CONFIG_ATAGS obj-$(CONFIG_MACH_REALVIEW_EB) += realview_eb.o diff --git a/arch/arm/mach-tegra/Kconfig b/arch/arm/mach-tegra/Kconfig index a90f3556017f..0fa8b84ed657 100644 --- a/arch/arm/mach-tegra/Kconfig +++ b/arch/arm/mach-tegra/Kconfig @@ -13,57 +13,5 @@ menuconfig ARCH_TEGRA select ARCH_HAS_RESET_CONTROLLER select RESET_CONTROLLER 
select SOC_BUS - select USB_ULPI if USB_PHY - select USB_ULPI_VIEWPORT if USB_PHY help This enables support for NVIDIA Tegra based systems. - -if ARCH_TEGRA - -config ARCH_TEGRA_2x_SOC - bool "Enable support for Tegra20 family" - select ARCH_NEEDS_CPU_IDLE_COUPLED if SMP - select ARM_ERRATA_720789 - select ARM_ERRATA_754327 if SMP - select ARM_ERRATA_764369 if SMP - select PINCTRL_TEGRA20 - select PL310_ERRATA_727915 if CACHE_L2X0 - select PL310_ERRATA_769419 if CACHE_L2X0 - select TEGRA_TIMER - help - Support for NVIDIA Tegra AP20 and T20 processors, based on the - ARM CortexA9MP CPU and the ARM PL310 L2 cache controller - -config ARCH_TEGRA_3x_SOC - bool "Enable support for Tegra30 family" - select ARM_ERRATA_754322 - select ARM_ERRATA_764369 if SMP - select PINCTRL_TEGRA30 - select PL310_ERRATA_769419 if CACHE_L2X0 - select TEGRA_TIMER - help - Support for NVIDIA Tegra T30 processor family, based on the - ARM CortexA9MP CPU and the ARM PL310 L2 cache controller - -config ARCH_TEGRA_114_SOC - bool "Enable support for Tegra114 family" - select ARM_ERRATA_798181 if SMP - select ARM_L1_CACHE_SHIFT_6 - select HAVE_ARM_ARCH_TIMER - select PINCTRL_TEGRA114 - select TEGRA_TIMER - help - Support for NVIDIA Tegra T114 processor family, based on the - ARM CortexA15MP CPU - -config ARCH_TEGRA_124_SOC - bool "Enable support for Tegra124 family" - select ARM_L1_CACHE_SHIFT_6 - select HAVE_ARM_ARCH_TIMER - select PINCTRL_TEGRA124 - select TEGRA_TIMER - help - Support for NVIDIA Tegra T124 processor family, based on the - ARM CortexA15MP CPU - -endif diff --git a/arch/arm/mach-tegra/sleep-tegra20.S b/arch/arm/mach-tegra/sleep-tegra20.S index e6b684e14322..f5d19667484e 100644 --- a/arch/arm/mach-tegra/sleep-tegra20.S +++ b/arch/arm/mach-tegra/sleep-tegra20.S @@ -231,8 +231,11 @@ ENDPROC(tegra20_cpu_is_resettable_soon) * tegra20_tear_down_core in IRAM */ ENTRY(tegra20_sleep_core_finish) + mov r4, r0 /* Flush, disable the L1 data cache and exit SMP */ + mov r0, #TEGRA_FLUSH_CACHE_ALL bl tegra_disable_clean_inv_dcache + mov r0, r4 mov32 r3, tegra_shut_off_mmu add r3, r3, r0 diff --git a/arch/arm/mach-tegra/sleep-tegra30.S b/arch/arm/mach-tegra/sleep-tegra30.S index 9a2f0b051e10..16e5ff03383c 100644 --- a/arch/arm/mach-tegra/sleep-tegra30.S +++ b/arch/arm/mach-tegra/sleep-tegra30.S @@ -242,8 +242,11 @@ ENDPROC(tegra30_cpu_shutdown) * tegra30_tear_down_core in IRAM */ ENTRY(tegra30_sleep_core_finish) + mov r4, r0 /* Flush, disable the L1 data cache and exit SMP */ + mov r0, #TEGRA_FLUSH_CACHE_ALL bl tegra_disable_clean_inv_dcache + mov r0, r4 /* * Preload all the address literals that are needed for the diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 534a60ae282e..0eca3812527e 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -1200,10 +1200,7 @@ error: while (i--) if (pages[i]) __free_pages(pages[i], 0); - if (array_size <= PAGE_SIZE) - kfree(pages); - else - vfree(pages); + kvfree(pages); return NULL; } @@ -1211,7 +1208,6 @@ static int __iommu_free_buffer(struct device *dev, struct page **pages, size_t size, struct dma_attrs *attrs) { int count = size >> PAGE_SHIFT; - int array_size = count * sizeof(struct page *); int i; if (dma_get_attr(DMA_ATTR_FORCE_CONTIGUOUS, attrs)) { @@ -1222,10 +1218,7 @@ static int __iommu_free_buffer(struct device *dev, struct page **pages, __free_pages(pages[i], 0); } - if (array_size <= PAGE_SIZE) - kfree(pages); - else - vfree(pages); + kvfree(pages); return 0; } diff --git a/arch/arm64/Kconfig.platforms 
b/arch/arm64/Kconfig.platforms index 2c400412b3bc..21074f674bde 100644 --- a/arch/arm64/Kconfig.platforms +++ b/arch/arm64/Kconfig.platforms @@ -105,18 +105,6 @@ config ARCH_TEGRA help This enables support for the NVIDIA Tegra SoC family. -config ARCH_TEGRA_132_SOC - bool "NVIDIA Tegra132 SoC" - depends on ARCH_TEGRA - select PINCTRL_TEGRA124 - select USB_ULPI if USB_PHY - select USB_ULPI_VIEWPORT if USB_PHY - help - Enable support for NVIDIA Tegra132 SoC, based on the Denver - ARMv8 CPU. The Tegra132 SoC is similar to the Tegra124 SoC, - but contains an NVIDIA Denver CPU complex in place of - Tegra124's "4+1" Cortex-A15 CPU complex. - config ARCH_SPRD bool "Spreadtrum SoC platform" help diff --git a/arch/arm64/boot/dts/Makefile b/arch/arm64/boot/dts/Makefile index 76e7510835b2..f832b8a7453a 100644 --- a/arch/arm64/boot/dts/Makefile +++ b/arch/arm64/boot/dts/Makefile @@ -9,6 +9,7 @@ dts-dirs += freescale dts-dirs += hisilicon dts-dirs += marvell dts-dirs += mediatek +dts-dirs += nvidia dts-dirs += qcom dts-dirs += renesas dts-dirs += rockchip diff --git a/arch/arm64/boot/dts/nvidia/Makefile b/arch/arm64/boot/dts/nvidia/Makefile new file mode 100644 index 000000000000..a7e865da1005 --- /dev/null +++ b/arch/arm64/boot/dts/nvidia/Makefile @@ -0,0 +1,7 @@ +dtb-$(CONFIG_ARCH_TEGRA_132_SOC) += tegra132-norrin.dtb +dtb-$(CONFIG_ARCH_TEGRA_210_SOC) += tegra210-p2371-0000.dtb +dtb-$(CONFIG_ARCH_TEGRA_210_SOC) += tegra210-p2371-2180.dtb +dtb-$(CONFIG_ARCH_TEGRA_210_SOC) += tegra210-p2571.dtb + +always := $(dtb-y) +clean-files := *.dtb diff --git a/arch/arm64/boot/dts/nvidia/tegra132-norrin.dts b/arch/arm64/boot/dts/nvidia/tegra132-norrin.dts new file mode 100644 index 000000000000..7dfe1c085966 --- /dev/null +++ b/arch/arm64/boot/dts/nvidia/tegra132-norrin.dts @@ -0,0 +1,1130 @@ +/dts-v1/; + +#include <dt-bindings/input/input.h> +#include "tegra132.dtsi" + +/ { + model = "NVIDIA Tegra132 Norrin"; + compatible = "nvidia,norrin", "nvidia,tegra132", "nvidia,tegra124"; + + aliases { + rtc0 = "/i2c@0,7000d000/as3722@40"; + rtc1 = "/rtc@0,7000e000"; + }; + + memory { + device_type = "memory"; + reg = <0x0 0x80000000 0x0 0x80000000>; + }; + + host1x@0,50000000 { + hdmi@0,54280000 { + status = "disabled"; + + vdd-supply = <&vdd_3v3_hdmi>; + pll-supply = <&vdd_hdmi_pll>; + hdmi-supply = <&vdd_5v0_hdmi>; + + nvidia,ddc-i2c-bus = <&hdmi_ddc>; + nvidia,hpd-gpio = + <&gpio TEGRA_GPIO(N, 7) GPIO_ACTIVE_HIGH>; + }; + + sor@0,54540000 { + status = "okay"; + + nvidia,dpaux = <&dpaux>; + nvidia,panel = <&panel>; + }; + + dpaux: dpaux@0,545c0000 { + vdd-supply = <&vdd_3v3_panel>; + status = "okay"; + }; + }; + + gpu@0,57000000 { + status = "okay"; + + vdd-supply = <&vdd_gpu>; + }; + + pinmux@0,70000868 { + pinctrl-names = "default"; + pinctrl-0 = <&pinmux_default>; + + pinmux_default: pinmux@0 { + dap_mclk1_pw4 { + nvidia,pins = "dap_mclk1_pw4"; + nvidia,function = "extperiph1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + dap2_din_pa4 { + nvidia,pins = "dap2_din_pa4"; + nvidia,function = "i2s1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + dap2_dout_pa5 { + nvidia,pins = "dap2_dout_pa5", + "dap2_fs_pa2", + "dap2_sclk_pa3"; + nvidia,function = "i2s1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + dap3_dout_pp2 { + nvidia,pins = "dap3_dout_pp2"; + nvidia,function = 
"i2s2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + dvfs_pwm_px0 { + nvidia,pins = "dvfs_pwm_px0", + "dvfs_clk_px2"; + nvidia,function = "cldvfs"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + ulpi_clk_py0 { + nvidia,pins = "ulpi_clk_py0", + "ulpi_nxt_py2", + "ulpi_stp_py3"; + nvidia,function = "spi1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + ulpi_dir_py1 { + nvidia,pins = "ulpi_dir_py1"; + nvidia,function = "spi1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + cam_i2c_scl_pbb1 { + nvidia,pins = "cam_i2c_scl_pbb1", + "cam_i2c_sda_pbb2"; + nvidia,function = "i2c3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,lock = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_ENABLE>; + }; + gen2_i2c_scl_pt5 { + nvidia,pins = "gen2_i2c_scl_pt5", + "gen2_i2c_sda_pt6"; + nvidia,function = "i2c2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,lock = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_ENABLE>; + }; + pj7 { + nvidia,pins = "pj7"; + nvidia,function = "uartd"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + spdif_in_pk6 { + nvidia,pins = "spdif_in_pk6"; + nvidia,function = "spdif"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + pk7 { + nvidia,pins = "pk7"; + nvidia,function = "uartd"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + pg4 { + nvidia,pins = "pg4", + "pg5", + "pg6", + "pi3"; + nvidia,function = "spi4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + pg7 { + nvidia,pins = "pg7"; + nvidia,function = "spi4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + ph1 { + nvidia,pins = "ph1"; + nvidia,function = "pwm1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + pk0 { + nvidia,pins = "pk0", + "kb_row15_ps7", + "clk_32k_out_pa0"; + nvidia,function = "soc"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + sdmmc1_clk_pz0 { + nvidia,pins = "sdmmc1_clk_pz0"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + sdmmc1_cmd_pz1 { + nvidia,pins = "sdmmc1_cmd_pz1", + "sdmmc1_dat0_py7", + "sdmmc1_dat1_py6", + "sdmmc1_dat2_py5", + "sdmmc1_dat3_py4"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + sdmmc3_clk_pa6 { + nvidia,pins = "sdmmc3_clk_pa6"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = 
<TEGRA_PIN_ENABLE>; + }; + sdmmc3_cmd_pa7 { + nvidia,pins = "sdmmc3_cmd_pa7", + "sdmmc3_dat0_pb7", + "sdmmc3_dat1_pb6", + "sdmmc3_dat2_pb5", + "sdmmc3_dat3_pb4", + "kb_col4_pq4", + "sdmmc3_clk_lb_out_pee4", + "sdmmc3_clk_lb_in_pee5", + "sdmmc3_cd_n_pv2"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + sdmmc4_clk_pcc4 { + nvidia,pins = "sdmmc4_clk_pcc4"; + nvidia,function = "sdmmc4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + sdmmc4_cmd_pt7 { + nvidia,pins = "sdmmc4_cmd_pt7", + "sdmmc4_dat0_paa0", + "sdmmc4_dat1_paa1", + "sdmmc4_dat2_paa2", + "sdmmc4_dat3_paa3", + "sdmmc4_dat4_paa4", + "sdmmc4_dat5_paa5", + "sdmmc4_dat6_paa6", + "sdmmc4_dat7_paa7"; + nvidia,function = "sdmmc4"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + mic_det_l { + nvidia,pins = "kb_row7_pr7"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + kb_row10_ps2 { + nvidia,pins = "kb_row10_ps2"; + nvidia,function = "uarta"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + kb_row9_ps1 { + nvidia,pins = "kb_row9_ps1"; + nvidia,function = "uarta"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + pwr_i2c_scl_pz6 { + nvidia,pins = "pwr_i2c_scl_pz6", + "pwr_i2c_sda_pz7"; + nvidia,function = "i2cpwr"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,lock = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_ENABLE>; + }; + jtag_rtck { + nvidia,pins = "jtag_rtck"; + nvidia,function = "rtck"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + clk_32k_in { + nvidia,pins = "clk_32k_in"; + nvidia,function = "clk"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + core_pwr_req { + nvidia,pins = "core_pwr_req"; + nvidia,function = "pwron"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + cpu_pwr_req { + nvidia,pins = "cpu_pwr_req"; + nvidia,function = "cpu"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + kb_col0_ap { + nvidia,pins = "kb_col0_pq0"; + nvidia,function = "rsvd4"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + en_vdd_sd { + nvidia,pins = "kb_row0_pr0"; + nvidia,function = "rsvd4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + lid_open { + nvidia,pins = "kb_row4_pr4"; + nvidia,function = "rsvd3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + pwr_int_n { + nvidia,pins = "pwr_int_n"; + nvidia,function = "pmi"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + 
reset_out_n { + nvidia,pins = "reset_out_n"; + nvidia,function = "reset_out_n"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + clk3_out_pee0 { + nvidia,pins = "clk3_out_pee0"; + nvidia,function = "extperiph3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + gen1_i2c_scl_pc4 { + nvidia,pins = "gen1_i2c_scl_pc4", + "gen1_i2c_sda_pc5"; + nvidia,function = "i2c1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,lock = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_ENABLE>; + }; + hdmi_cec_pee3 { + nvidia,pins = "hdmi_cec_pee3"; + nvidia,function = "cec"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,lock = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + hdmi_int_pn7 { + nvidia,pins = "hdmi_int_pn7"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + ddc_scl_pv4 { + nvidia,pins = "ddc_scl_pv4", + "ddc_sda_pv5"; + nvidia,function = "i2c4"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,lock = <TEGRA_PIN_DISABLE>; + nvidia,rcv-sel = <TEGRA_PIN_ENABLE>; + }; + usb_vbus_en0_pn4 { + nvidia,pins = "usb_vbus_en0_pn4", + "usb_vbus_en1_pn5", + "usb_vbus_en2_pff1"; + nvidia,function = "usb"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,lock = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + drive_sdio1 { + nvidia,pins = "drive_sdio1"; + nvidia,high-speed-mode = <TEGRA_PIN_ENABLE>; + nvidia,schmitt = <TEGRA_PIN_DISABLE>; + nvidia,pull-down-strength = <36>; + nvidia,pull-up-strength = <20>; + nvidia,slew-rate-rising = <TEGRA_PIN_SLEW_RATE_SLOW>; + nvidia,slew-rate-falling = <TEGRA_PIN_SLEW_RATE_SLOW>; + }; + drive_sdio3 { + nvidia,pins = "drive_sdio3"; + nvidia,high-speed-mode = <TEGRA_PIN_ENABLE>; + nvidia,schmitt = <TEGRA_PIN_DISABLE>; + nvidia,pull-down-strength = <22>; + nvidia,pull-up-strength = <36>; + nvidia,slew-rate-rising = <TEGRA_PIN_SLEW_RATE_FASTEST>; + nvidia,slew-rate-falling = <TEGRA_PIN_SLEW_RATE_FASTEST>; + }; + drive_gma { + nvidia,pins = "drive_gma"; + nvidia,high-speed-mode = <TEGRA_PIN_ENABLE>; + nvidia,schmitt = <TEGRA_PIN_DISABLE>; + nvidia,pull-down-strength = <2>; + nvidia,pull-up-strength = <1>; + nvidia,slew-rate-rising = <TEGRA_PIN_SLEW_RATE_FASTEST>; + nvidia,slew-rate-falling = <TEGRA_PIN_SLEW_RATE_FASTEST>; + nvidia,drive-type = <1>; + }; + ac_ok { + nvidia,pins = "pj0"; + nvidia,function = "gmi"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + codec_irq_l { + nvidia,pins = "ph4"; + nvidia,function = "gmi"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + lcd_bl_en { + nvidia,pins = "ph2"; + nvidia,function = "gmi"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + touch_irq_l { + nvidia,pins = "gpio_w3_aud_pw3"; + nvidia,function = "spi6"; + nvidia,pull = 
<TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + tpm_davint_l { + nvidia,pins = "ph6"; + nvidia,function = "gmi"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + ts_irq_l { + nvidia,pins = "pk2"; + nvidia,function = "gmi"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + ts_reset_l { + nvidia,pins = "pk4"; + nvidia,function = "gmi"; + nvidia,pull = <1>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + ts_shdn_l { + nvidia,pins = "pk1"; + nvidia,function = "gmi"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + ph7 { + nvidia,pins = "ph7"; + nvidia,function = "gmi"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + sensor_irq_l { + nvidia,pins = "pi6"; + nvidia,function = "gmi"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + wifi_en { + nvidia,pins = "gpio_x7_aud_px7"; + nvidia,function = "rsvd4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + chromeos_write_protect { + nvidia,pins = "kb_row1_pr1"; + nvidia,function = "rsvd4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + hp_det_l { + nvidia,pins = "pi7"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + }; + soc_warm_reset_l { + nvidia,pins = "pi5"; + nvidia,function = "gmi"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + }; + }; + }; + + serial@0,70006000 { + status = "okay"; + }; + + pwm: pwm@0,7000a000 { + status = "okay"; + }; + + /* HDMI DDC */ + hdmi_ddc: i2c@0,7000c700 { + status = "okay"; + clock-frequency = <100000>; + }; + + i2c@0,7000d000 { + status = "okay"; + clock-frequency = <400000>; + + as3722: pmic@40 { + compatible = "ams,as3722"; + reg = <0x40>; + interrupts = <GIC_SPI 86 IRQ_TYPE_LEVEL_HIGH>; + + ams,system-power-controller; + + #interrupt-cells = <2>; + interrupt-controller; + + #gpio-cells = <2>; + gpio-controller; + + pinctrl-names = "default"; + pinctrl-0 = <&as3722_default>; + + as3722_default: pinmux@0 { + gpio0 { + pins = "gpio0"; + function = "gpio"; + bias-pull-down; + }; + + gpio1 { + pins = "gpio1"; + function = "gpio"; + bias-pull-up; + }; + + gpio2_4_7 { + pins = "gpio2", "gpio4", "gpio7"; + function = "gpio"; + bias-pull-up; + }; + + gpio3 { + pins = "gpio3"; + function = "gpio"; + bias-high-impedance; + }; + + gpio5 { + pins = "gpio5"; + function = "clk32k-out"; + bias-pull-down; + }; + + gpio6 { + pins = "gpio6"; + function = "clk32k-out"; + bias-pull-down; + }; + }; + + regulators { + vsup-sd2-supply = <&vdd_5v0_sys>; + vsup-sd3-supply = <&vdd_5v0_sys>; + vsup-sd4-supply = <&vdd_5v0_sys>; + vsup-sd5-supply = <&vdd_5v0_sys>; + vin-ldo0-supply = <&vdd_1v35_lp0>; + vin-ldo1-6-supply = <&vdd_3v3_sys>; + vin-ldo2-5-7-supply = <&vddio_1v8>; + vin-ldo3-4-supply = <&vdd_3v3_sys>; + vin-ldo9-10-supply = <&vdd_5v0_sys>; + vin-ldo11-supply = 
<&vdd_3v3_run>; + + sd0 { + regulator-name = "+VDD_CPU_AP"; + regulator-min-microvolt = <700000>; + regulator-max-microvolt = <1350000>; + regulator-max-microamp = <3500000>; + regulator-always-on; + regulator-boot-on; + ams,ext-control = <2>; + }; + + sd1 { + regulator-name = "+VDD_CORE"; + regulator-min-microvolt = <700000>; + regulator-max-microvolt = <1350000>; + regulator-max-microamp = <4000000>; + regulator-always-on; + regulator-boot-on; + ams,ext-control = <1>; + }; + + vdd_1v35_lp0: sd2 { + regulator-name = "+1.35V_LP0(sd2)"; + regulator-min-microvolt = <1350000>; + regulator-max-microvolt = <1350000>; + regulator-always-on; + regulator-boot-on; + }; + + sd3 { + regulator-name = "+1.35V_LP0(sd3)"; + regulator-min-microvolt = <1350000>; + regulator-max-microvolt = <1350000>; + regulator-always-on; + regulator-boot-on; + }; + + vdd_1v05_run: sd4 { + regulator-name = "+1.05V_RUN"; + regulator-min-microvolt = <1050000>; + regulator-max-microvolt = <1050000>; + }; + + vddio_1v8: sd5 { + regulator-name = "+1.8V_VDDIO"; + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <1800000>; + regulator-always-on; + regulator-boot-on; + }; + + vdd_gpu: sd6 { + regulator-name = "+VDD_GPU_AP"; + regulator-min-microvolt = <800000>; + regulator-max-microvolt = <1200000>; + regulator-min-microamp = <3500000>; + regulator-max-microamp = <3500000>; + regulator-always-on; + regulator-boot-on; + }; + + ldo0 { + regulator-name = "+1.05_RUN_AVDD"; + regulator-min-microvolt = <1050000>; + regulator-max-microvolt = <1050000>; + regulator-always-on; + regulator-boot-on; + ams,ext-control = <1>; + }; + + ldo1 { + regulator-name = "+1.8V_RUN_CAM"; + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <1800000>; + }; + + ldo2 { + regulator-name = "+1.2V_GEN_AVDD"; + regulator-min-microvolt = <1200000>; + regulator-max-microvolt = <1200000>; + regulator-always-on; + regulator-boot-on; + }; + + ldo3 { + regulator-name = "+1.00V_LP0_VDD_RTC"; + regulator-min-microvolt = <1000000>; + regulator-max-microvolt = <1000000>; + regulator-always-on; + regulator-boot-on; + ams,enable-tracking; + }; + + vdd_run_cam: ldo4 { + regulator-name = "+2.8V_RUN_CAM"; + regulator-min-microvolt = <2800000>; + regulator-max-microvolt = <2800000>; + }; + + ldo5 { + regulator-name = "+1.2V_RUN_CAM_FRONT"; + regulator-min-microvolt = <1200000>; + regulator-max-microvolt = <1200000>; + }; + + vddio_sdmmc3: ldo6 { + regulator-name = "+VDDIO_SDMMC3"; + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <3300000>; + }; + + ldo7 { + regulator-name = "+1.05V_RUN_CAM_REAR"; + regulator-min-microvolt = <1050000>; + regulator-max-microvolt = <1050000>; + }; + + ldo9 { + regulator-name = "+2.8V_RUN_TOUCH"; + regulator-min-microvolt = <2800000>; + regulator-max-microvolt = <2800000>; + }; + + ldo10 { + regulator-name = "+2.8V_RUN_CAM_AF"; + regulator-min-microvolt = <2800000>; + regulator-max-microvolt = <2800000>; + }; + + ldo11 { + regulator-name = "+1.8V_RUN_VPP_FUSE"; + regulator-min-microvolt = <1800000>; + regulator-max-microvolt = <1800000>; + }; + }; + }; + }; + + spi@0,7000d400 { + status = "okay"; + + ec: cros-ec@0 { + compatible = "google,cros-ec-spi"; + spi-max-frequency = <3000000>; + interrupt-parent = <&gpio>; + interrupts = <TEGRA_GPIO(C, 7) IRQ_TYPE_LEVEL_LOW>; + reg = <0>; + + google,cros-ec-spi-msg-delay = <2000>; + + i2c_20: i2c-tunnel { + compatible = "google,cros-ec-i2c-tunnel"; + #address-cells = <1>; + #size-cells = <0>; + + google,remote-bus = <0>; + + charger: bq24735 { + compatible = 
"ti,bq24735"; + reg = <0x9>; + interrupt-parent = <&gpio>; + interrupts = <TEGRA_GPIO(J, 0) + GPIO_ACTIVE_HIGH>; + ti,ac-detect-gpios = <&gpio + TEGRA_GPIO(J, 0) + GPIO_ACTIVE_HIGH>; + }; + + battery: smart-battery { + compatible = "sbs,sbs-battery"; + reg = <0xb>; + battery-name = "battery"; + sbs,i2c-retry-count = <2>; + sbs,poll-retry-count = <10>; + /* power-supplies = <&charger>; */ + }; + }; + + keyboard-controller { + compatible = "google,cros-ec-keyb"; + keypad,num-rows = <8>; + keypad,num-columns = <13>; + google,needs-ghost-filter; + linux,keymap = + <MATRIX_KEY(0x00, 0x01, KEY_LEFTMETA) + MATRIX_KEY(0x00, 0x02, KEY_F1) + MATRIX_KEY(0x00, 0x03, KEY_B) + MATRIX_KEY(0x00, 0x04, KEY_F10) + MATRIX_KEY(0x00, 0x06, KEY_N) + MATRIX_KEY(0x00, 0x08, KEY_EQUAL) + MATRIX_KEY(0x00, 0x0a, KEY_RIGHTALT) + + MATRIX_KEY(0x01, 0x01, KEY_ESC) + MATRIX_KEY(0x01, 0x02, KEY_F4) + MATRIX_KEY(0x01, 0x03, KEY_G) + MATRIX_KEY(0x01, 0x04, KEY_F7) + MATRIX_KEY(0x01, 0x06, KEY_H) + MATRIX_KEY(0x01, 0x08, KEY_APOSTROPHE) + MATRIX_KEY(0x01, 0x09, KEY_F9) + MATRIX_KEY(0x01, 0x0b, KEY_BACKSPACE) + + MATRIX_KEY(0x02, 0x00, KEY_LEFTCTRL) + MATRIX_KEY(0x02, 0x01, KEY_TAB) + MATRIX_KEY(0x02, 0x02, KEY_F3) + MATRIX_KEY(0x02, 0x03, KEY_T) + MATRIX_KEY(0x02, 0x04, KEY_F6) + MATRIX_KEY(0x02, 0x05, KEY_RIGHTBRACE) + MATRIX_KEY(0x02, 0x06, KEY_Y) + MATRIX_KEY(0x02, 0x07, KEY_102ND) + MATRIX_KEY(0x02, 0x08, KEY_LEFTBRACE) + MATRIX_KEY(0x02, 0x09, KEY_F8) + + MATRIX_KEY(0x03, 0x01, KEY_GRAVE) + MATRIX_KEY(0x03, 0x02, KEY_F2) + MATRIX_KEY(0x03, 0x03, KEY_5) + MATRIX_KEY(0x03, 0x04, KEY_F5) + MATRIX_KEY(0x03, 0x06, KEY_6) + MATRIX_KEY(0x03, 0x08, KEY_MINUS) + MATRIX_KEY(0x03, 0x0b, KEY_BACKSLASH) + + MATRIX_KEY(0x04, 0x00, KEY_RIGHTCTRL) + MATRIX_KEY(0x04, 0x01, KEY_A) + MATRIX_KEY(0x04, 0x02, KEY_D) + MATRIX_KEY(0x04, 0x03, KEY_F) + MATRIX_KEY(0x04, 0x04, KEY_S) + MATRIX_KEY(0x04, 0x05, KEY_K) + MATRIX_KEY(0x04, 0x06, KEY_J) + MATRIX_KEY(0x04, 0x08, KEY_SEMICOLON) + MATRIX_KEY(0x04, 0x09, KEY_L) + MATRIX_KEY(0x04, 0x0a, KEY_BACKSLASH) + MATRIX_KEY(0x04, 0x0b, KEY_ENTER) + + MATRIX_KEY(0x05, 0x01, KEY_Z) + MATRIX_KEY(0x05, 0x02, KEY_C) + MATRIX_KEY(0x05, 0x03, KEY_V) + MATRIX_KEY(0x05, 0x04, KEY_X) + MATRIX_KEY(0x05, 0x05, KEY_COMMA) + MATRIX_KEY(0x05, 0x06, KEY_M) + MATRIX_KEY(0x05, 0x07, KEY_LEFTSHIFT) + MATRIX_KEY(0x05, 0x08, KEY_SLASH) + MATRIX_KEY(0x05, 0x09, KEY_DOT) + MATRIX_KEY(0x05, 0x0b, KEY_SPACE) + + MATRIX_KEY(0x06, 0x01, KEY_1) + MATRIX_KEY(0x06, 0x02, KEY_3) + MATRIX_KEY(0x06, 0x03, KEY_4) + MATRIX_KEY(0x06, 0x04, KEY_2) + MATRIX_KEY(0x06, 0x05, KEY_8) + MATRIX_KEY(0x06, 0x06, KEY_7) + MATRIX_KEY(0x06, 0x08, KEY_0) + MATRIX_KEY(0x06, 0x09, KEY_9) + MATRIX_KEY(0x06, 0x0a, KEY_LEFTALT) + MATRIX_KEY(0x06, 0x0b, KEY_DOWN) + MATRIX_KEY(0x06, 0x0c, KEY_RIGHT) + + MATRIX_KEY(0x07, 0x01, KEY_Q) + MATRIX_KEY(0x07, 0x02, KEY_E) + MATRIX_KEY(0x07, 0x03, KEY_R) + MATRIX_KEY(0x07, 0x04, KEY_W) + MATRIX_KEY(0x07, 0x05, KEY_I) + MATRIX_KEY(0x07, 0x06, KEY_U) + MATRIX_KEY(0x07, 0x07, KEY_RIGHTSHIFT) + MATRIX_KEY(0x07, 0x08, KEY_P) + MATRIX_KEY(0x07, 0x09, KEY_O) + MATRIX_KEY(0x07, 0x0b, KEY_UP) + MATRIX_KEY(0x07, 0x0c, KEY_LEFT)>; + }; + }; + }; + + pmc@0,7000e400 { + nvidia,invert-interrupt; + nvidia,suspend-mode = <0>; + #wake-cells = <3>; + nvidia,cpu-pwr-good-time = <500>; + nvidia,cpu-pwr-off-time = <300>; + nvidia,core-pwr-good-time = <641 3845>; + nvidia,core-pwr-off-time = <61036>; + nvidia,core-power-req-active-high; + nvidia,sys-clock-req-active-high; + nvidia,reset-gpio = <&gpio TEGRA_GPIO(I, 5) GPIO_ACTIVE_LOW>; + 
}; + + /* WIFI/BT module */ + sdhci@0,700b0000 { + status = "disabled"; + }; + + /* external SD/MMC */ + sdhci@0,700b0400 { + cd-gpios = <&gpio TEGRA_GPIO(V, 2) GPIO_ACTIVE_LOW>; + power-gpios = <&gpio TEGRA_GPIO(R, 0) GPIO_ACTIVE_HIGH>; + wp-gpios = <&gpio TEGRA_GPIO(Q, 4) GPIO_ACTIVE_HIGH>; + status = "okay"; + bus-width = <4>; + vqmmc-supply = <&vddio_sdmmc3>; + }; + + /* EMMC 4.51 */ + sdhci@0,700b0600 { + status = "okay"; + bus-width = <8>; + non-removable; + }; + + usb@0,7d000000 { + status = "okay"; + }; + + usb-phy@0,7d000000 { + status = "okay"; + vbus-supply = <&vdd_usb1_vbus>; + }; + + usb@0,7d004000 { + status = "okay"; + }; + + usb-phy@0,7d004000 { + status = "okay"; + vbus-supply = <&vdd_run_cam>; + }; + + usb@0,7d008000 { + status = "okay"; + }; + + usb-phy@0,7d008000 { + status = "okay"; + vbus-supply = <&vdd_usb3_vbus>; + }; + + backlight: backlight { + compatible = "pwm-backlight"; + + enable-gpios = <&gpio TEGRA_GPIO(H, 2) GPIO_ACTIVE_HIGH>; + power-supply = <&vdd_led>; + pwms = <&pwm 1 1000000>; + + brightness-levels = <0 4 8 16 32 64 128 255>; + default-brightness-level = <6>; + + backlight-boot-off; + }; + + clocks { + compatible = "simple-bus"; + #address-cells = <1>; + #size-cells = <0>; + + clk32k_in: clock@0 { + compatible = "fixed-clock"; + reg=<0>; + #clock-cells = <0>; + clock-frequency = <32768>; + }; + }; + + gpio-keys { + compatible = "gpio-keys"; + + lid { + label = "Lid"; + gpios = <&gpio TEGRA_GPIO(R, 4) GPIO_ACTIVE_LOW>; + linux,input-type = <5>; + linux,code = <0>; + debounce-interval = <1>; + gpio-key,wakeup; + }; + + power { + label = "Power"; + gpios = <&gpio TEGRA_GPIO(Q, 0) GPIO_ACTIVE_LOW>; + linux,code = <KEY_POWER>; + debounce-interval = <10>; + gpio-key,wakeup; + }; + }; + + panel: panel { + compatible = "innolux,n116bge", "simple-panel"; + backlight = <&backlight>; + ddc-i2c-bus = <&dpaux>; + }; + + regulators { + compatible = "simple-bus"; + #address-cells = <1>; + #size-cells = <0>; + + vdd_mux: regulator@0 { + compatible = "regulator-fixed"; + reg = <0>; + regulator-name = "+VDD_MUX"; + regulator-min-microvolt = <19000000>; + regulator-max-microvolt = <19000000>; + regulator-always-on; + regulator-boot-on; + }; + + vdd_5v0_sys: regulator@1 { + compatible = "regulator-fixed"; + reg = <1>; + regulator-name = "+5V_SYS"; + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + regulator-always-on; + regulator-boot-on; + vin-supply = <&vdd_mux>; + }; + + vdd_3v3_sys: regulator@2 { + compatible = "regulator-fixed"; + reg = <2>; + regulator-name = "+3.3V_SYS"; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + regulator-always-on; + regulator-boot-on; + vin-supply = <&vdd_mux>; + }; + + vdd_3v3_run: regulator@3 { + compatible = "regulator-fixed"; + reg = <3>; + regulator-name = "+3.3V_RUN"; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + regulator-always-on; + regulator-boot-on; + gpio = <&as3722 1 GPIO_ACTIVE_HIGH>; + enable-active-high; + vin-supply = <&vdd_3v3_sys>; + }; + + vdd_3v3_hdmi: regulator@4 { + compatible = "regulator-fixed"; + reg = <4>; + regulator-name = "+3.3V_AVDD_HDMI_AP_GATED"; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + vin-supply = <&vdd_3v3_run>; + }; + + vdd_led: regulator@5 { + compatible = "regulator-fixed"; + reg = <5>; + regulator-name = "+VDD_LED"; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + gpio = <&gpio TEGRA_GPIO(P, 2) GPIO_ACTIVE_HIGH>; + enable-active-high; + 
vin-supply = <&vdd_mux>; + }; + + vdd_usb1_vbus: regulator@6 { + compatible = "regulator-fixed"; + reg = <6>; + regulator-name = "+5V_USB_HS"; + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + gpio = <&gpio TEGRA_GPIO(N, 4) GPIO_ACTIVE_HIGH>; + enable-active-high; + gpio-open-drain; + vin-supply = <&vdd_5v0_sys>; + }; + + vdd_usb3_vbus: regulator@7 { + compatible = "regulator-fixed"; + reg = <7>; + regulator-name = "+5V_USB_SS"; + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + gpio = <&gpio TEGRA_GPIO(N, 5) GPIO_ACTIVE_HIGH>; + enable-active-high; + gpio-open-drain; + vin-supply = <&vdd_5v0_sys>; + }; + + vdd_3v3_panel: regulator@8 { + compatible = "regulator-fixed"; + reg = <8>; + regulator-name = "+3.3V_PANEL"; + regulator-min-microvolt = <3300000>; + regulator-max-microvolt = <3300000>; + gpio = <&as3722 4 GPIO_ACTIVE_HIGH>; + enable-active-high; + vin-supply = <&vdd_3v3_sys>; + }; + + vdd_hdmi_pll: regulator@9 { + compatible = "regulator-fixed"; + reg = <9>; + regulator-name = "+1.05V_RUN_AVDD_HDMI_PLL_AP_GATE"; + regulator-min-microvolt = <1050000>; + regulator-max-microvolt = <1050000>; + gpio = <&gpio TEGRA_GPIO(H, 7) GPIO_ACTIVE_LOW>; + vin-supply = <&vdd_1v05_run>; + }; + + vdd_5v0_hdmi: regulator@10 { + compatible = "regulator-fixed"; + reg = <10>; + regulator-name = "+5V_HDMI_CON"; + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + gpio = <&gpio TEGRA_GPIO(K, 6) GPIO_ACTIVE_HIGH>; + enable-active-high; + vin-supply = <&vdd_5v0_sys>; + }; + + vdd_5v0_ts: regulator@11 { + compatible = "regulator-fixed"; + reg = <11>; + regulator-name = "+5V_VDD_TS"; + regulator-min-microvolt = <5000000>; + regulator-max-microvolt = <5000000>; + regulator-always-on; + regulator-boot-on; + gpio = <&gpio TEGRA_GPIO(K, 1) GPIO_ACTIVE_HIGH>; + enable-active-high; + }; + }; +}; diff --git a/arch/arm64/boot/dts/nvidia/tegra132.dtsi b/arch/arm64/boot/dts/nvidia/tegra132.dtsi new file mode 100644 index 000000000000..e8bb46027bed --- /dev/null +++ b/arch/arm64/boot/dts/nvidia/tegra132.dtsi @@ -0,0 +1,990 @@ +#include <dt-bindings/clock/tegra124-car.h> +#include <dt-bindings/gpio/tegra-gpio.h> +#include <dt-bindings/memory/tegra124-mc.h> +#include <dt-bindings/pinctrl/pinctrl-tegra.h> +#include <dt-bindings/pinctrl/pinctrl-tegra-xusb.h> +#include <dt-bindings/interrupt-controller/arm-gic.h> + +/ { + compatible = "nvidia,tegra132", "nvidia,tegra124"; + interrupt-parent = <&lic>; + #address-cells = <2>; + #size-cells = <2>; + + pcie-controller@0,01003000 { + compatible = "nvidia,tegra124-pcie"; + device_type = "pci"; + reg = <0x0 0x01003000 0x0 0x00000800 /* PADS registers */ + 0x0 0x01003800 0x0 0x00000800 /* AFI registers */ + 0x0 0x02000000 0x0 0x10000000>; /* configuration space */ + reg-names = "pads", "afi", "cs"; + interrupts = <GIC_SPI 98 IRQ_TYPE_LEVEL_HIGH>, /* controller interrupt */ + <GIC_SPI 99 IRQ_TYPE_LEVEL_HIGH>; /* MSI interrupt */ + interrupt-names = "intr", "msi"; + + #interrupt-cells = <1>; + interrupt-map-mask = <0 0 0 0>; + interrupt-map = <0 0 0 0 &gic GIC_SPI 98 IRQ_TYPE_LEVEL_HIGH>; + + bus-range = <0x00 0xff>; + #address-cells = <3>; + #size-cells = <2>; + + ranges = <0x82000000 0 0x01000000 0x0 0x01000000 0 0x00001000 /* port 0 configuration space */ + 0x82000000 0 0x01001000 0x0 0x01001000 0 0x00001000 /* port 1 configuration space */ + 0x81000000 0 0x0 0x0 0x12000000 0 0x00010000 /* downstream I/O (64 KiB) */ + 0x82000000 0 0x13000000 0x0 0x13000000 0 0x0d000000 /* non-prefetchable memory 
(208 MiB) */ + 0xc2000000 0 0x20000000 0x0 0x20000000 0 0x20000000>; /* prefetchable memory (512 MiB) */ + + clocks = <&tegra_car TEGRA124_CLK_PCIE>, + <&tegra_car TEGRA124_CLK_AFI>, + <&tegra_car TEGRA124_CLK_PLL_E>, + <&tegra_car TEGRA124_CLK_CML0>; + clock-names = "pex", "afi", "pll_e", "cml"; + resets = <&tegra_car 70>, + <&tegra_car 72>, + <&tegra_car 74>; + reset-names = "pex", "afi", "pcie_x"; + status = "disabled"; + + phys = <&padctl TEGRA_XUSB_PADCTL_PCIE>; + phy-names = "pcie"; + + pci@1,0 { + device_type = "pci"; + assigned-addresses = <0x82000800 0 0x01000000 0 0x1000>; + reg = <0x000800 0 0 0 0>; + status = "disabled"; + + #address-cells = <3>; + #size-cells = <2>; + ranges; + + nvidia,num-lanes = <2>; + }; + + pci@2,0 { + device_type = "pci"; + assigned-addresses = <0x82001000 0 0x01001000 0 0x1000>; + reg = <0x001000 0 0 0 0>; + status = "disabled"; + + #address-cells = <3>; + #size-cells = <2>; + ranges; + + nvidia,num-lanes = <1>; + }; + }; + + host1x@0,50000000 { + compatible = "nvidia,tegra124-host1x", "simple-bus"; + reg = <0x0 0x50000000 0x0 0x00034000>; + interrupts = <GIC_SPI 65 IRQ_TYPE_LEVEL_HIGH>, /* syncpt */ + <GIC_SPI 67 IRQ_TYPE_LEVEL_HIGH>; /* general */ + clocks = <&tegra_car TEGRA124_CLK_HOST1X>; + clock-names = "host1x"; + resets = <&tegra_car 28>; + reset-names = "host1x"; + + #address-cells = <2>; + #size-cells = <2>; + + ranges = <0 0x54000000 0 0x54000000 0 0x01000000>; + + dc@0,54200000 { + compatible = "nvidia,tegra124-dc"; + reg = <0x0 0x54200000 0x0 0x00040000>; + interrupts = <GIC_SPI 73 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_DISP1>, + <&tegra_car TEGRA124_CLK_PLL_P>; + clock-names = "dc", "parent"; + resets = <&tegra_car 27>; + reset-names = "dc"; + + iommus = <&mc TEGRA_SWGROUP_DC>; + + nvidia,head = <0>; + }; + + dc@0,54240000 { + compatible = "nvidia,tegra124-dc"; + reg = <0x0 0x54240000 0x0 0x00040000>; + interrupts = <GIC_SPI 74 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_DISP2>, + <&tegra_car TEGRA124_CLK_PLL_P>; + clock-names = "dc", "parent"; + resets = <&tegra_car 26>; + reset-names = "dc"; + + iommus = <&mc TEGRA_SWGROUP_DCB>; + + nvidia,head = <1>; + }; + + hdmi@0,54280000 { + compatible = "nvidia,tegra124-hdmi"; + reg = <0x0 0x54280000 0x0 0x00040000>; + interrupts = <GIC_SPI 75 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_HDMI>, + <&tegra_car TEGRA124_CLK_PLL_D2_OUT0>; + clock-names = "hdmi", "parent"; + resets = <&tegra_car 51>; + reset-names = "hdmi"; + status = "disabled"; + }; + + sor@0,54540000 { + compatible = "nvidia,tegra124-sor"; + reg = <0x0 0x54540000 0x0 0x00040000>; + interrupts = <GIC_SPI 76 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_SOR0>, + <&tegra_car TEGRA124_CLK_PLL_D_OUT0>, + <&tegra_car TEGRA124_CLK_PLL_DP>, + <&tegra_car TEGRA124_CLK_CLK_M>; + clock-names = "sor", "parent", "dp", "safe"; + resets = <&tegra_car 182>; + reset-names = "sor"; + status = "disabled"; + }; + + dpaux: dpaux@0,545c0000 { + compatible = "nvidia,tegra124-dpaux"; + reg = <0x0 0x545c0000 0x0 0x00040000>; + interrupts = <GIC_SPI 159 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_DPAUX>, + <&tegra_car TEGRA124_CLK_PLL_DP>; + clock-names = "dpaux", "parent"; + resets = <&tegra_car 181>; + reset-names = "dpaux"; + status = "disabled"; + }; + }; + + gic: interrupt-controller@0,50041000 { + compatible = "arm,cortex-a15-gic"; + #interrupt-cells = <3>; + interrupt-controller; + reg = <0x0 0x50041000 0x0 0x1000>, + <0x0 0x50042000 0x0 0x2000>, + <0x0 0x50044000 0x0 0x2000>, + 
<0x0 0x50046000 0x0 0x2000>; + interrupts = <GIC_PPI 9 + (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_HIGH)>; + interrupt-parent = <&gic>; + }; + + gpu@0,57000000 { + compatible = "nvidia,gk20a"; + reg = <0x0 0x57000000 0x0 0x01000000>, + <0x0 0x58000000 0x0 0x01000000>; + interrupts = <GIC_SPI 157 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 158 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "stall", "nonstall"; + clocks = <&tegra_car TEGRA124_CLK_GPU>, + <&tegra_car TEGRA124_CLK_PLL_P_OUT5>; + clock-names = "gpu", "pwr"; + resets = <&tegra_car 184>; + reset-names = "gpu"; + status = "disabled"; + }; + + lic: interrupt-controller@60004000 { + compatible = "nvidia,tegra124-ictlr", "nvidia,tegra30-ictlr"; + reg = <0x0 0x60004000 0x0 0x100>, + <0x0 0x60004100 0x0 0x100>, + <0x0 0x60004200 0x0 0x100>, + <0x0 0x60004300 0x0 0x100>, + <0x0 0x60004400 0x0 0x100>; + interrupt-controller; + #interrupt-cells = <3>; + interrupt-parent = <&gic>; + }; + + timer@0,60005000 { + compatible = "nvidia,tegra124-timer", "nvidia,tegra20-timer"; + reg = <0x0 0x60005000 0x0 0x400>; + interrupts = <GIC_SPI 0 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 41 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 42 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 121 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 122 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_TIMER>; + clock-names = "timer"; + }; + + tegra_car: clock@0,60006000 { + compatible = "nvidia,tegra132-car"; + reg = <0x0 0x60006000 0x0 0x1000>; + #clock-cells = <1>; + #reset-cells = <1>; + nvidia,external-memory-controller = <&emc>; + }; + + flow-controller@0,60007000 { + compatible = "nvidia,tegra124-flowctrl"; + reg = <0x0 0x60007000 0x0 0x1000>; + }; + + actmon@0,6000c800 { + compatible = "nvidia,tegra124-actmon"; + reg = <0x0 0x6000c800 0x0 0x400>; + interrupts = <GIC_SPI 45 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_ACTMON>, + <&tegra_car TEGRA124_CLK_EMC>; + clock-names = "actmon", "emc"; + resets = <&tegra_car 119>; + reset-names = "actmon"; + }; + + gpio: gpio@0,6000d000 { + compatible = "nvidia,tegra124-gpio", "nvidia,tegra30-gpio"; + reg = <0x0 0x6000d000 0x0 0x1000>; + interrupts = <GIC_SPI 32 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 33 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 34 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 35 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 55 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 87 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 89 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 125 IRQ_TYPE_LEVEL_HIGH>; + #gpio-cells = <2>; + gpio-controller; + #interrupt-cells = <2>; + interrupt-controller; + }; + + apbdma: dma@0,60020000 { + compatible = "nvidia,tegra124-apbdma", "nvidia,tegra148-apbdma"; + reg = <0x0 0x60020000 0x0 0x1400>; + interrupts = <GIC_SPI 104 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 105 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 106 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 107 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 108 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 109 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 110 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 111 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 112 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 113 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 114 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 115 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 116 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 117 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 118 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 119 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 128 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 129 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 130 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 131 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 132 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 133 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 134 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 135 
IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 136 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 137 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 138 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 139 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 140 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 141 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 142 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 143 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_APBDMA>; + clock-names = "dma"; + resets = <&tegra_car 34>; + reset-names = "dma"; + #dma-cells = <1>; + }; + + apbmisc@0,70000800 { + compatible = "nvidia,tegra124-apbmisc", "nvidia,tegra20-apbmisc"; + reg = <0x0 0x70000800 0x0 0x64>, /* Chip revision */ + <0x0 0x7000e864 0x0 0x04>; /* Strapping options */ + }; + + pinmux: pinmux@0,70000868 { + compatible = "nvidia,tegra124-pinmux"; + reg = <0x0 0x70000868 0x0 0x164>, /* Pad control registers */ + <0x0 0x70003000 0x0 0x434>, /* Mux registers */ + <0x0 0x70000820 0x0 0x008>; /* MIPI pad control */ + }; + + /* + * There are two serial drivers, i.e. the 8250 based simple serial + * driver and the APB DMA based serial driver for higher baudrate + * and performance. To enable the 8250 based driver, the compatible + * is "nvidia,tegra124-uart", "nvidia,tegra20-uart" and to enable + * the APB DMA based serial driver, the compatible is + * "nvidia,tegra124-hsuart", "nvidia,tegra30-hsuart". + */ + uarta: serial@0,70006000 { + compatible = "nvidia,tegra124-uart", "nvidia,tegra20-uart"; + reg = <0x0 0x70006000 0x0 0x40>; + reg-shift = <2>; + interrupts = <GIC_SPI 36 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_UARTA>; + clock-names = "serial"; + resets = <&tegra_car 6>; + reset-names = "serial"; + dmas = <&apbdma 8>, <&apbdma 8>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + uartb: serial@0,70006040 { + compatible = "nvidia,tegra124-uart", "nvidia,tegra20-uart"; + reg = <0x0 0x70006040 0x0 0x40>; + reg-shift = <2>; + interrupts = <GIC_SPI 37 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_UARTB>; + clock-names = "serial"; + resets = <&tegra_car 7>; + reset-names = "serial"; + dmas = <&apbdma 9>, <&apbdma 9>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + uartc: serial@0,70006200 { + compatible = "nvidia,tegra124-uart", "nvidia,tegra20-uart"; + reg = <0x0 0x70006200 0x0 0x40>; + reg-shift = <2>; + interrupts = <GIC_SPI 46 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_UARTC>; + clock-names = "serial"; + resets = <&tegra_car 55>; + reset-names = "serial"; + dmas = <&apbdma 10>, <&apbdma 10>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + uartd: serial@0,70006300 { + compatible = "nvidia,tegra124-uart", "nvidia,tegra20-uart"; + reg = <0x0 0x70006300 0x0 0x40>; + reg-shift = <2>; + interrupts = <GIC_SPI 90 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_UARTD>; + clock-names = "serial"; + resets = <&tegra_car 65>; + reset-names = "serial"; + dmas = <&apbdma 19>, <&apbdma 19>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + pwm: pwm@0,7000a000 { + compatible = "nvidia,tegra124-pwm", "nvidia,tegra20-pwm"; + reg = <0x0 0x7000a000 0x0 0x100>; + #pwm-cells = <2>; + clocks = <&tegra_car TEGRA124_CLK_PWM>; + clock-names = "pwm"; + resets = <&tegra_car 17>; + reset-names = "pwm"; + status = "disabled"; + }; + + i2c@0,7000c000 { + compatible = "nvidia,tegra124-i2c", "nvidia,tegra114-i2c"; + reg = <0x0 0x7000c000 0x0 0x100>; + interrupts = <GIC_SPI 38 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA124_CLK_I2C1>; + clock-names = "div-clk"; + resets = <&tegra_car 12>; + reset-names = "i2c"; +
dmas = <&apbdma 21>, <&apbdma 21>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + i2c@0,7000c400 { + compatible = "nvidia,tegra124-i2c", "nvidia,tegra114-i2c"; + reg = <0x0 0x7000c400 0x0 0x100>; + interrupts = <GIC_SPI 84 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA124_CLK_I2C2>; + clock-names = "div-clk"; + resets = <&tegra_car 54>; + reset-names = "i2c"; + dmas = <&apbdma 22>, <&apbdma 22>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + i2c@0,7000c500 { + compatible = "nvidia,tegra124-i2c", "nvidia,tegra114-i2c"; + reg = <0x0 0x7000c500 0x0 0x100>; + interrupts = <GIC_SPI 92 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA124_CLK_I2C3>; + clock-names = "div-clk"; + resets = <&tegra_car 67>; + reset-names = "i2c"; + dmas = <&apbdma 23>, <&apbdma 23>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + i2c@0,7000c700 { + compatible = "nvidia,tegra124-i2c", "nvidia,tegra114-i2c"; + reg = <0x0 0x7000c700 0x0 0x100>; + interrupts = <GIC_SPI 120 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA124_CLK_I2C4>; + clock-names = "div-clk"; + resets = <&tegra_car 103>; + reset-names = "i2c"; + dmas = <&apbdma 26>, <&apbdma 26>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + i2c@0,7000d000 { + compatible = "nvidia,tegra124-i2c", "nvidia,tegra114-i2c"; + reg = <0x0 0x7000d000 0x0 0x100>; + interrupts = <GIC_SPI 53 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA124_CLK_I2C5>; + clock-names = "div-clk"; + resets = <&tegra_car 47>; + reset-names = "i2c"; + dmas = <&apbdma 24>, <&apbdma 24>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + i2c@0,7000d100 { + compatible = "nvidia,tegra124-i2c", "nvidia,tegra114-i2c"; + reg = <0x0 0x7000d100 0x0 0x100>; + interrupts = <GIC_SPI 63 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA124_CLK_I2C6>; + clock-names = "div-clk"; + resets = <&tegra_car 166>; + reset-names = "i2c"; + dmas = <&apbdma 30>, <&apbdma 30>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + spi@0,7000d400 { + compatible = "nvidia,tegra124-spi", "nvidia,tegra114-spi"; + reg = <0x0 0x7000d400 0x0 0x200>; + interrupts = <GIC_SPI 59 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA124_CLK_SBC1>; + clock-names = "spi"; + resets = <&tegra_car 41>; + reset-names = "spi"; + dmas = <&apbdma 15>, <&apbdma 15>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + spi@0,7000d600 { + compatible = "nvidia,tegra124-spi", "nvidia,tegra114-spi"; + reg = <0x0 0x7000d600 0x0 0x200>; + interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA124_CLK_SBC2>; + clock-names = "spi"; + resets = <&tegra_car 44>; + reset-names = "spi"; + dmas = <&apbdma 16>, <&apbdma 16>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + spi@0,7000d800 { + compatible = "nvidia,tegra124-spi", "nvidia,tegra114-spi"; + reg = <0x0 0x7000d800 0x0 0x200>; + interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA124_CLK_SBC3>; + clock-names = "spi"; + resets = <&tegra_car 46>; + reset-names = "spi"; + dmas = <&apbdma 17>, <&apbdma 17>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + spi@0,7000da00 { + compatible = "nvidia,tegra124-spi", "nvidia,tegra114-spi"; 
+ reg = <0x0 0x7000da00 0x0 0x200>; + interrupts = <GIC_SPI 93 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA124_CLK_SBC4>; + clock-names = "spi"; + resets = <&tegra_car 68>; + reset-names = "spi"; + dmas = <&apbdma 18>, <&apbdma 18>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + spi@0,7000dc00 { + compatible = "nvidia,tegra124-spi", "nvidia,tegra114-spi"; + reg = <0x0 0x7000dc00 0x0 0x200>; + interrupts = <GIC_SPI 94 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA124_CLK_SBC5>; + clock-names = "spi"; + resets = <&tegra_car 104>; + reset-names = "spi"; + dmas = <&apbdma 27>, <&apbdma 27>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + spi@0,7000de00 { + compatible = "nvidia,tegra124-spi", "nvidia,tegra114-spi"; + reg = <0x0 0x7000de00 0x0 0x200>; + interrupts = <GIC_SPI 79 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA124_CLK_SBC6>; + clock-names = "spi"; + resets = <&tegra_car 105>; + reset-names = "spi"; + dmas = <&apbdma 28>, <&apbdma 28>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + rtc@0,7000e000 { + compatible = "nvidia,tegra124-rtc", "nvidia,tegra20-rtc"; + reg = <0x0 0x7000e000 0x0 0x100>; + interrupts = <GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_RTC>; + clock-names = "rtc"; + }; + + pmc@0,7000e400 { + compatible = "nvidia,tegra124-pmc"; + reg = <0x0 0x7000e400 0x0 0x400>; + clocks = <&tegra_car TEGRA124_CLK_PCLK>, <&clk32k_in>; + clock-names = "pclk", "clk32k_in"; + }; + + fuse@0,7000f800 { + compatible = "nvidia,tegra124-efuse"; + reg = <0x0 0x7000f800 0x0 0x400>; + clocks = <&tegra_car TEGRA124_CLK_FUSE>; + clock-names = "fuse"; + resets = <&tegra_car 39>; + reset-names = "fuse"; + }; + + mc: memory-controller@0,70019000 { + compatible = "nvidia,tegra132-mc"; + reg = <0x0 0x70019000 0x0 0x1000>; + clocks = <&tegra_car TEGRA124_CLK_MC>; + clock-names = "mc"; + + interrupts = <GIC_SPI 77 IRQ_TYPE_LEVEL_HIGH>; + + #iommu-cells = <1>; + }; + + emc: emc@0,7001b000 { + compatible = "nvidia,tegra132-emc", "nvidia,tegra124-emc"; + reg = <0x0 0x7001b000 0x0 0x1000>; + + nvidia,memory-controller = <&mc>; + }; + + sata@0,70020000 { + compatible = "nvidia,tegra124-ahci"; + reg = <0x0 0x70027000 0x0 0x2000>, /* AHCI */ + <0x0 0x70020000 0x0 0x7000>; /* SATA */ + interrupts = <GIC_SPI 23 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_SATA>, + <&tegra_car TEGRA124_CLK_SATA_OOB>, + <&tegra_car TEGRA124_CLK_CML1>, + <&tegra_car TEGRA124_CLK_PLL_E>; + clock-names = "sata", "sata-oob", "cml1", "pll_e"; + resets = <&tegra_car 124>, + <&tegra_car 123>, + <&tegra_car 129>; + reset-names = "sata", "sata-oob", "sata-cold"; + phys = <&padctl TEGRA_XUSB_PADCTL_SATA>; + phy-names = "sata-phy"; + status = "disabled"; + }; + + hda@0,70030000 { + compatible = "nvidia,tegra132-hda", "nvidia,tegra124-hda", + "nvidia,tegra30-hda"; + reg = <0x0 0x70030000 0x0 0x10000>; + interrupts = <GIC_SPI 81 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_HDA>, + <&tegra_car TEGRA124_CLK_HDA2HDMI>, + <&tegra_car TEGRA124_CLK_HDA2CODEC_2X>; + clock-names = "hda", "hda2hdmi", "hda2codec_2x"; + resets = <&tegra_car 125>, /* hda */ + <&tegra_car 128>, /* hda2hdmi */ + <&tegra_car 111>; /* hda2codec_2x */ + reset-names = "hda", "hda2hdmi", "hda2codec_2x"; + status = "disabled"; + }; + + padctl: padctl@0,7009f000 { + compatible = "nvidia,tegra132-xusb-padctl", + "nvidia,tegra124-xusb-padctl"; + reg = <0x0 
0x7009f000 0x0 0x1000>; + resets = <&tegra_car 142>; + reset-names = "padctl"; + + #phy-cells = <1>; + + phys { + pcie-0 { + status = "disabled"; + }; + + sata-0 { + status = "disabled"; + }; + + usb3-0 { + status = "disabled"; + }; + + usb3-1 { + status = "disabled"; + }; + + utmi-0 { + status = "disabled"; + }; + + utmi-1 { + status = "disabled"; + }; + + utmi-2 { + status = "disabled"; + }; + }; + }; + + sdhci@0,700b0000 { + compatible = "nvidia,tegra124-sdhci"; + reg = <0x0 0x700b0000 0x0 0x200>; + interrupts = <GIC_SPI 14 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_SDMMC1>; + clock-names = "sdhci"; + resets = <&tegra_car 14>; + reset-names = "sdhci"; + status = "disabled"; + }; + + sdhci@0,700b0200 { + compatible = "nvidia,tegra124-sdhci"; + reg = <0x0 0x700b0200 0x0 0x200>; + interrupts = <GIC_SPI 15 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_SDMMC2>; + clock-names = "sdhci"; + resets = <&tegra_car 9>; + reset-names = "sdhci"; + status = "disabled"; + }; + + sdhci@0,700b0400 { + compatible = "nvidia,tegra124-sdhci"; + reg = <0x0 0x700b0400 0x0 0x200>; + interrupts = <GIC_SPI 19 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_SDMMC3>; + clock-names = "sdhci"; + resets = <&tegra_car 69>; + reset-names = "sdhci"; + status = "disabled"; + }; + + sdhci@0,700b0600 { + compatible = "nvidia,tegra124-sdhci"; + reg = <0x0 0x700b0600 0x0 0x200>; + interrupts = <GIC_SPI 31 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_SDMMC4>; + clock-names = "sdhci"; + resets = <&tegra_car 15>; + reset-names = "sdhci"; + status = "disabled"; + }; + + soctherm: thermal-sensor@0,700e2000 { + compatible = "nvidia,tegra124-soctherm"; + reg = <0x0 0x700e2000 0x0 0x1000>; + interrupts = <GIC_SPI 48 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_TSENSOR>, + <&tegra_car TEGRA124_CLK_SOC_THERM>; + clock-names = "tsensor", "soctherm"; + resets = <&tegra_car 78>; + reset-names = "soctherm"; + #thermal-sensor-cells = <1>; + }; + + ahub@0,70300000 { + compatible = "nvidia,tegra124-ahub"; + reg = <0x0 0x70300000 0x0 0x200>, + <0x0 0x70300800 0x0 0x800>, + <0x0 0x70300200 0x0 0x600>; + interrupts = <GIC_SPI 103 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA124_CLK_D_AUDIO>, + <&tegra_car TEGRA124_CLK_APBIF>; + clock-names = "d_audio", "apbif"; + resets = <&tegra_car 106>, /* d_audio */ + <&tegra_car 107>, /* apbif */ + <&tegra_car 30>, /* i2s0 */ + <&tegra_car 11>, /* i2s1 */ + <&tegra_car 18>, /* i2s2 */ + <&tegra_car 101>, /* i2s3 */ + <&tegra_car 102>, /* i2s4 */ + <&tegra_car 108>, /* dam0 */ + <&tegra_car 109>, /* dam1 */ + <&tegra_car 110>, /* dam2 */ + <&tegra_car 10>, /* spdif */ + <&tegra_car 153>, /* amx */ + <&tegra_car 185>, /* amx1 */ + <&tegra_car 154>, /* adx */ + <&tegra_car 180>, /* adx1 */ + <&tegra_car 186>, /* afc0 */ + <&tegra_car 187>, /* afc1 */ + <&tegra_car 188>, /* afc2 */ + <&tegra_car 189>, /* afc3 */ + <&tegra_car 190>, /* afc4 */ + <&tegra_car 191>; /* afc5 */ + reset-names = "d_audio", "apbif", "i2s0", "i2s1", "i2s2", + "i2s3", "i2s4", "dam0", "dam1", "dam2", + "spdif", "amx", "amx1", "adx", "adx1", + "afc0", "afc1", "afc2", "afc3", "afc4", "afc5"; + dmas = <&apbdma 1>, <&apbdma 1>, + <&apbdma 2>, <&apbdma 2>, + <&apbdma 3>, <&apbdma 3>, + <&apbdma 4>, <&apbdma 4>, + <&apbdma 6>, <&apbdma 6>, + <&apbdma 7>, <&apbdma 7>, + <&apbdma 12>, <&apbdma 12>, + <&apbdma 13>, <&apbdma 13>, + <&apbdma 14>, <&apbdma 14>, + <&apbdma 29>, <&apbdma 29>; + dma-names = "rx0", "tx0", "rx1", "tx1", "rx2", "tx2", + "rx3", "tx3", "rx4", "tx4", "rx5", 
"tx5", + "rx6", "tx6", "rx7", "tx7", "rx8", "tx8", + "rx9", "tx9"; + ranges; + #address-cells = <2>; + #size-cells = <2>; + + tegra_i2s0: i2s@0,70301000 { + compatible = "nvidia,tegra124-i2s"; + reg = <0x0 0x70301000 0x0 0x100>; + nvidia,ahub-cif-ids = <4 4>; + clocks = <&tegra_car TEGRA124_CLK_I2S0>; + clock-names = "i2s"; + resets = <&tegra_car 30>; + reset-names = "i2s"; + status = "disabled"; + }; + + tegra_i2s1: i2s@0,70301100 { + compatible = "nvidia,tegra124-i2s"; + reg = <0x0 0x70301100 0x0 0x100>; + nvidia,ahub-cif-ids = <5 5>; + clocks = <&tegra_car TEGRA124_CLK_I2S1>; + clock-names = "i2s"; + resets = <&tegra_car 11>; + reset-names = "i2s"; + status = "disabled"; + }; + + tegra_i2s2: i2s@0,70301200 { + compatible = "nvidia,tegra124-i2s"; + reg = <0x0 0x70301200 0x0 0x100>; + nvidia,ahub-cif-ids = <6 6>; + clocks = <&tegra_car TEGRA124_CLK_I2S2>; + clock-names = "i2s"; + resets = <&tegra_car 18>; + reset-names = "i2s"; + status = "disabled"; + }; + + tegra_i2s3: i2s@0,70301300 { + compatible = "nvidia,tegra124-i2s"; + reg = <0x0 0x70301300 0x0 0x100>; + nvidia,ahub-cif-ids = <7 7>; + clocks = <&tegra_car TEGRA124_CLK_I2S3>; + clock-names = "i2s"; + resets = <&tegra_car 101>; + reset-names = "i2s"; + status = "disabled"; + }; + + tegra_i2s4: i2s@0,70301400 { + compatible = "nvidia,tegra124-i2s"; + reg = <0x0 0x70301400 0x0 0x100>; + nvidia,ahub-cif-ids = <8 8>; + clocks = <&tegra_car TEGRA124_CLK_I2S4>; + clock-names = "i2s"; + resets = <&tegra_car 102>; + reset-names = "i2s"; + status = "disabled"; + }; + }; + + usb@0,7d000000 { + compatible = "nvidia,tegra124-ehci", "nvidia,tegra30-ehci", "usb-ehci"; + reg = <0x0 0x7d000000 0x0 0x4000>; + interrupts = <GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH>; + phy_type = "utmi"; + clocks = <&tegra_car TEGRA124_CLK_USBD>; + clock-names = "usb"; + resets = <&tegra_car 22>; + reset-names = "usb"; + nvidia,phy = <&phy1>; + status = "disabled"; + }; + + phy1: usb-phy@0,7d000000 { + compatible = "nvidia,tegra124-usb-phy", "nvidia,tegra30-usb-phy"; + reg = <0x0 0x7d000000 0x0 0x4000>, + <0x0 0x7d000000 0x0 0x4000>; + phy_type = "utmi"; + clocks = <&tegra_car TEGRA124_CLK_USBD>, + <&tegra_car TEGRA124_CLK_PLL_U>, + <&tegra_car TEGRA124_CLK_USBD>; + clock-names = "reg", "pll_u", "utmi-pads"; + resets = <&tegra_car 22>, <&tegra_car 22>; + reset-names = "usb", "utmi-pads"; + nvidia,hssync-start-delay = <0>; + nvidia,idle-wait-delay = <17>; + nvidia,elastic-limit = <16>; + nvidia,term-range-adj = <6>; + nvidia,xcvr-setup = <9>; + nvidia,xcvr-lsfslew = <0>; + nvidia,xcvr-lsrslew = <3>; + nvidia,hssquelch-level = <2>; + nvidia,hsdiscon-level = <5>; + nvidia,xcvr-hsslew = <12>; + nvidia,has-utmi-pad-registers; + status = "disabled"; + }; + + usb@0,7d004000 { + compatible = "nvidia,tegra124-ehci", "nvidia,tegra30-ehci", "usb-ehci"; + reg = <0x0 0x7d004000 0x0 0x4000>; + interrupts = <GIC_SPI 21 IRQ_TYPE_LEVEL_HIGH>; + phy_type = "utmi"; + clocks = <&tegra_car TEGRA124_CLK_USB2>; + clock-names = "usb"; + resets = <&tegra_car 58>; + reset-names = "usb"; + nvidia,phy = <&phy2>; + status = "disabled"; + }; + + phy2: usb-phy@0,7d004000 { + compatible = "nvidia,tegra124-usb-phy", "nvidia,tegra30-usb-phy"; + reg = <0x0 0x7d004000 0x0 0x4000>, + <0x0 0x7d000000 0x0 0x4000>; + phy_type = "utmi"; + clocks = <&tegra_car TEGRA124_CLK_USB2>, + <&tegra_car TEGRA124_CLK_PLL_U>, + <&tegra_car TEGRA124_CLK_USBD>; + clock-names = "reg", "pll_u", "utmi-pads"; + resets = <&tegra_car 58>, <&tegra_car 22>; + reset-names = "usb", "utmi-pads"; + nvidia,hssync-start-delay = <0>; + 
nvidia,idle-wait-delay = <17>; + nvidia,elastic-limit = <16>; + nvidia,term-range-adj = <6>; + nvidia,xcvr-setup = <9>; + nvidia,xcvr-lsfslew = <0>; + nvidia,xcvr-lsrslew = <3>; + nvidia,hssquelch-level = <2>; + nvidia,hsdiscon-level = <5>; + nvidia,xcvr-hsslew = <12>; + status = "disabled"; + }; + + usb@0,7d008000 { + compatible = "nvidia,tegra124-ehci", "nvidia,tegra30-ehci", "usb-ehci"; + reg = <0x0 0x7d008000 0x0 0x4000>; + interrupts = <GIC_SPI 97 IRQ_TYPE_LEVEL_HIGH>; + phy_type = "utmi"; + clocks = <&tegra_car TEGRA124_CLK_USB3>; + clock-names = "usb"; + resets = <&tegra_car 59>; + reset-names = "usb"; + nvidia,phy = <&phy3>; + status = "disabled"; + }; + + phy3: usb-phy@0,7d008000 { + compatible = "nvidia,tegra124-usb-phy", "nvidia,tegra30-usb-phy"; + reg = <0x0 0x7d008000 0x0 0x4000>, + <0x0 0x7d000000 0x0 0x4000>; + phy_type = "utmi"; + clocks = <&tegra_car TEGRA124_CLK_USB3>, + <&tegra_car TEGRA124_CLK_PLL_U>, + <&tegra_car TEGRA124_CLK_USBD>; + clock-names = "reg", "pll_u", "utmi-pads"; + resets = <&tegra_car 59>, <&tegra_car 22>; + reset-names = "usb", "utmi-pads"; + nvidia,hssync-start-delay = <0>; + nvidia,idle-wait-delay = <17>; + nvidia,elastic-limit = <16>; + nvidia,term-range-adj = <6>; + nvidia,xcvr-setup = <9>; + nvidia,xcvr-lsfslew = <0>; + nvidia,xcvr-lsrslew = <3>; + nvidia,hssquelch-level = <2>; + nvidia,hsdiscon-level = <5>; + nvidia,xcvr-hsslew = <12>; + status = "disabled"; + }; + + cpus { + #address-cells = <1>; + #size-cells = <0>; + + cpu@0 { + device_type = "cpu"; + compatible = "nvidia,denver", "arm,armv8"; + reg = <0>; + }; + + cpu@1 { + device_type = "cpu"; + compatible = "nvidia,denver", "arm,armv8"; + reg = <1>; + }; + }; + + timer { + compatible = "arm,armv7-timer"; + interrupts = <GIC_PPI 13 + (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_LOW)>, + <GIC_PPI 14 + (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_LOW)>, + <GIC_PPI 11 + (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_LOW)>, + <GIC_PPI 10 + (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_LOW)>; + interrupt-parent = <&gic>; + }; +}; diff --git a/arch/arm64/boot/dts/nvidia/tegra210-p2180.dtsi b/arch/arm64/boot/dts/nvidia/tegra210-p2180.dtsi new file mode 100644 index 000000000000..2b7f88950d1e --- /dev/null +++ b/arch/arm64/boot/dts/nvidia/tegra210-p2180.dtsi @@ -0,0 +1,45 @@ +#include "tegra210.dtsi" + +/ { + model = "NVIDIA Jetson TX1"; + compatible = "nvidia,p2180", "nvidia,tegra210"; + + aliases { + rtc1 = "/rtc@0,7000e000"; + serial0 = &uarta; + }; + + memory { + device_type = "memory"; + reg = <0x0 0x80000000 0x1 0x0>; + }; + + /* debug port */ + serial@0,70006000 { + status = "okay"; + }; + + pmc@0,7000e400 { + nvidia,invert-interrupt; + }; + + /* eMMC */ + sdhci@0,700b0600 { + status = "okay"; + bus-width = <8>; + non-removable; + }; + + clocks { + compatible = "simple-bus"; + #address-cells = <1>; + #size-cells = <0>; + + clk32k_in: clock@0 { + compatible = "fixed-clock"; + reg = <0>; + #clock-cells = <0>; + clock-frequency = <32768>; + }; + }; +}; diff --git a/arch/arm64/boot/dts/nvidia/tegra210-p2371-0000.dts b/arch/arm64/boot/dts/nvidia/tegra210-p2371-0000.dts new file mode 100644 index 000000000000..1ddd8512e100 --- /dev/null +++ b/arch/arm64/boot/dts/nvidia/tegra210-p2371-0000.dts @@ -0,0 +1,9 @@ +/dts-v1/; + +#include "tegra210-p2530.dtsi" +#include "tegra210-p2595.dtsi" + +/ { + model = "NVIDIA Tegra210 P2371 (P2530/P2595) reference design"; + compatible = "nvidia,p2371-0000", "nvidia,tegra210"; +}; diff --git a/arch/arm64/boot/dts/nvidia/tegra210-p2371-2180.dts 
b/arch/arm64/boot/dts/nvidia/tegra210-p2371-2180.dts new file mode 100644 index 000000000000..683b339a980c --- /dev/null +++ b/arch/arm64/boot/dts/nvidia/tegra210-p2371-2180.dts @@ -0,0 +1,9 @@ +/dts-v1/; + +#include "tegra210-p2180.dtsi" +#include "tegra210-p2597.dtsi" + +/ { + model = "NVIDIA Jetson TX1 Developer Kit"; + compatible = "nvidia,p2371-2180", "nvidia,tegra210"; +}; diff --git a/arch/arm64/boot/dts/nvidia/tegra210-p2530.dtsi b/arch/arm64/boot/dts/nvidia/tegra210-p2530.dtsi new file mode 100644 index 000000000000..ece0dec61fae --- /dev/null +++ b/arch/arm64/boot/dts/nvidia/tegra210-p2530.dtsi @@ -0,0 +1,50 @@ +#include "tegra210.dtsi" + +/ { + model = "NVIDIA Tegra210 P2530 main board"; + compatible = "nvidia,p2530", "nvidia,tegra210"; + + aliases { + rtc1 = "/rtc@0,7000e000"; + serial0 = &uarta; + }; + + memory { + device_type = "memory"; + reg = <0x0 0x80000000 0x0 0xc0000000>; + }; + + /* debug port */ + serial@0,70006000 { + status = "okay"; + }; + + i2c@0,7000d000 { + status = "okay"; + clock-frequency = <400000>; + }; + + pmc@0,7000e400 { + nvidia,invert-interrupt; + }; + + /* eMMC */ + sdhci@0,700b0600 { + status = "okay"; + bus-width = <8>; + non-removable; + }; + + clocks { + compatible = "simple-bus"; + #address-cells = <1>; + #size-cells = <0>; + + clk32k_in: clock@0 { + compatible = "fixed-clock"; + reg = <0>; + #clock-cells = <0>; + clock-frequency = <32768>; + }; + }; +}; diff --git a/arch/arm64/boot/dts/nvidia/tegra210-p2571.dts b/arch/arm64/boot/dts/nvidia/tegra210-p2571.dts new file mode 100644 index 000000000000..58d27ddd57ff --- /dev/null +++ b/arch/arm64/boot/dts/nvidia/tegra210-p2571.dts @@ -0,0 +1,1302 @@ +/dts-v1/; + +#include <dt-bindings/input/input.h> +#include "tegra210-p2530.dtsi" + +/ { + model = "NVIDIA Tegra210 P2571 reference design"; + compatible = "nvidia,p2571", "nvidia,tegra210"; + + pinmux: pinmux@0,700008d4 { + pinctrl-names = "boot"; + pinctrl-0 = <&state_boot>; + + state_boot: pinmux { + pex_l0_rst_n_pa0 { + nvidia,pins = "pex_l0_rst_n_pa0"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + pex_l0_clkreq_n_pa1 { + nvidia,pins = "pex_l0_clkreq_n_pa1"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + pex_wake_n_pa2 { + nvidia,pins = "pex_wake_n_pa2"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + pex_l1_rst_n_pa3 { + nvidia,pins = "pex_l1_rst_n_pa3"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + pex_l1_clkreq_n_pa4 { + nvidia,pins = "pex_l1_clkreq_n_pa4"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + sata_led_active_pa5 { + nvidia,pins = "sata_led_active_pa5"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = 
<TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pa6 { + nvidia,pins = "pa6"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap1_fs_pb0 { + nvidia,pins = "dap1_fs_pb0"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap1_din_pb1 { + nvidia,pins = "dap1_din_pb1"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap1_dout_pb2 { + nvidia,pins = "dap1_dout_pb2"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap1_sclk_pb3 { + nvidia,pins = "dap1_sclk_pb3"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi2_mosi_pb4 { + nvidia,pins = "spi2_mosi_pb4"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi2_miso_pb5 { + nvidia,pins = "spi2_miso_pb5"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi2_sck_pb6 { + nvidia,pins = "spi2_sck_pb6"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi2_cs0_pb7 { + nvidia,pins = "spi2_cs0_pb7"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi1_mosi_pc0 { + nvidia,pins = "spi1_mosi_pc0"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi1_miso_pc1 { + nvidia,pins = "spi1_miso_pc1"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi1_sck_pc2 { + nvidia,pins = "spi1_sck_pc2"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi1_cs0_pc3 { + nvidia,pins = "spi1_cs0_pc3"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi1_cs1_pc4 { + nvidia,pins = "spi1_cs1_pc4"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = 
<TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi4_sck_pc5 { + nvidia,pins = "spi4_sck_pc5"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi4_cs0_pc6 { + nvidia,pins = "spi4_cs0_pc6"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi4_mosi_pc7 { + nvidia,pins = "spi4_mosi_pc7"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi4_miso_pd0 { + nvidia,pins = "spi4_miso_pd0"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart3_tx_pd1 { + nvidia,pins = "uart3_tx_pd1"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart3_rx_pd2 { + nvidia,pins = "uart3_rx_pd2"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart3_rts_pd3 { + nvidia,pins = "uart3_rts_pd3"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart3_cts_pd4 { + nvidia,pins = "uart3_cts_pd4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic1_clk_pe0 { + nvidia,pins = "dmic1_clk_pe0"; + nvidia,function = "i2s3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic1_dat_pe1 { + nvidia,pins = "dmic1_dat_pe1"; + nvidia,function = "i2s3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic2_clk_pe2 { + nvidia,pins = "dmic2_clk_pe2"; + nvidia,function = "i2s3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic2_dat_pe3 { + nvidia,pins = "dmic2_dat_pe3"; + nvidia,function = "i2s3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic3_clk_pe4 { + nvidia,pins = "dmic3_clk_pe4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic3_dat_pe5 { + nvidia,pins = "dmic3_dat_pe5"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pe6 { + nvidia,pins = 
"pe6"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pe7 { + nvidia,pins = "pe7"; + nvidia,function = "pwm3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gen3_i2c_scl_pf0 { + nvidia,pins = "gen3_i2c_scl_pf0"; + nvidia,function = "i2c3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + gen3_i2c_sda_pf1 { + nvidia,pins = "gen3_i2c_sda_pf1"; + nvidia,function = "i2c3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + uart2_tx_pg0 { + nvidia,pins = "uart2_tx_pg0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart2_rx_pg1 { + nvidia,pins = "uart2_rx_pg1"; + nvidia,function = "uartb"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart2_rts_pg2 { + nvidia,pins = "uart2_rts_pg2"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart2_cts_pg3 { + nvidia,pins = "uart2_cts_pg3"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + wifi_en_ph0 { + nvidia,pins = "wifi_en_ph0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + wifi_rst_ph1 { + nvidia,pins = "wifi_rst_ph1"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + wifi_wake_ap_ph2 { + nvidia,pins = "wifi_wake_ap_ph2"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + ap_wake_bt_ph3 { + nvidia,pins = "ap_wake_bt_ph3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + bt_rst_ph4 { + nvidia,pins = "bt_rst_ph4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + bt_wake_ap_ph5 { + nvidia,pins = "bt_wake_ap_ph5"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + ph6 { + nvidia,pins = "ph6"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + 
nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + ap_wake_nfc_ph7 { + nvidia,pins = "ap_wake_nfc_ph7"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + nfc_en_pi0 { + nvidia,pins = "nfc_en_pi0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + nfc_int_pi1 { + nvidia,pins = "nfc_int_pi1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gps_en_pi2 { + nvidia,pins = "gps_en_pi2"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gps_rst_pi3 { + nvidia,pins = "gps_rst_pi3"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart4_tx_pi4 { + nvidia,pins = "uart4_tx_pi4"; + nvidia,function = "uartd"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart4_rx_pi5 { + nvidia,pins = "uart4_rx_pi5"; + nvidia,function = "uartd"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart4_rts_pi6 { + nvidia,pins = "uart4_rts_pi6"; + nvidia,function = "uartd"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart4_cts_pi7 { + nvidia,pins = "uart4_cts_pi7"; + nvidia,function = "uartd"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gen1_i2c_sda_pj0 { + nvidia,pins = "gen1_i2c_sda_pj0"; + nvidia,function = "i2c1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + gen1_i2c_scl_pj1 { + nvidia,pins = "gen1_i2c_scl_pj1"; + nvidia,function = "i2c1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + gen2_i2c_scl_pj2 { + nvidia,pins = "gen2_i2c_scl_pj2"; + nvidia,function = "i2c2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + gen2_i2c_sda_pj3 { + nvidia,pins = "gen2_i2c_sda_pj3"; + nvidia,function = "i2c2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + dap4_fs_pj4 { + nvidia,pins = "dap4_fs_pj4"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = 
<TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap4_din_pj5 { + nvidia,pins = "dap4_din_pj5"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap4_dout_pj6 { + nvidia,pins = "dap4_dout_pj6"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap4_sclk_pj7 { + nvidia,pins = "dap4_sclk_pj7"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk0 { + nvidia,pins = "pk0"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk1 { + nvidia,pins = "pk1"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk2 { + nvidia,pins = "pk2"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk3 { + nvidia,pins = "pk3"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk4 { + nvidia,pins = "pk4"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk5 { + nvidia,pins = "pk5"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk6 { + nvidia,pins = "pk6"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk7 { + nvidia,pins = "pk7"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pl0 { + nvidia,pins = "pl0"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pl1 { + nvidia,pins = "pl1"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_clk_pm0 { + nvidia,pins = "sdmmc1_clk_pm0"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_cmd_pm1 { + nvidia,pins = "sdmmc1_cmd_pm1"; + nvidia,function = "sdmmc1"; + nvidia,pull = 
<TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_dat3_pm2 { + nvidia,pins = "sdmmc1_dat3_pm2"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_dat2_pm3 { + nvidia,pins = "sdmmc1_dat2_pm3"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_dat1_pm4 { + nvidia,pins = "sdmmc1_dat1_pm4"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_dat0_pm5 { + nvidia,pins = "sdmmc1_dat0_pm5"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_clk_pp0 { + nvidia,pins = "sdmmc3_clk_pp0"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_cmd_pp1 { + nvidia,pins = "sdmmc3_cmd_pp1"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_dat3_pp2 { + nvidia,pins = "sdmmc3_dat3_pp2"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_dat2_pp3 { + nvidia,pins = "sdmmc3_dat2_pp3"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_dat1_pp4 { + nvidia,pins = "sdmmc3_dat1_pp4"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_dat0_pp5 { + nvidia,pins = "sdmmc3_dat0_pp5"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam1_mclk_ps0 { + nvidia,pins = "cam1_mclk_ps0"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam2_mclk_ps1 { + nvidia,pins = "cam2_mclk_ps1"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam_i2c_scl_ps2 { + nvidia,pins = "cam_i2c_scl_ps2"; + nvidia,function = "i2cvi"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + cam_i2c_sda_ps3 { + nvidia,pins = "cam_i2c_sda_ps3"; 
+ nvidia,function = "i2cvi"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + cam_rst_ps4 { + nvidia,pins = "cam_rst_ps4"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam_af_en_ps5 { + nvidia,pins = "cam_af_en_ps5"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam_flash_en_ps6 { + nvidia,pins = "cam_flash_en_ps6"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam1_pwdn_ps7 { + nvidia,pins = "cam1_pwdn_ps7"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam2_pwdn_pt0 { + nvidia,pins = "cam2_pwdn_pt0"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam1_strobe_pt1 { + nvidia,pins = "cam1_strobe_pt1"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart1_tx_pu0 { + nvidia,pins = "uart1_tx_pu0"; + nvidia,function = "uarta"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart1_rx_pu1 { + nvidia,pins = "uart1_rx_pu1"; + nvidia,function = "uarta"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart1_rts_pu2 { + nvidia,pins = "uart1_rts_pu2"; + nvidia,function = "uarta"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart1_cts_pu3 { + nvidia,pins = "uart1_cts_pu3"; + nvidia,function = "uarta"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_bl_pwm_pv0 { + nvidia,pins = "lcd_bl_pwm_pv0"; + nvidia,function = "pwm0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_bl_en_pv1 { + nvidia,pins = "lcd_bl_en_pv1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_rst_pv2 { + nvidia,pins = "lcd_rst_pv2"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_gpio1_pv3 { + nvidia,pins = "lcd_gpio1_pv3"; + nvidia,function 
= "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_gpio2_pv4 { + nvidia,pins = "lcd_gpio2_pv4"; + nvidia,function = "pwm1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + ap_ready_pv5 { + nvidia,pins = "ap_ready_pv5"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + touch_rst_pv6 { + nvidia,pins = "touch_rst_pv6"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + touch_clk_pv7 { + nvidia,pins = "touch_clk_pv7"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + modem_wake_ap_px0 { + nvidia,pins = "modem_wake_ap_px0"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + touch_int_px1 { + nvidia,pins = "touch_int_px1"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + motion_int_px2 { + nvidia,pins = "motion_int_px2"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + als_prox_int_px3 { + nvidia,pins = "als_prox_int_px3"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + temp_alert_px4 { + nvidia,pins = "temp_alert_px4"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + button_power_on_px5 { + nvidia,pins = "button_power_on_px5"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + button_vol_up_px6 { + nvidia,pins = "button_vol_up_px6"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + button_vol_down_px7 { + nvidia,pins = "button_vol_down_px7"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + button_slide_sw_py0 { + nvidia,pins = "button_slide_sw_py0"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + button_home_py1 { + nvidia,pins = "button_home_py1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = 
<TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_te_py2 { + nvidia,pins = "lcd_te_py2"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pwr_i2c_scl_py3 { + nvidia,pins = "pwr_i2c_scl_py3"; + nvidia,function = "i2cpmu"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + pwr_i2c_sda_py4 { + nvidia,pins = "pwr_i2c_sda_py4"; + nvidia,function = "i2cpmu"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + clk_32k_out_py5 { + nvidia,pins = "clk_32k_out_py5"; + nvidia,function = "soc"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz0 { + nvidia,pins = "pz0"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz1 { + nvidia,pins = "pz1"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz2 { + nvidia,pins = "pz2"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz3 { + nvidia,pins = "pz3"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz4 { + nvidia,pins = "pz4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz5 { + nvidia,pins = "pz5"; + nvidia,function = "soc"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap2_fs_paa0 { + nvidia,pins = "dap2_fs_paa0"; + nvidia,function = "i2s2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap2_sclk_paa1 { + nvidia,pins = "dap2_sclk_paa1"; + nvidia,function = "i2s2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap2_din_paa2 { + nvidia,pins = "dap2_din_paa2"; + nvidia,function = "i2s2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap2_dout_paa3 { + nvidia,pins = "dap2_dout_paa3"; + nvidia,function = "i2s2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + aud_mclk_pbb0 { + nvidia,pins = "aud_mclk_pbb0"; + 
nvidia,function = "aud"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dvfs_pwm_pbb1 { + nvidia,pins = "dvfs_pwm_pbb1"; + nvidia,function = "cldvfs"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dvfs_clk_pbb2 { + nvidia,pins = "dvfs_clk_pbb2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gpio_x1_aud_pbb3 { + nvidia,pins = "gpio_x1_aud_pbb3"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gpio_x3_aud_pbb4 { + nvidia,pins = "gpio_x3_aud_pbb4"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + hdmi_cec_pcc0 { + nvidia,pins = "hdmi_cec_pcc0"; + nvidia,function = "cec"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + hdmi_int_dp_hpd_pcc1 { + nvidia,pins = "hdmi_int_dp_hpd_pcc1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + spdif_out_pcc2 { + nvidia,pins = "spdif_out_pcc2"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spdif_in_pcc3 { + nvidia,pins = "spdif_in_pcc3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + usb_vbus_en0_pcc4 { + nvidia,pins = "usb_vbus_en0_pcc4"; + nvidia,function = "usb"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + usb_vbus_en1_pcc5 { + nvidia,pins = "usb_vbus_en1_pcc5"; + nvidia,function = "usb"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + dp_hpd0_pcc6 { + nvidia,pins = "dp_hpd0_pcc6"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pcc7 { + nvidia,pins = "pcc7"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + spi2_cs1_pdd0 { + nvidia,pins = "spi2_cs1_pdd0"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + 
nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_sck_pee0 { + nvidia,pins = "qspi_sck_pee0"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_cs_n_pee1 { + nvidia,pins = "qspi_cs_n_pee1"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_io0_pee2 { + nvidia,pins = "qspi_io0_pee2"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_io1_pee3 { + nvidia,pins = "qspi_io1_pee3"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_io2_pee4 { + nvidia,pins = "qspi_io2_pee4"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_io3_pee5 { + nvidia,pins = "qspi_io3_pee5"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + core_pwr_req { + nvidia,pins = "core_pwr_req"; + nvidia,function = "core"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cpu_pwr_req { + nvidia,pins = "cpu_pwr_req"; + nvidia,function = "cpu"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pwr_int_n { + nvidia,pins = "pwr_int_n"; + nvidia,function = "pmi"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + clk_32k_in { + nvidia,pins = "clk_32k_in"; + nvidia,function = "clk"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + jtag_rtck { + nvidia,pins = "jtag_rtck"; + nvidia,function = "jtag"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + clk_req { + nvidia,pins = "clk_req"; + nvidia,function = "sys"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + shutdown { + nvidia,pins = "shutdown"; + nvidia,function = "shutdown"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + }; + }; +}; diff --git a/arch/arm64/boot/dts/nvidia/tegra210-p2595.dtsi b/arch/arm64/boot/dts/nvidia/tegra210-p2595.dtsi new file mode 100644 index 000000000000..f3f91392214e --- /dev/null +++ b/arch/arm64/boot/dts/nvidia/tegra210-p2595.dtsi @@ -0,0 +1,1272 @@ +/ { + model = 
"NVIDIA Tegra210 P2595 I/O board"; + compatible = "nvidia,p2595", "nvidia,tegra210"; + + pinmux: pinmux@0,700008d4 { + pinctrl-names = "boot"; + pinctrl-0 = <&state_boot>; + + state_boot: pinmux { + pex_l0_rst_n_pa0 { + nvidia,pins = "pex_l0_rst_n_pa0"; + nvidia,function = "pe0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + pex_l0_clkreq_n_pa1 { + nvidia,pins = "pex_l0_clkreq_n_pa1"; + nvidia,function = "pe0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + pex_wake_n_pa2 { + nvidia,pins = "pex_wake_n_pa2"; + nvidia,function = "pe"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + pex_l1_rst_n_pa3 { + nvidia,pins = "pex_l1_rst_n_pa3"; + nvidia,function = "pe1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + pex_l1_clkreq_n_pa4 { + nvidia,pins = "pex_l1_clkreq_n_pa4"; + nvidia,function = "pe1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + sata_led_active_pa5 { + nvidia,pins = "sata_led_active_pa5"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pa6 { + nvidia,pins = "pa6"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap1_fs_pb0 { + nvidia,pins = "dap1_fs_pb0"; + nvidia,function = "i2s1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap1_din_pb1 { + nvidia,pins = "dap1_din_pb1"; + nvidia,function = "i2s1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap1_dout_pb2 { + nvidia,pins = "dap1_dout_pb2"; + nvidia,function = "i2s1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap1_sclk_pb3 { + nvidia,pins = "dap1_sclk_pb3"; + nvidia,function = "i2s1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi2_mosi_pb4 { + nvidia,pins = "spi2_mosi_pb4"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi2_miso_pb5 { + nvidia,pins = "spi2_miso_pb5"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + 
nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi2_sck_pb6 { + nvidia,pins = "spi2_sck_pb6"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi2_cs0_pb7 { + nvidia,pins = "spi2_cs0_pb7"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi1_mosi_pc0 { + nvidia,pins = "spi1_mosi_pc0"; + nvidia,function = "spi1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi1_miso_pc1 { + nvidia,pins = "spi1_miso_pc1"; + nvidia,function = "spi1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi1_sck_pc2 { + nvidia,pins = "spi1_sck_pc2"; + nvidia,function = "spi1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi1_cs0_pc3 { + nvidia,pins = "spi1_cs0_pc3"; + nvidia,function = "spi1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi1_cs1_pc4 { + nvidia,pins = "spi1_cs1_pc4"; + nvidia,function = "spi1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi4_sck_pc5 { + nvidia,pins = "spi4_sck_pc5"; + nvidia,function = "spi4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi4_cs0_pc6 { + nvidia,pins = "spi4_cs0_pc6"; + nvidia,function = "spi4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi4_mosi_pc7 { + nvidia,pins = "spi4_mosi_pc7"; + nvidia,function = "spi4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi4_miso_pd0 { + nvidia,pins = "spi4_miso_pd0"; + nvidia,function = "spi4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart3_tx_pd1 { + nvidia,pins = "uart3_tx_pd1"; + nvidia,function = "uartc"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart3_rx_pd2 { + nvidia,pins = "uart3_rx_pd2"; + nvidia,function = "uartc"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart3_rts_pd3 { + nvidia,pins = "uart3_rts_pd3"; + nvidia,function = "uartc"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + 
nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart3_cts_pd4 { + nvidia,pins = "uart3_cts_pd4"; + nvidia,function = "uartc"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic1_clk_pe0 { + nvidia,pins = "dmic1_clk_pe0"; + nvidia,function = "dmic1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic1_dat_pe1 { + nvidia,pins = "dmic1_dat_pe1"; + nvidia,function = "dmic1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic2_clk_pe2 { + nvidia,pins = "dmic2_clk_pe2"; + nvidia,function = "dmic2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic2_dat_pe3 { + nvidia,pins = "dmic2_dat_pe3"; + nvidia,function = "dmic2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic3_clk_pe4 { + nvidia,pins = "dmic3_clk_pe4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic3_dat_pe5 { + nvidia,pins = "dmic3_dat_pe5"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pe6 { + nvidia,pins = "pe6"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pe7 { + nvidia,pins = "pe7"; + nvidia,function = "pwm3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gen3_i2c_scl_pf0 { + nvidia,pins = "gen3_i2c_scl_pf0"; + nvidia,function = "i2c3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + gen3_i2c_sda_pf1 { + nvidia,pins = "gen3_i2c_sda_pf1"; + nvidia,function = "i2c3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + uart2_tx_pg0 { + nvidia,pins = "uart2_tx_pg0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart2_rx_pg1 { + nvidia,pins = "uart2_rx_pg1"; + nvidia,function = "uartb"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart2_rts_pg2 { + nvidia,pins = "uart2_rts_pg2"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart2_cts_pg3 { + 
nvidia,pins = "uart2_cts_pg3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + wifi_en_ph0 { + nvidia,pins = "wifi_en_ph0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + wifi_rst_ph1 { + nvidia,pins = "wifi_rst_ph1"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + wifi_wake_ap_ph2 { + nvidia,pins = "wifi_wake_ap_ph2"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + ap_wake_bt_ph3 { + nvidia,pins = "ap_wake_bt_ph3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + bt_rst_ph4 { + nvidia,pins = "bt_rst_ph4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + bt_wake_ap_ph5 { + nvidia,pins = "bt_wake_ap_ph5"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + ph6 { + nvidia,pins = "ph6"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + ap_wake_nfc_ph7 { + nvidia,pins = "ap_wake_nfc_ph7"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + nfc_en_pi0 { + nvidia,pins = "nfc_en_pi0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + nfc_int_pi1 { + nvidia,pins = "nfc_int_pi1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gps_en_pi2 { + nvidia,pins = "gps_en_pi2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gps_rst_pi3 { + nvidia,pins = "gps_rst_pi3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart4_tx_pi4 { + nvidia,pins = "uart4_tx_pi4"; + nvidia,function = "uartd"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart4_rx_pi5 { + nvidia,pins = "uart4_rx_pi5"; + nvidia,function = "uartd"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart4_rts_pi6 { + nvidia,pins = "uart4_rts_pi6"; + nvidia,function = "uartd"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + 
nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart4_cts_pi7 { + nvidia,pins = "uart4_cts_pi7"; + nvidia,function = "uartd"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gen1_i2c_sda_pj0 { + nvidia,pins = "gen1_i2c_sda_pj0"; + nvidia,function = "i2c1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + gen1_i2c_scl_pj1 { + nvidia,pins = "gen1_i2c_scl_pj1"; + nvidia,function = "i2c1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + gen2_i2c_scl_pj2 { + nvidia,pins = "gen2_i2c_scl_pj2"; + nvidia,function = "i2c2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + gen2_i2c_sda_pj3 { + nvidia,pins = "gen2_i2c_sda_pj3"; + nvidia,function = "i2c2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + dap4_fs_pj4 { + nvidia,pins = "dap4_fs_pj4"; + nvidia,function = "i2s4b"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap4_din_pj5 { + nvidia,pins = "dap4_din_pj5"; + nvidia,function = "i2s4b"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap4_dout_pj6 { + nvidia,pins = "dap4_dout_pj6"; + nvidia,function = "i2s4b"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap4_sclk_pj7 { + nvidia,pins = "dap4_sclk_pj7"; + nvidia,function = "i2s4b"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk0 { + nvidia,pins = "pk0"; + nvidia,function = "i2s5b"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk1 { + nvidia,pins = "pk1"; + nvidia,function = "i2s5b"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk2 { + nvidia,pins = "pk2"; + nvidia,function = "i2s5b"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk3 { + nvidia,pins = "pk3"; + nvidia,function = "i2s5b"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk4 { + nvidia,pins = "pk4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; 
+ nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk5 { + nvidia,pins = "pk5"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk6 { + nvidia,pins = "pk6"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk7 { + nvidia,pins = "pk7"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pl0 { + nvidia,pins = "pl0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pl1 { + nvidia,pins = "pl1"; + nvidia,function = "soc"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_clk_pm0 { + nvidia,pins = "sdmmc1_clk_pm0"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_cmd_pm1 { + nvidia,pins = "sdmmc1_cmd_pm1"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_dat3_pm2 { + nvidia,pins = "sdmmc1_dat3_pm2"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_dat2_pm3 { + nvidia,pins = "sdmmc1_dat2_pm3"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_dat1_pm4 { + nvidia,pins = "sdmmc1_dat1_pm4"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_dat0_pm5 { + nvidia,pins = "sdmmc1_dat0_pm5"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_clk_pp0 { + nvidia,pins = "sdmmc3_clk_pp0"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_cmd_pp1 { + nvidia,pins = "sdmmc3_cmd_pp1"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_dat3_pp2 { + nvidia,pins = "sdmmc3_dat3_pp2"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_dat2_pp3 { + nvidia,pins = "sdmmc3_dat2_pp3"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + 
nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_dat1_pp4 { + nvidia,pins = "sdmmc3_dat1_pp4"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_dat0_pp5 { + nvidia,pins = "sdmmc3_dat0_pp5"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam1_mclk_ps0 { + nvidia,pins = "cam1_mclk_ps0"; + nvidia,function = "extperiph3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam2_mclk_ps1 { + nvidia,pins = "cam2_mclk_ps1"; + nvidia,function = "extperiph3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam_i2c_scl_ps2 { + nvidia,pins = "cam_i2c_scl_ps2"; + nvidia,function = "i2cvi"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + cam_i2c_sda_ps3 { + nvidia,pins = "cam_i2c_sda_ps3"; + nvidia,function = "i2cvi"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + cam_rst_ps4 { + nvidia,pins = "cam_rst_ps4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam_af_en_ps5 { + nvidia,pins = "cam_af_en_ps5"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam_flash_en_ps6 { + nvidia,pins = "cam_flash_en_ps6"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam1_pwdn_ps7 { + nvidia,pins = "cam1_pwdn_ps7"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam2_pwdn_pt0 { + nvidia,pins = "cam2_pwdn_pt0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam1_strobe_pt1 { + nvidia,pins = "cam1_strobe_pt1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart1_tx_pu0 { + nvidia,pins = "uart1_tx_pu0"; + nvidia,function = "uarta"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart1_rx_pu1 { + nvidia,pins = "uart1_rx_pu1"; + nvidia,function = "uarta"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + 
nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart1_rts_pu2 { + nvidia,pins = "uart1_rts_pu2"; + nvidia,function = "uarta"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart1_cts_pu3 { + nvidia,pins = "uart1_cts_pu3"; + nvidia,function = "uarta"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_bl_pwm_pv0 { + nvidia,pins = "lcd_bl_pwm_pv0"; + nvidia,function = "pwm0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_bl_en_pv1 { + nvidia,pins = "lcd_bl_en_pv1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_rst_pv2 { + nvidia,pins = "lcd_rst_pv2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_gpio1_pv3 { + nvidia,pins = "lcd_gpio1_pv3"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_gpio2_pv4 { + nvidia,pins = "lcd_gpio2_pv4"; + nvidia,function = "pwm1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + ap_ready_pv5 { + nvidia,pins = "ap_ready_pv5"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + touch_rst_pv6 { + nvidia,pins = "touch_rst_pv6"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + touch_clk_pv7 { + nvidia,pins = "touch_clk_pv7"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + modem_wake_ap_px0 { + nvidia,pins = "modem_wake_ap_px0"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + touch_int_px1 { + nvidia,pins = "touch_int_px1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + motion_int_px2 { + nvidia,pins = "motion_int_px2"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + als_prox_int_px3 { + nvidia,pins = "als_prox_int_px3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + temp_alert_px4 { + nvidia,pins = "temp_alert_px4"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + 
button_power_on_px5 { + nvidia,pins = "button_power_on_px5"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + button_vol_up_px6 { + nvidia,pins = "button_vol_up_px6"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + button_vol_down_px7 { + nvidia,pins = "button_vol_down_px7"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + button_slide_sw_py0 { + nvidia,pins = "button_slide_sw_py0"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + button_home_py1 { + nvidia,pins = "button_home_py1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_te_py2 { + nvidia,pins = "lcd_te_py2"; + nvidia,function = "displaya"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pwr_i2c_scl_py3 { + nvidia,pins = "pwr_i2c_scl_py3"; + nvidia,function = "i2cpmu"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + pwr_i2c_sda_py4 { + nvidia,pins = "pwr_i2c_sda_py4"; + nvidia,function = "i2cpmu"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + clk_32k_out_py5 { + nvidia,pins = "clk_32k_out_py5"; + nvidia,function = "soc"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz0 { + nvidia,pins = "pz0"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz1 { + nvidia,pins = "pz1"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz2 { + nvidia,pins = "pz2"; + nvidia,function = "rsvd2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz3 { + nvidia,pins = "pz3"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz4 { + nvidia,pins = "pz4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz5 { + nvidia,pins = "pz5"; + nvidia,function = "soc"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + 
nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap2_fs_paa0 { + nvidia,pins = "dap2_fs_paa0"; + nvidia,function = "i2s2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap2_sclk_paa1 { + nvidia,pins = "dap2_sclk_paa1"; + nvidia,function = "i2s2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap2_din_paa2 { + nvidia,pins = "dap2_din_paa2"; + nvidia,function = "i2s2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap2_dout_paa3 { + nvidia,pins = "dap2_dout_paa3"; + nvidia,function = "i2s2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + aud_mclk_pbb0 { + nvidia,pins = "aud_mclk_pbb0"; + nvidia,function = "aud"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dvfs_pwm_pbb1 { + nvidia,pins = "dvfs_pwm_pbb1"; + nvidia,function = "cldvfs"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dvfs_clk_pbb2 { + nvidia,pins = "dvfs_clk_pbb2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gpio_x1_aud_pbb3 { + nvidia,pins = "gpio_x1_aud_pbb3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gpio_x3_aud_pbb4 { + nvidia,pins = "gpio_x3_aud_pbb4"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + hdmi_cec_pcc0 { + nvidia,pins = "hdmi_cec_pcc0"; + nvidia,function = "cec"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + hdmi_int_dp_hpd_pcc1 { + nvidia,pins = "hdmi_int_dp_hpd_pcc1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + spdif_out_pcc2 { + nvidia,pins = "spdif_out_pcc2"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spdif_in_pcc3 { + nvidia,pins = "spdif_in_pcc3"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + usb_vbus_en0_pcc4 { + nvidia,pins = "usb_vbus_en0_pcc4"; + nvidia,function = "usb"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + 
nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + usb_vbus_en1_pcc5 { + nvidia,pins = "usb_vbus_en1_pcc5"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + dp_hpd0_pcc6 { + nvidia,pins = "dp_hpd0_pcc6"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pcc7 { + nvidia,pins = "pcc7"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + spi2_cs1_pdd0 { + nvidia,pins = "spi2_cs1_pdd0"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_sck_pee0 { + nvidia,pins = "qspi_sck_pee0"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_cs_n_pee1 { + nvidia,pins = "qspi_cs_n_pee1"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_io0_pee2 { + nvidia,pins = "qspi_io0_pee2"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_io1_pee3 { + nvidia,pins = "qspi_io1_pee3"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_io2_pee4 { + nvidia,pins = "qspi_io2_pee4"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_io3_pee5 { + nvidia,pins = "qspi_io3_pee5"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + core_pwr_req { + nvidia,pins = "core_pwr_req"; + nvidia,function = "core"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cpu_pwr_req { + nvidia,pins = "cpu_pwr_req"; + nvidia,function = "cpu"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pwr_int_n { + nvidia,pins = "pwr_int_n"; + nvidia,function = "pmi"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + clk_32k_in { + nvidia,pins = "clk_32k_in"; + nvidia,function = "clk"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + 
nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + jtag_rtck { + nvidia,pins = "jtag_rtck"; + nvidia,function = "jtag"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + clk_req { + nvidia,pins = "clk_req"; + nvidia,function = "sys"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + shutdown { + nvidia,pins = "shutdown"; + nvidia,function = "shutdown"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + }; + }; +}; diff --git a/arch/arm64/boot/dts/nvidia/tegra210-p2597.dtsi b/arch/arm64/boot/dts/nvidia/tegra210-p2597.dtsi new file mode 100644 index 000000000000..be3eccbe8013 --- /dev/null +++ b/arch/arm64/boot/dts/nvidia/tegra210-p2597.dtsi @@ -0,0 +1,1270 @@ +/ { + model = "NVIDIA Tegra210 P2597 I/O board"; + compatible = "nvidia,p2597", "nvidia,tegra210"; + + pinmux: pinmux@0,700008d4 { + pinctrl-names = "boot"; + pinctrl-0 = <&state_boot>; + + state_boot: pinmux { + pex_l0_rst_n_pa0 { + nvidia,pins = "pex_l0_rst_n_pa0"; + nvidia,function = "pe0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + pex_l0_clkreq_n_pa1 { + nvidia,pins = "pex_l0_clkreq_n_pa1"; + nvidia,function = "pe0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + pex_wake_n_pa2 { + nvidia,pins = "pex_wake_n_pa2"; + nvidia,function = "pe"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + pex_l1_rst_n_pa3 { + nvidia,pins = "pex_l1_rst_n_pa3"; + nvidia,function = "pe1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + pex_l1_clkreq_n_pa4 { + nvidia,pins = "pex_l1_clkreq_n_pa4"; + nvidia,function = "pe1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + sata_led_active_pa5 { + nvidia,pins = "sata_led_active_pa5"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pa6 { + nvidia,pins = "pa6"; + nvidia,function = "sata"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap1_fs_pb0 { + nvidia,pins = "dap1_fs_pb0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap1_din_pb1 { + nvidia,pins = "dap1_din_pb1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + 
nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap1_dout_pb2 { + nvidia,pins = "dap1_dout_pb2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap1_sclk_pb3 { + nvidia,pins = "dap1_sclk_pb3"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi2_mosi_pb4 { + nvidia,pins = "spi2_mosi_pb4"; + nvidia,function = "spi2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi2_miso_pb5 { + nvidia,pins = "spi2_miso_pb5"; + nvidia,function = "spi2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi2_sck_pb6 { + nvidia,pins = "spi2_sck_pb6"; + nvidia,function = "spi2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi2_cs0_pb7 { + nvidia,pins = "spi2_cs0_pb7"; + nvidia,function = "spi2"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi1_mosi_pc0 { + nvidia,pins = "spi1_mosi_pc0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi1_miso_pc1 { + nvidia,pins = "spi1_miso_pc1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi1_sck_pc2 { + nvidia,pins = "spi1_sck_pc2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi1_cs0_pc3 { + nvidia,pins = "spi1_cs0_pc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi1_cs1_pc4 { + nvidia,pins = "spi1_cs1_pc4"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi4_sck_pc5 { + nvidia,pins = "spi4_sck_pc5"; + nvidia,function = "spi4"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi4_cs0_pc6 { + nvidia,pins = "spi4_cs0_pc6"; + nvidia,function = "spi4"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi4_mosi_pc7 { + nvidia,pins = "spi4_mosi_pc7"; + nvidia,function = "spi4"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spi4_miso_pd0 { + nvidia,pins = "spi4_miso_pd0"; + nvidia,function = "spi4"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + 
nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart3_tx_pd1 { + nvidia,pins = "uart3_tx_pd1"; + nvidia,function = "uartc"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart3_rx_pd2 { + nvidia,pins = "uart3_rx_pd2"; + nvidia,function = "uartc"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart3_rts_pd3 { + nvidia,pins = "uart3_rts_pd3"; + nvidia,function = "uartc"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart3_cts_pd4 { + nvidia,pins = "uart3_cts_pd4"; + nvidia,function = "uartc"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic1_clk_pe0 { + nvidia,pins = "dmic1_clk_pe0"; + nvidia,function = "i2s3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic1_dat_pe1 { + nvidia,pins = "dmic1_dat_pe1"; + nvidia,function = "i2s3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic2_clk_pe2 { + nvidia,pins = "dmic2_clk_pe2"; + nvidia,function = "i2s3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic2_dat_pe3 { + nvidia,pins = "dmic2_dat_pe3"; + nvidia,function = "i2s3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic3_clk_pe4 { + nvidia,pins = "dmic3_clk_pe4"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dmic3_dat_pe5 { + nvidia,pins = "dmic3_dat_pe5"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pe6 { + nvidia,pins = "pe6"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pe7 { + nvidia,pins = "pe7"; + nvidia,function = "pwm3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gen3_i2c_scl_pf0 { + nvidia,pins = "gen3_i2c_scl_pf0"; + nvidia,function = "i2c3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + gen3_i2c_sda_pf1 { + nvidia,pins = "gen3_i2c_sda_pf1"; + nvidia,function = "i2c3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = 
<TEGRA_PIN_DISABLE>; + }; + uart2_tx_pg0 { + nvidia,pins = "uart2_tx_pg0"; + nvidia,function = "uartb"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart2_rx_pg1 { + nvidia,pins = "uart2_rx_pg1"; + nvidia,function = "uartb"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart2_rts_pg2 { + nvidia,pins = "uart2_rts_pg2"; + nvidia,function = "uartb"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart2_cts_pg3 { + nvidia,pins = "uart2_cts_pg3"; + nvidia,function = "uartb"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + wifi_en_ph0 { + nvidia,pins = "wifi_en_ph0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + wifi_rst_ph1 { + nvidia,pins = "wifi_rst_ph1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + wifi_wake_ap_ph2 { + nvidia,pins = "wifi_wake_ap_ph2"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + ap_wake_bt_ph3 { + nvidia,pins = "ap_wake_bt_ph3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + bt_rst_ph4 { + nvidia,pins = "bt_rst_ph4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + bt_wake_ap_ph5 { + nvidia,pins = "bt_wake_ap_ph5"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + ph6 { + nvidia,pins = "ph6"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + ap_wake_nfc_ph7 { + nvidia,pins = "ap_wake_nfc_ph7"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + nfc_en_pi0 { + nvidia,pins = "nfc_en_pi0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + nfc_int_pi1 { + nvidia,pins = "nfc_int_pi1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gps_en_pi2 { + nvidia,pins = "gps_en_pi2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gps_rst_pi3 { + nvidia,pins = "gps_rst_pi3"; + nvidia,function = "rsvd0"; + nvidia,pull = 
<TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart4_tx_pi4 { + nvidia,pins = "uart4_tx_pi4"; + nvidia,function = "uartd"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart4_rx_pi5 { + nvidia,pins = "uart4_rx_pi5"; + nvidia,function = "uartd"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart4_rts_pi6 { + nvidia,pins = "uart4_rts_pi6"; + nvidia,function = "uartd"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart4_cts_pi7 { + nvidia,pins = "uart4_cts_pi7"; + nvidia,function = "uartd"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gen1_i2c_sda_pj0 { + nvidia,pins = "gen1_i2c_sda_pj0"; + nvidia,function = "i2c1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + gen1_i2c_scl_pj1 { + nvidia,pins = "gen1_i2c_scl_pj1"; + nvidia,function = "i2c1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + gen2_i2c_scl_pj2 { + nvidia,pins = "gen2_i2c_scl_pj2"; + nvidia,function = "i2c2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + gen2_i2c_sda_pj3 { + nvidia,pins = "gen2_i2c_sda_pj3"; + nvidia,function = "i2c2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + dap4_fs_pj4 { + nvidia,pins = "dap4_fs_pj4"; + nvidia,function = "i2s4b"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap4_din_pj5 { + nvidia,pins = "dap4_din_pj5"; + nvidia,function = "i2s4b"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap4_dout_pj6 { + nvidia,pins = "dap4_dout_pj6"; + nvidia,function = "i2s4b"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap4_sclk_pj7 { + nvidia,pins = "dap4_sclk_pj7"; + nvidia,function = "i2s4b"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk0 { + nvidia,pins = "pk0"; + nvidia,function = "i2s5b"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = 
<TEGRA_PIN_DISABLE>; + }; + pk1 { + nvidia,pins = "pk1"; + nvidia,function = "i2s5b"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk2 { + nvidia,pins = "pk2"; + nvidia,function = "i2s5b"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk3 { + nvidia,pins = "pk3"; + nvidia,function = "i2s5b"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk4 { + nvidia,pins = "pk4"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk5 { + nvidia,pins = "pk5"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk6 { + nvidia,pins = "pk6"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pk7 { + nvidia,pins = "pk7"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pl0 { + nvidia,pins = "pl0"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pl1 { + nvidia,pins = "pl1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_clk_pm0 { + nvidia,pins = "sdmmc1_clk_pm0"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_cmd_pm1 { + nvidia,pins = "sdmmc1_cmd_pm1"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_dat3_pm2 { + nvidia,pins = "sdmmc1_dat3_pm2"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_dat2_pm3 { + nvidia,pins = "sdmmc1_dat2_pm3"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_dat1_pm4 { + nvidia,pins = "sdmmc1_dat1_pm4"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc1_dat0_pm5 { + nvidia,pins = "sdmmc1_dat0_pm5"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_clk_pp0 { + nvidia,pins = "sdmmc3_clk_pp0"; + 
nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_cmd_pp1 { + nvidia,pins = "sdmmc3_cmd_pp1"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_dat3_pp2 { + nvidia,pins = "sdmmc3_dat3_pp2"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_dat2_pp3 { + nvidia,pins = "sdmmc3_dat2_pp3"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_dat1_pp4 { + nvidia,pins = "sdmmc3_dat1_pp4"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + sdmmc3_dat0_pp5 { + nvidia,pins = "sdmmc3_dat0_pp5"; + nvidia,function = "sdmmc3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam1_mclk_ps0 { + nvidia,pins = "cam1_mclk_ps0"; + nvidia,function = "extperiph3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam2_mclk_ps1 { + nvidia,pins = "cam2_mclk_ps1"; + nvidia,function = "extperiph3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam_i2c_scl_ps2 { + nvidia,pins = "cam_i2c_scl_ps2"; + nvidia,function = "i2cvi"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + cam_i2c_sda_ps3 { + nvidia,pins = "cam_i2c_sda_ps3"; + nvidia,function = "i2cvi"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + cam_rst_ps4 { + nvidia,pins = "cam_rst_ps4"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam_af_en_ps5 { + nvidia,pins = "cam_af_en_ps5"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam_flash_en_ps6 { + nvidia,pins = "cam_flash_en_ps6"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam1_pwdn_ps7 { + nvidia,pins = "cam1_pwdn_ps7"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam2_pwdn_pt0 { + nvidia,pins = "cam2_pwdn_pt0"; + nvidia,pull = 
<TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cam1_strobe_pt1 { + nvidia,pins = "cam1_strobe_pt1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart1_tx_pu0 { + nvidia,pins = "uart1_tx_pu0"; + nvidia,function = "uarta"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart1_rx_pu1 { + nvidia,pins = "uart1_rx_pu1"; + nvidia,function = "uarta"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart1_rts_pu2 { + nvidia,pins = "uart1_rts_pu2"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + uart1_cts_pu3 { + nvidia,pins = "uart1_cts_pu3"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_bl_pwm_pv0 { + nvidia,pins = "lcd_bl_pwm_pv0"; + nvidia,function = "pwm0"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_bl_en_pv1 { + nvidia,pins = "lcd_bl_en_pv1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_rst_pv2 { + nvidia,pins = "lcd_rst_pv2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_gpio1_pv3 { + nvidia,pins = "lcd_gpio1_pv3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_gpio2_pv4 { + nvidia,pins = "lcd_gpio2_pv4"; + nvidia,function = "pwm1"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + ap_ready_pv5 { + nvidia,pins = "ap_ready_pv5"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + touch_rst_pv6 { + nvidia,pins = "touch_rst_pv6"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + touch_clk_pv7 { + nvidia,pins = "touch_clk_pv7"; + nvidia,function = "touch"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + modem_wake_ap_px0 { + nvidia,pins = "modem_wake_ap_px0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + touch_int_px1 { + nvidia,pins = "touch_int_px1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = 
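/*
 * Board files like this one typically expose the whole block above as the
 * device's "default" pinctrl state. A minimal sketch of the consumer side,
 * using the standard pinctrl consumer API (the probe function and device
 * here are hypothetical):
 *
 *   #include <linux/pinctrl/consumer.h>
 *
 *   static int example_probe(struct platform_device *pdev)
 *   {
 *           struct pinctrl *p;
 *
 *           p = devm_pinctrl_get_select_default(&pdev->dev);
 *           if (IS_ERR(p))
 *                   return PTR_ERR(p);
 *           return 0;
 *   }
 *
 * In practice the driver core selects the "default" state before probe()
 * runs, so most drivers never call this API themselves.
 */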
<TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + motion_int_px2 { + nvidia,pins = "motion_int_px2"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + als_prox_int_px3 { + nvidia,pins = "als_prox_int_px3"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + temp_alert_px4 { + nvidia,pins = "temp_alert_px4"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + button_power_on_px5 { + nvidia,pins = "button_power_on_px5"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + button_vol_up_px6 { + nvidia,pins = "button_vol_up_px6"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + button_vol_down_px7 { + nvidia,pins = "button_vol_down_px7"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + button_slide_sw_py0 { + nvidia,pins = "button_slide_sw_py0"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + button_home_py1 { + nvidia,pins = "button_home_py1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + lcd_te_py2 { + nvidia,pins = "lcd_te_py2"; + nvidia,function = "displaya"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pwr_i2c_scl_py3 { + nvidia,pins = "pwr_i2c_scl_py3"; + nvidia,function = "i2cpmu"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + pwr_i2c_sda_py4 { + nvidia,pins = "pwr_i2c_sda_py4"; + nvidia,function = "i2cpmu"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + clk_32k_out_py5 { + nvidia,pins = "clk_32k_out_py5"; + nvidia,function = "soc"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz0 { + nvidia,pins = "pz0"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz1 { + nvidia,pins = "pz1"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz2 { + nvidia,pins = "pz2"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = 
<TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz3 { + nvidia,pins = "pz3"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz4 { + nvidia,pins = "pz4"; + nvidia,function = "sdmmc1"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pz5 { + nvidia,pins = "pz5"; + nvidia,function = "soc"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap2_fs_paa0 { + nvidia,pins = "dap2_fs_paa0"; + nvidia,function = "i2s2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap2_sclk_paa1 { + nvidia,pins = "dap2_sclk_paa1"; + nvidia,function = "i2s2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap2_din_paa2 { + nvidia,pins = "dap2_din_paa2"; + nvidia,function = "i2s2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dap2_dout_paa3 { + nvidia,pins = "dap2_dout_paa3"; + nvidia,function = "i2s2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + aud_mclk_pbb0 { + nvidia,pins = "aud_mclk_pbb0"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dvfs_pwm_pbb1 { + nvidia,pins = "dvfs_pwm_pbb1"; + nvidia,function = "cldvfs"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + dvfs_clk_pbb2 { + nvidia,pins = "dvfs_clk_pbb2"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gpio_x1_aud_pbb3 { + nvidia,pins = "gpio_x1_aud_pbb3"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + gpio_x3_aud_pbb4 { + nvidia,pins = "gpio_x3_aud_pbb4"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + hdmi_cec_pcc0 { + nvidia,pins = "hdmi_cec_pcc0"; + nvidia,function = "cec"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + hdmi_int_dp_hpd_pcc1 { + nvidia,pins = "hdmi_int_dp_hpd_pcc1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + spdif_out_pcc2 { + 
nvidia,pins = "spdif_out_pcc2"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + spdif_in_pcc3 { + nvidia,pins = "spdif_in_pcc3"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + usb_vbus_en0_pcc4 { + nvidia,pins = "usb_vbus_en0_pcc4"; + nvidia,function = "usb"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + usb_vbus_en1_pcc5 { + nvidia,pins = "usb_vbus_en1_pcc5"; + nvidia,function = "usb"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_ENABLE>; + }; + dp_hpd0_pcc6 { + nvidia,pins = "dp_hpd0_pcc6"; + nvidia,function = "dp"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pcc7 { + nvidia,pins = "pcc7"; + nvidia,function = "rsvd0"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + nvidia,io-hv = <TEGRA_PIN_DISABLE>; + }; + spi2_cs1_pdd0 { + nvidia,pins = "spi2_cs1_pdd0"; + nvidia,function = "spi2"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_sck_pee0 { + nvidia,pins = "qspi_sck_pee0"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_cs_n_pee1 { + nvidia,pins = "qspi_cs_n_pee1"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_io0_pee2 { + nvidia,pins = "qspi_io0_pee2"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_io1_pee3 { + nvidia,pins = "qspi_io1_pee3"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_io2_pee4 { + nvidia,pins = "qspi_io2_pee4"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + qspi_io3_pee5 { + nvidia,pins = "qspi_io3_pee5"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + core_pwr_req { + nvidia,pins = "core_pwr_req"; + nvidia,function = "core"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = 
<TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + cpu_pwr_req { + nvidia,pins = "cpu_pwr_req"; + nvidia,function = "cpu"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + pwr_int_n { + nvidia,pins = "pwr_int_n"; + nvidia,function = "pmi"; + nvidia,pull = <TEGRA_PIN_PULL_UP>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + clk_32k_in { + nvidia,pins = "clk_32k_in"; + nvidia,function = "clk"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_ENABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + jtag_rtck { + nvidia,pins = "jtag_rtck"; + nvidia,function = "jtag"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + clk_req { + nvidia,pins = "clk_req"; + nvidia,function = "rsvd1"; + nvidia,pull = <TEGRA_PIN_PULL_DOWN>; + nvidia,tristate = <TEGRA_PIN_ENABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + shutdown { + nvidia,pins = "shutdown"; + nvidia,function = "shutdown"; + nvidia,pull = <TEGRA_PIN_PULL_NONE>; + nvidia,tristate = <TEGRA_PIN_DISABLE>; + nvidia,enable-input = <TEGRA_PIN_DISABLE>; + nvidia,open-drain = <TEGRA_PIN_DISABLE>; + }; + }; + }; + + /* MMC/SD */ + sdhci@0,700b0000 { + status = "okay"; + bus-width = <4>; + no-1-8-v; + + cd-gpios = <&gpio TEGRA_GPIO(Z, 1) GPIO_ACTIVE_LOW>; + }; +}; diff --git a/arch/arm64/boot/dts/nvidia/tegra210.dtsi b/arch/arm64/boot/dts/nvidia/tegra210.dtsi new file mode 100644 index 000000000000..bc23f4dea002 --- /dev/null +++ b/arch/arm64/boot/dts/nvidia/tegra210.dtsi @@ -0,0 +1,805 @@ +#include <dt-bindings/clock/tegra210-car.h> +#include <dt-bindings/gpio/tegra-gpio.h> +#include <dt-bindings/memory/tegra210-mc.h> +#include <dt-bindings/pinctrl/pinctrl-tegra.h> +#include <dt-bindings/interrupt-controller/arm-gic.h> + +/ { + compatible = "nvidia,tegra210"; + interrupt-parent = <&lic>; + #address-cells = <2>; + #size-cells = <2>; + + host1x@0,50000000 { + compatible = "nvidia,tegra210-host1x", "simple-bus"; + reg = <0x0 0x50000000 0x0 0x00034000>; + interrupts = <GIC_SPI 65 IRQ_TYPE_LEVEL_HIGH>, /* syncpt */ + <GIC_SPI 67 IRQ_TYPE_LEVEL_HIGH>; /* general */ + clocks = <&tegra_car TEGRA210_CLK_HOST1X>; + clock-names = "host1x"; + resets = <&tegra_car 28>; + reset-names = "host1x"; + + #address-cells = <2>; + #size-cells = <2>; + + ranges = <0x0 0x54000000 0x0 0x54000000 0x0 0x01000000>; + + dpaux1: dpaux@0,54040000 { + compatible = "nvidia,tegra210-dpaux"; + reg = <0x0 0x54040000 0x0 0x00040000>; + interrupts = <GIC_SPI 11 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_DPAUX1>, + <&tegra_car TEGRA210_CLK_PLL_DP>; + clock-names = "dpaux", "parent"; + resets = <&tegra_car 207>; + reset-names = "dpaux"; + status = "disabled"; + }; + + vi@0,54080000 { + compatible = "nvidia,tegra210-vi"; + reg = <0x0 0x54080000 0x0 0x00040000>; + interrupts = <GIC_SPI 69 IRQ_TYPE_LEVEL_HIGH>; + status = "disabled"; + }; + + tsec@0,54100000 { + compatible = "nvidia,tegra210-tsec"; + reg = <0x0 0x54100000 0x0 0x00040000>; + }; + + dc@0,54200000 { + compatible = "nvidia,tegra210-dc"; + reg = <0x0 0x54200000 0x0 0x00040000>; + interrupts = <GIC_SPI 73 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car 
TEGRA210_CLK_DISP1>, + <&tegra_car TEGRA210_CLK_PLL_P>; + clock-names = "dc", "parent"; + resets = <&tegra_car 27>; + reset-names = "dc"; + + iommus = <&mc TEGRA_SWGROUP_DC>; + + nvidia,head = <0>; + }; + + dc@0,54240000 { + compatible = "nvidia,tegra210-dc"; + reg = <0x0 0x54240000 0x0 0x00040000>; + interrupts = <GIC_SPI 74 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_DISP2>, + <&tegra_car TEGRA210_CLK_PLL_P>; + clock-names = "dc", "parent"; + resets = <&tegra_car 26>; + reset-names = "dc"; + + iommus = <&mc TEGRA_SWGROUP_DCB>; + + nvidia,head = <1>; + }; + + dsi@0,54300000 { + compatible = "nvidia,tegra210-dsi"; + reg = <0x0 0x54300000 0x0 0x00040000>; + clocks = <&tegra_car TEGRA210_CLK_DSIA>, + <&tegra_car TEGRA210_CLK_DSIALP>, + <&tegra_car TEGRA210_CLK_PLL_D_OUT0>; + clock-names = "dsi", "lp", "parent"; + resets = <&tegra_car 48>; + reset-names = "dsi"; + nvidia,mipi-calibrate = <&mipi 0x0c0>; /* DSIA & DSIB pads */ + + status = "disabled"; + + #address-cells = <1>; + #size-cells = <0>; + }; + + vic@0,54340000 { + compatible = "nvidia,tegra210-vic"; + reg = <0x0 0x54340000 0x0 0x00040000>; + status = "disabled"; + }; + + nvjpg@0,54380000 { + compatible = "nvidia,tegra210-nvjpg"; + reg = <0x0 0x54380000 0x0 0x00040000>; + status = "disabled"; + }; + + dsi@0,54400000 { + compatible = "nvidia,tegra210-dsi"; + reg = <0x0 0x54400000 0x0 0x00040000>; + clocks = <&tegra_car TEGRA210_CLK_DSIB>, + <&tegra_car TEGRA210_CLK_DSIBLP>, + <&tegra_car TEGRA210_CLK_PLL_D_OUT0>; + clock-names = "dsi", "lp", "parent"; + resets = <&tegra_car 82>; + reset-names = "dsi"; + nvidia,mipi-calibrate = <&mipi 0x300>; /* DSIC & DSID pads */ + + status = "disabled"; + + #address-cells = <1>; + #size-cells = <0>; + }; + + nvdec@0,54480000 { + compatible = "nvidia,tegra210-nvdec"; + reg = <0x0 0x54480000 0x0 0x00040000>; + status = "disabled"; + }; + + nvenc@0,544c0000 { + compatible = "nvidia,tegra210-nvenc"; + reg = <0x0 0x544c0000 0x0 0x00040000>; + status = "disabled"; + }; + + tsec@0,54500000 { + compatible = "nvidia,tegra210-tsec"; + reg = <0x0 0x54500000 0x0 0x00040000>; + status = "disabled"; + }; + + sor@0,54540000 { + compatible = "nvidia,tegra210-sor"; + reg = <0x0 0x54540000 0x0 0x00040000>; + interrupts = <GIC_SPI 76 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_SOR0>, + <&tegra_car TEGRA210_CLK_PLL_D_OUT0>, + <&tegra_car TEGRA210_CLK_PLL_DP>, + <&tegra_car TEGRA210_CLK_SOR_SAFE>; + clock-names = "sor", "parent", "dp", "safe"; + resets = <&tegra_car 182>; + reset-names = "sor"; + status = "disabled"; + }; + + sor@0,54580000 { + compatible = "nvidia,tegra210-sor1"; + reg = <0x0 0x54580000 0x0 0x00040000>; + interrupts = <GIC_SPI 75 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_SOR1>, + <&tegra_car TEGRA210_CLK_PLL_D2_OUT0>, + <&tegra_car TEGRA210_CLK_PLL_DP>, + <&tegra_car TEGRA210_CLK_SOR_SAFE>; + clock-names = "sor", "parent", "dp", "safe"; + resets = <&tegra_car 183>; + reset-names = "sor"; + status = "disabled"; + }; + + dpaux: dpaux@0,545c0000 { + compatible = "nvidia,tegra124-dpaux"; + reg = <0x0 0x545c0000 0x0 0x00040000>; + interrupts = <GIC_SPI 159 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_DPAUX>, + <&tegra_car TEGRA210_CLK_PLL_DP>; + clock-names = "dpaux", "parent"; + resets = <&tegra_car 181>; + reset-names = "dpaux"; + status = "disabled"; + }; + + isp@0,54600000 { + compatible = "nvidia,tegra210-isp"; + reg = <0x0 0x54600000 0x0 0x00040000>; + interrupts = <GIC_SPI 71 IRQ_TYPE_LEVEL_HIGH>; + status = "disabled"; + }; + + isp@0,54680000 { 
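/*
 * The display nodes above all follow the same clocks/resets consumer
 * pattern: entries are looked up by the strings in clock-names and
 * reset-names. A hedged sketch of the driver side ("dc" matches the names
 * above; error handling and the exact bring-up order are elided):
 *
 *   struct clk *clk = devm_clk_get(&pdev->dev, "dc");
 *   struct reset_control *rst = devm_reset_control_get(&pdev->dev, "dc");
 *
 *   reset_control_assert(rst);
 *   clk_prepare_enable(clk);
 *   reset_control_deassert(rst);
 *
 * The bare numbers in <&tegra_car 27> and friends index the CAR module's
 * reset lines directly, which is why no named constants appear there.
 */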
+ compatible = "nvidia,tegra210-isp"; + reg = <0x0 0x54680000 0x0 0x00040000>; + interrupts = <GIC_SPI 70 IRQ_TYPE_LEVEL_HIGH>; + status = "disabled"; + }; + + i2c@0,546c0000 { + compatible = "nvidia,tegra210-i2c-vi"; + reg = <0x0 0x546c0000 0x0 0x00040000>; + interrupts = <GIC_SPI 17 IRQ_TYPE_LEVEL_HIGH>; + status = "disabled"; + }; + }; + + gic: interrupt-controller@0,50041000 { + compatible = "arm,gic-400"; + #interrupt-cells = <3>; + interrupt-controller; + reg = <0x0 0x50041000 0x0 0x1000>, + <0x0 0x50042000 0x0 0x2000>, + <0x0 0x50044000 0x0 0x2000>, + <0x0 0x50046000 0x0 0x2000>; + interrupts = <GIC_PPI 9 + (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_HIGH)>; + interrupt-parent = <&gic>; + }; + + gpu@0,57000000 { + compatible = "nvidia,gm20b"; + reg = <0x0 0x57000000 0x0 0x01000000>, + <0x0 0x58000000 0x0 0x01000000>; + interrupts = <GIC_SPI 157 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 158 IRQ_TYPE_LEVEL_HIGH>; + interrupt-names = "stall", "nonstall"; + clocks = <&tegra_car TEGRA210_CLK_GPU>, + <&tegra_car TEGRA210_CLK_PLL_P_OUT5>; + clock-names = "gpu", "pwr"; + resets = <&tegra_car 184>; + reset-names = "gpu"; + status = "disabled"; + }; + + lic: interrupt-controller@0,60004000 { + compatible = "nvidia,tegra210-ictlr"; + reg = <0x0 0x60004000 0x0 0x40>, /* primary controller */ + <0x0 0x60004100 0x0 0x40>, /* secondary controller */ + <0x0 0x60004200 0x0 0x40>, /* tertiary controller */ + <0x0 0x60004300 0x0 0x40>, /* quaternary controller */ + <0x0 0x60004400 0x0 0x40>, /* quinary controller */ + <0x0 0x60004500 0x0 0x40>; /* senary controller */ + interrupt-controller; + #interrupt-cells = <3>; + interrupt-parent = <&gic>; + }; + + timer@0,60005000 { + compatible = "nvidia,tegra210-timer", "nvidia,tegra20-timer"; + reg = <0x0 0x60005000 0x0 0x400>; + interrupts = <GIC_SPI 0 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 1 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 41 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 42 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 121 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 122 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_TIMER>; + clock-names = "timer"; + }; + + tegra_car: clock@0,60006000 { + compatible = "nvidia,tegra210-car"; + reg = <0x0 0x60006000 0x0 0x1000>; + #clock-cells = <1>; + #reset-cells = <1>; + }; + + flow-controller@0,60007000 { + compatible = "nvidia,tegra210-flowctrl"; + reg = <0x0 0x60007000 0x0 0x1000>; + }; + + gpio: gpio@0,6000d000 { + compatible = "nvidia,tegra210-gpio", "nvidia,tegra124-gpio", "nvidia,tegra30-gpio"; + reg = <0x0 0x6000d000 0x0 0x1000>; + interrupts = <GIC_SPI 32 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 33 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 34 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 35 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 55 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 87 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 89 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 125 IRQ_TYPE_LEVEL_HIGH>; + #gpio-cells = <2>; + gpio-controller; + #interrupt-cells = <2>; + interrupt-controller; + }; + + apbdma: dma@0,60020000 { + compatible = "nvidia,tegra210-apbdma", "nvidia,tegra148-apbdma"; + reg = <0x0 0x60020000 0x0 0x1400>; + interrupts = <GIC_SPI 104 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 105 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 106 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 107 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 108 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 109 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 110 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 111 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 112 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 113 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 114 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 115 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 116 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 117 
IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 118 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 119 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 128 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 129 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 130 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 131 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 132 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 133 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 134 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 135 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 136 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 137 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 138 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 139 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 140 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 141 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 142 IRQ_TYPE_LEVEL_HIGH>, + <GIC_SPI 143 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_APBDMA>; + clock-names = "dma"; + resets = <&tegra_car 34>; + reset-names = "dma"; + #dma-cells = <1>; + }; + + apbmisc@0,70000800 { + compatible = "nvidia,tegra210-apbmisc", "nvidia,tegra20-apbmisc"; + reg = <0x0 0x70000800 0x0 0x64>, /* Chip revision */ + <0x0 0x7000e864 0x0 0x04>; /* Strapping options */ + }; + + pinmux: pinmux@0,700008d4 { + compatible = "nvidia,tegra210-pinmux"; + reg = <0x0 0x700008d4 0x0 0x29c>, /* Pad control registers */ + <0x0 0x70003000 0x0 0x294>; /* Mux registers */ + }; + + /* + * There are two serial drivers, i.e. the 8250 based simple serial + * driver and the APB DMA based serial driver for higher baudrate + * and performance. To enable the 8250 based driver, the compatible + * is "nvidia,tegra124-uart", "nvidia,tegra20-uart" and to enable + * the APB DMA based serial driver, the compatible is + * "nvidia,tegra124-hsuart", "nvidia,tegra30-hsuart". + */ + uarta: serial@0,70006000 { + compatible = "nvidia,tegra210-uart", "nvidia,tegra20-uart"; + reg = <0x0 0x70006000 0x0 0x40>; + reg-shift = <2>; + interrupts = <GIC_SPI 36 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_UARTA>; + clock-names = "serial"; + resets = <&tegra_car 6>; + reset-names = "serial"; + dmas = <&apbdma 8>, <&apbdma 8>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + uartb: serial@0,70006040 { + compatible = "nvidia,tegra210-uart", "nvidia,tegra20-uart"; + reg = <0x0 0x70006040 0x0 0x40>; + reg-shift = <2>; + interrupts = <GIC_SPI 37 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_UARTB>; + clock-names = "serial"; + resets = <&tegra_car 7>; + reset-names = "serial"; + dmas = <&apbdma 9>, <&apbdma 9>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + uartc: serial@0,70006200 { + compatible = "nvidia,tegra210-uart", "nvidia,tegra20-uart"; + reg = <0x0 0x70006200 0x0 0x40>; + reg-shift = <2>; + interrupts = <GIC_SPI 46 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_UARTC>; + clock-names = "serial"; + resets = <&tegra_car 55>; + reset-names = "serial"; + dmas = <&apbdma 10>, <&apbdma 10>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + uartd: serial@0,70006300 { + compatible = "nvidia,tegra210-uart", "nvidia,tegra20-uart"; + reg = <0x0 0x70006300 0x0 0x40>; + reg-shift = <2>; + interrupts = <GIC_SPI 90 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_UARTD>; + clock-names = "serial"; + resets = <&tegra_car 65>; + reset-names = "serial"; + dmas = <&apbdma 19>, <&apbdma 19>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + pwm: pwm@0,7000a000 { + compatible = "nvidia,tegra210-pwm", "nvidia,tegra20-pwm"; + reg = <0x0 0x7000a000 0x0 0x100>; + #pwm-cells = <2>; + clocks = <&tegra_car TEGRA210_CLK_PWM>; + clock-names = "pwm"; + resets = <&tegra_car 17>; + reset-names = "pwm"; + status = "disabled"; + }; + + i2c@0,7000c000 { 
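/*
 * The dmas = <&apbdma N>, <&apbdma N>; plus dma-names = "rx", "tx"; pairs
 * in the serial, i2c and spi nodes are resolved through the generic
 * DMA-engine binding. A minimal sketch of the consumer side (standard API;
 * the surrounding driver code is assumed):
 *
 *   struct dma_chan *rx = dma_request_slave_channel(&pdev->dev, "rx");
 *   struct dma_chan *tx = dma_request_slave_channel(&pdev->dev, "tx");
 *
 * #dma-cells = <1> in the apbdma node means each phandle carries one extra
 * cell, the APB DMA request selector (8 for UARTA, 21 for I2C1, and so on).
 */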
+ compatible = "nvidia,tegra210-i2c", "nvidia,tegra114-i2c"; + reg = <0x0 0x7000c000 0x0 0x100>; + interrupts = <GIC_SPI 38 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA210_CLK_I2C1>; + clock-names = "div-clk"; + resets = <&tegra_car 12>; + reset-names = "i2c"; + dmas = <&apbdma 21>, <&apbdma 21>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + i2c@0,7000c400 { + compatible = "nvidia,tegra210-i2c", "nvidia,tegra114-i2c"; + reg = <0x0 0x7000c400 0x0 0x100>; + interrupts = <GIC_SPI 84 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA210_CLK_I2C2>; + clock-names = "div-clk"; + resets = <&tegra_car 54>; + reset-names = "i2c"; + dmas = <&apbdma 22>, <&apbdma 22>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + i2c@0,7000c500 { + compatible = "nvidia,tegra210-i2c", "nvidia,tegra114-i2c"; + reg = <0x0 0x7000c500 0x0 0x100>; + interrupts = <GIC_SPI 92 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA210_CLK_I2C3>; + clock-names = "div-clk"; + resets = <&tegra_car 67>; + reset-names = "i2c"; + dmas = <&apbdma 23>, <&apbdma 23>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + i2c@0,7000c700 { + compatible = "nvidia,tegra210-i2c", "nvidia,tegra114-i2c"; + reg = <0x0 0x7000c700 0x0 0x100>; + interrupts = <GIC_SPI 120 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA210_CLK_I2C4>; + clock-names = "div-clk"; + resets = <&tegra_car 103>; + reset-names = "i2c"; + dmas = <&apbdma 26>, <&apbdma 26>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + i2c@0,7000d000 { + compatible = "nvidia,tegra210-i2c", "nvidia,tegra114-i2c"; + reg = <0x0 0x7000d000 0x0 0x100>; + interrupts = <GIC_SPI 53 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA210_CLK_I2C5>; + clock-names = "div-clk"; + resets = <&tegra_car 47>; + reset-names = "i2c"; + dmas = <&apbdma 24>, <&apbdma 24>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + i2c@0,7000d100 { + compatible = "nvidia,tegra210-i2c", "nvidia,tegra114-i2c"; + reg = <0x0 0x7000d100 0x0 0x100>; + interrupts = <GIC_SPI 63 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA210_CLK_I2C6>; + clock-names = "div-clk"; + resets = <&tegra_car 166>; + reset-names = "i2c"; + dmas = <&apbdma 30>, <&apbdma 30>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + spi@0,7000d400 { + compatible = "nvidia,tegra210-spi", "nvidia,tegra114-spi"; + reg = <0x0 0x7000d400 0x0 0x200>; + interrupts = <GIC_SPI 59 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA210_CLK_SBC1>; + clock-names = "spi"; + resets = <&tegra_car 41>; + reset-names = "spi"; + dmas = <&apbdma 15>, <&apbdma 15>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + spi@0,7000d600 { + compatible = "nvidia,tegra210-spi", "nvidia,tegra114-spi"; + reg = <0x0 0x7000d600 0x0 0x200>; + interrupts = <GIC_SPI 82 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA210_CLK_SBC2>; + clock-names = "spi"; + resets = <&tegra_car 44>; + reset-names = "spi"; + dmas = <&apbdma 16>, <&apbdma 16>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + spi@0,7000d800 { + compatible = "nvidia,tegra210-spi", "nvidia,tegra114-spi"; + reg = <0x0 0x7000d800 0x0 0x200>; + interrupts = <GIC_SPI 83 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = 
<1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA210_CLK_SBC3>; + clock-names = "spi"; + resets = <&tegra_car 46>; + reset-names = "spi"; + dmas = <&apbdma 17>, <&apbdma 17>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + spi@0,7000da00 { + compatible = "nvidia,tegra210-spi", "nvidia,tegra114-spi"; + reg = <0x0 0x7000da00 0x0 0x200>; + interrupts = <GIC_SPI 93 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA210_CLK_SBC4>; + clock-names = "spi"; + resets = <&tegra_car 68>; + reset-names = "spi"; + dmas = <&apbdma 18>, <&apbdma 18>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + rtc@0,7000e000 { + compatible = "nvidia,tegra210-rtc", "nvidia,tegra20-rtc"; + reg = <0x0 0x7000e000 0x0 0x100>; + interrupts = <GIC_SPI 2 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_RTC>; + clock-names = "rtc"; + }; + + pmc: pmc@0,7000e400 { + compatible = "nvidia,tegra210-pmc"; + reg = <0x0 0x7000e400 0x0 0x400>; + clocks = <&tegra_car TEGRA210_CLK_PCLK>, <&clk32k_in>; + clock-names = "pclk", "clk32k_in"; + + #power-domain-cells = <1>; + }; + + fuse@0,7000f800 { + compatible = "nvidia,tegra210-efuse"; + reg = <0x0 0x7000f800 0x0 0x400>; + clocks = <&tegra_car TEGRA210_CLK_FUSE>; + clock-names = "fuse"; + resets = <&tegra_car 39>; + reset-names = "fuse"; + }; + + mc: memory-controller@0,70019000 { + compatible = "nvidia,tegra210-mc"; + reg = <0x0 0x70019000 0x0 0x1000>; + clocks = <&tegra_car TEGRA210_CLK_MC>; + clock-names = "mc"; + + interrupts = <GIC_SPI 77 IRQ_TYPE_LEVEL_HIGH>; + + #iommu-cells = <1>; + }; + + hda@0,70030000 { + compatible = "nvidia,tegra210-hda", "nvidia,tegra30-hda"; + reg = <0x0 0x70030000 0x0 0x10000>; + interrupts = <GIC_SPI 81 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_HDA>, + <&tegra_car TEGRA210_CLK_HDA2HDMI>, + <&tegra_car TEGRA210_CLK_HDA2CODEC_2X>; + clock-names = "hda", "hda2hdmi", "hda2codec_2x"; + resets = <&tegra_car 125>, /* hda */ + <&tegra_car 128>, /* hda2hdmi */ + <&tegra_car 111>; /* hda2codec_2x */ + reset-names = "hda", "hda2hdmi", "hda2codec_2x"; + status = "disabled"; + }; + + sdhci@0,700b0000 { + compatible = "nvidia,tegra210-sdhci", "nvidia,tegra124-sdhci"; + reg = <0x0 0x700b0000 0x0 0x200>; + interrupts = <GIC_SPI 14 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_SDMMC1>; + clock-names = "sdhci"; + resets = <&tegra_car 14>; + reset-names = "sdhci"; + status = "disabled"; + }; + + sdhci@0,700b0200 { + compatible = "nvidia,tegra210-sdhci", "nvidia,tegra124-sdhci"; + reg = <0x0 0x700b0200 0x0 0x200>; + interrupts = <GIC_SPI 15 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_SDMMC2>; + clock-names = "sdhci"; + resets = <&tegra_car 9>; + reset-names = "sdhci"; + status = "disabled"; + }; + + sdhci@0,700b0400 { + compatible = "nvidia,tegra210-sdhci", "nvidia,tegra124-sdhci"; + reg = <0x0 0x700b0400 0x0 0x200>; + interrupts = <GIC_SPI 19 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_SDMMC3>; + clock-names = "sdhci"; + resets = <&tegra_car 69>; + reset-names = "sdhci"; + status = "disabled"; + }; + + sdhci@0,700b0600 { + compatible = "nvidia,tegra210-sdhci", "nvidia,tegra124-sdhci"; + reg = <0x0 0x700b0600 0x0 0x200>; + interrupts = <GIC_SPI 31 IRQ_TYPE_LEVEL_HIGH>; + clocks = <&tegra_car TEGRA210_CLK_SDMMC4>; + clock-names = "sdhci"; + resets = <&tegra_car 15>; + reset-names = "sdhci"; + status = "disabled"; + }; + + mipi: mipi@0,700e3000 { + compatible = "nvidia,tegra210-mipi"; + reg = <0x0 0x700e3000 0x0 0x100>; + clocks = <&tegra_car 
TEGRA210_CLK_MIPI_CAL>; + clock-names = "mipi-cal"; + #nvidia,mipi-calibrate-cells = <1>; + }; + + spi@0,70410000 { + compatible = "nvidia,tegra210-qspi"; + reg = <0x0 0x70410000 0x0 0x1000>; + interrupts = <GIC_SPI 10 IRQ_TYPE_LEVEL_HIGH>; + #address-cells = <1>; + #size-cells = <0>; + clocks = <&tegra_car TEGRA210_CLK_QSPI>; + clock-names = "qspi"; + resets = <&tegra_car 211>; + reset-names = "qspi"; + dmas = <&apbdma 5>, <&apbdma 5>; + dma-names = "rx", "tx"; + status = "disabled"; + }; + + usb@0,7d000000 { + compatible = "nvidia,tegra210-ehci", "nvidia,tegra30-ehci", "usb-ehci"; + reg = <0x0 0x7d000000 0x0 0x4000>; + interrupts = <GIC_SPI 20 IRQ_TYPE_LEVEL_HIGH>; + phy_type = "utmi"; + clocks = <&tegra_car TEGRA210_CLK_USBD>; + clock-names = "usb"; + resets = <&tegra_car 22>; + reset-names = "usb"; + nvidia,phy = <&phy1>; + status = "disabled"; + }; + + phy1: usb-phy@0,7d000000 { + compatible = "nvidia,tegra210-usb-phy", "nvidia,tegra30-usb-phy"; + reg = <0x0 0x7d000000 0x0 0x4000>, + <0x0 0x7d000000 0x0 0x4000>; + phy_type = "utmi"; + clocks = <&tegra_car TEGRA210_CLK_USBD>, + <&tegra_car TEGRA210_CLK_PLL_U>, + <&tegra_car TEGRA210_CLK_USBD>; + clock-names = "reg", "pll_u", "utmi-pads"; + resets = <&tegra_car 22>, <&tegra_car 22>; + reset-names = "usb", "utmi-pads"; + nvidia,hssync-start-delay = <0>; + nvidia,idle-wait-delay = <17>; + nvidia,elastic-limit = <16>; + nvidia,term-range-adj = <6>; + nvidia,xcvr-setup = <9>; + nvidia,xcvr-lsfslew = <0>; + nvidia,xcvr-lsrslew = <3>; + nvidia,hssquelch-level = <2>; + nvidia,hsdiscon-level = <5>; + nvidia,xcvr-hsslew = <12>; + nvidia,has-utmi-pad-registers; + status = "disabled"; + }; + + usb@0,7d004000 { + compatible = "nvidia,tegra210-ehci", "nvidia,tegra30-ehci", "usb-ehci"; + reg = <0x0 0x7d004000 0x0 0x4000>; + interrupts = <GIC_SPI 21 IRQ_TYPE_LEVEL_HIGH>; + phy_type = "utmi"; + clocks = <&tegra_car TEGRA210_CLK_USB2>; + clock-names = "usb"; + resets = <&tegra_car 58>; + reset-names = "usb"; + nvidia,phy = <&phy2>; + status = "disabled"; + }; + + phy2: usb-phy@0,7d004000 { + compatible = "nvidia,tegra210-usb-phy", "nvidia,tegra30-usb-phy"; + reg = <0x0 0x7d004000 0x0 0x4000>, + <0x0 0x7d000000 0x0 0x4000>; + phy_type = "utmi"; + clocks = <&tegra_car TEGRA210_CLK_USB2>, + <&tegra_car TEGRA210_CLK_PLL_U>, + <&tegra_car TEGRA210_CLK_USBD>; + clock-names = "reg", "pll_u", "utmi-pads"; + resets = <&tegra_car 58>, <&tegra_car 22>; + reset-names = "usb", "utmi-pads"; + nvidia,hssync-start-delay = <0>; + nvidia,idle-wait-delay = <17>; + nvidia,elastic-limit = <16>; + nvidia,term-range-adj = <6>; + nvidia,xcvr-setup = <9>; + nvidia,xcvr-lsfslew = <0>; + nvidia,xcvr-lsrslew = <3>; + nvidia,hssquelch-level = <2>; + nvidia,hsdiscon-level = <5>; + nvidia,xcvr-hsslew = <12>; + status = "disabled"; + }; + + cpus { + #address-cells = <1>; + #size-cells = <0>; + + cpu@0 { + device_type = "cpu"; + compatible = "arm,cortex-a57"; + reg = <0>; + }; + + cpu@1 { + device_type = "cpu"; + compatible = "arm,cortex-a57"; + reg = <1>; + }; + + cpu@2 { + device_type = "cpu"; + compatible = "arm,cortex-a57"; + reg = <2>; + }; + + cpu@3 { + device_type = "cpu"; + compatible = "arm,cortex-a57"; + reg = <3>; + }; + }; + + timer { + compatible = "arm,armv8-timer"; + interrupts = <GIC_PPI 13 + (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_LOW)>, + <GIC_PPI 14 + (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_LOW)>, + <GIC_PPI 11 + (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_LOW)>, + <GIC_PPI 10 + (GIC_CPU_MASK_SIMPLE(4) | IRQ_TYPE_LEVEL_LOW)>; + interrupt-parent = <&gic>; + }; +}; diff 
--git a/arch/ia64/include/asm/unistd.h b/arch/ia64/include/asm/unistd.h index 74c132d901bd..6a8685051b67 100644 --- a/arch/ia64/include/asm/unistd.h +++ b/arch/ia64/include/asm/unistd.h @@ -11,7 +11,7 @@ -#define NR_syscalls 323 /* length of syscall table */ +#define NR_syscalls 324 /* length of syscall table */ /* * The following defines stop scripts/checksyscalls.sh from complaining about diff --git a/arch/ia64/include/uapi/asm/unistd.h b/arch/ia64/include/uapi/asm/unistd.h index 762edce7572e..41369a103838 100644 --- a/arch/ia64/include/uapi/asm/unistd.h +++ b/arch/ia64/include/uapi/asm/unistd.h @@ -336,5 +336,6 @@ #define __NR_membarrier 1344 #define __NR_kcmp 1345 #define __NR_mlock2 1346 +#define __NR_copy_file_range 1347 #endif /* _UAPI_ASM_IA64_UNISTD_H */ diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index 534a74acb849..477c55e244ec 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -1772,5 +1772,6 @@ sys_call_table: data8 sys_membarrier data8 sys_kcmp // 1345 data8 sys_mlock2 + data8 sys_copy_file_range .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls diff --git a/arch/powerpc/platforms/cell/spufs/file.c b/arch/powerpc/platforms/cell/spufs/file.c index 5038fd578e65..2936a0044c04 100644 --- a/arch/powerpc/platforms/cell/spufs/file.c +++ b/arch/powerpc/platforms/cell/spufs/file.c @@ -1799,9 +1799,9 @@ static int spufs_mfc_fsync(struct file *file, loff_t start, loff_t end, int data struct inode *inode = file_inode(file); int err = filemap_write_and_wait_range(inode->i_mapping, start, end); if (!err) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = spufs_mfc_flush(file, NULL); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return err; } diff --git a/arch/powerpc/platforms/cell/spufs/inode.c b/arch/powerpc/platforms/cell/spufs/inode.c index ad4840f86be1..dfa863876778 100644 --- a/arch/powerpc/platforms/cell/spufs/inode.c +++ b/arch/powerpc/platforms/cell/spufs/inode.c @@ -163,7 +163,7 @@ static void spufs_prune_dir(struct dentry *dir) { struct dentry *dentry, *tmp; - mutex_lock(&d_inode(dir)->i_mutex); + inode_lock(d_inode(dir)); list_for_each_entry_safe(dentry, tmp, &dir->d_subdirs, d_child) { spin_lock(&dentry->d_lock); if (simple_positive(dentry)) { @@ -180,7 +180,7 @@ static void spufs_prune_dir(struct dentry *dir) } } shrink_dcache_parent(dir); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); } /* Caller must hold parent->i_mutex */ @@ -225,9 +225,9 @@ static int spufs_dir_close(struct inode *inode, struct file *file) parent = d_inode(dir->d_parent); ctx = SPUFS_I(d_inode(dir))->i_ctx; - mutex_lock_nested(&parent->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(parent, I_MUTEX_PARENT); ret = spufs_rmdir(parent, dir); - mutex_unlock(&parent->i_mutex); + inode_unlock(parent); WARN_ON(ret); return dcache_dir_close(inode, file); @@ -270,7 +270,7 @@ spufs_mkdir(struct inode *dir, struct dentry *dentry, unsigned int flags, inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; - mutex_lock(&inode->i_mutex); + inode_lock(inode); dget(dentry); inc_nlink(dir); @@ -291,7 +291,7 @@ spufs_mkdir(struct inode *dir, struct dentry *dentry, unsigned int flags, if (ret) spufs_rmdir(dir, dentry); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/arch/s390/hypfs/inode.c b/arch/s390/hypfs/inode.c index b2e5902bd8f4..0f3da2cb2bd6 100644 --- a/arch/s390/hypfs/inode.c +++ b/arch/s390/hypfs/inode.c @@ -67,7 +67,7 @@ static void 
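/*
 * The mutex_lock(&inode->i_mutex) -> inode_lock(inode) conversions running
 * through the spufs and hypfs hunks here depend on wrappers added to
 * include/linux/fs.h in the same series, presumably along these lines:
 *
 *   static inline void inode_lock(struct inode *inode)
 *   {
 *           mutex_lock(&inode->i_mutex);
 *   }
 *
 *   static inline void inode_unlock(struct inode *inode)
 *   {
 *           mutex_unlock(&inode->i_mutex);
 *   }
 *
 *   static inline void inode_lock_nested(struct inode *inode, unsigned subclass)
 *   {
 *           mutex_lock_nested(&inode->i_mutex, subclass);
 *   }
 *
 * Callers stop touching i_mutex directly, so the underlying lock can be
 * swapped out later without another tree-wide sweep.
 */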
hypfs_remove(struct dentry *dentry) struct dentry *parent; parent = dentry->d_parent; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); if (simple_positive(dentry)) { if (d_is_dir(dentry)) simple_rmdir(d_inode(parent), dentry); @@ -76,7 +76,7 @@ static void hypfs_remove(struct dentry *dentry) } d_delete(dentry); dput(dentry); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); } static void hypfs_delete_tree(struct dentry *root) @@ -331,7 +331,7 @@ static struct dentry *hypfs_create_file(struct dentry *parent, const char *name, struct dentry *dentry; struct inode *inode; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); dentry = lookup_one_len(name, parent, strlen(name)); if (IS_ERR(dentry)) { dentry = ERR_PTR(-ENOMEM); @@ -359,7 +359,7 @@ static struct dentry *hypfs_create_file(struct dentry *parent, const char *name, d_instantiate(dentry, inode); dget(dentry); fail: - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); return dentry; } diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h index 1544fabcd7f9..c57fd1ea9689 100644 --- a/arch/x86/include/asm/pmem.h +++ b/arch/x86/include/asm/pmem.h @@ -67,18 +67,19 @@ static inline void arch_wmb_pmem(void) } /** - * __arch_wb_cache_pmem - write back a cache range with CLWB + * arch_wb_cache_pmem - write back a cache range with CLWB * @vaddr: virtual start address * @size: number of bytes to write back * * Write back a cache range using the CLWB (cache line write back) * instruction. This function requires explicit ordering with an - * arch_wmb_pmem() call. This API is internal to the x86 PMEM implementation. + * arch_wmb_pmem() call. */ -static inline void __arch_wb_cache_pmem(void *vaddr, size_t size) +static inline void arch_wb_cache_pmem(void __pmem *addr, size_t size) { u16 x86_clflush_size = boot_cpu_data.x86_clflush_size; unsigned long clflush_mask = x86_clflush_size - 1; + void *vaddr = (void __force *)addr; void *vend = vaddr + size; void *p; @@ -115,7 +116,7 @@ static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes, len = copy_from_iter_nocache(vaddr, bytes, i); if (__iter_needs_pmem_wb(i)) - __arch_wb_cache_pmem(vaddr, bytes); + arch_wb_cache_pmem(addr, bytes); return len; } @@ -133,7 +134,7 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size) void *vaddr = (void __force *)addr; memset(vaddr, 0, size); - __arch_wb_cache_pmem(vaddr, size); + arch_wb_cache_pmem(addr, size); } static inline bool __arch_has_wmb_pmem(void) diff --git a/block/Makefile b/block/Makefile index db5f622c9d67..9eda2322b2d4 100644 --- a/block/Makefile +++ b/block/Makefile @@ -5,7 +5,7 @@ obj-$(CONFIG_BLOCK) := bio.o elevator.o blk-core.o blk-tag.o blk-sysfs.o \ blk-flush.o blk-settings.o blk-ioc.o blk-map.o \ blk-exec.o blk-merge.o blk-softirq.o blk-timeout.o \ - blk-iopoll.o blk-lib.o blk-mq.o blk-mq-tag.o \ + blk-lib.o blk-mq.o blk-mq-tag.o \ blk-mq-sysfs.o blk-mq-cpu.o blk-mq-cpumap.o ioctl.o \ genhd.o scsi_ioctl.o partition-generic.o ioprio.o \ badblocks.o partitions/ diff --git a/block/bio-integrity.c b/block/bio-integrity.c index f6325d573c10..711e4d8de6fa 100644 --- a/block/bio-integrity.c +++ b/block/bio-integrity.c @@ -66,7 +66,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, } if (unlikely(!bip)) - return NULL; + return ERR_PTR(-ENOMEM); memset(bip, 0, sizeof(*bip)); @@ -89,7 +89,7 @@ struct bio_integrity_payload *bio_integrity_alloc(struct bio *bio, return bip; err: mempool_free(bip, 
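/*
 * The bio_integrity_alloc() change in the surrounding hunks switches from
 * returning NULL to the kernel's ERR_PTR convention so callers can see why
 * the allocation failed. The helpers in include/linux/err.h are
 * essentially:
 *
 *   static inline void *ERR_PTR(long error)     { return (void *)error; }
 *   static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
 *   static inline bool IS_ERR(const void *ptr)
 *   {
 *           return IS_ERR_VALUE((unsigned long)ptr);
 *   }
 *
 * Errno values occupy the top page of the address space, so a single
 * pointer can carry either a real object or a -ENOMEM style error code.
 */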
bs->bio_integrity_pool); - return NULL; + return ERR_PTR(-ENOMEM); } EXPORT_SYMBOL(bio_integrity_alloc); @@ -298,10 +298,10 @@ int bio_integrity_prep(struct bio *bio) /* Allocate bio integrity payload and integrity vectors */ bip = bio_integrity_alloc(bio, GFP_NOIO, nr_pages); - if (unlikely(bip == NULL)) { + if (IS_ERR(bip)) { printk(KERN_ERR "could not allocate data integrity bioset\n"); kfree(buf); - return -EIO; + return PTR_ERR(bip); } bip->bip_flags |= BIP_BLOCK_INTEGRITY; @@ -465,9 +465,8 @@ int bio_integrity_clone(struct bio *bio, struct bio *bio_src, BUG_ON(bip_src == NULL); bip = bio_integrity_alloc(bio, gfp_mask, bip_src->bip_vcnt); - - if (bip == NULL) - return -EIO; + if (IS_ERR(bip)) + return PTR_ERR(bip); memcpy(bip->bip_vec, bip_src->bip_vec, bip_src->bip_vcnt * sizeof(struct bio_vec)); diff --git a/block/blk-core.c b/block/blk-core.c index 476244d59309..ab51685988c2 100644 --- a/block/blk-core.c +++ b/block/blk-core.c @@ -680,6 +680,13 @@ static void blk_queue_usage_counter_release(struct percpu_ref *ref) wake_up_all(&q->mq_freeze_wq); } +static void blk_rq_timed_out_timer(unsigned long data) +{ + struct request_queue *q = (struct request_queue *)data; + + kblockd_schedule_work(&q->timeout_work); +} + struct request_queue *blk_alloc_queue_node(gfp_t gfp_mask, int node_id) { struct request_queue *q; @@ -841,6 +848,7 @@ blk_init_allocated_queue(struct request_queue *q, request_fn_proc *rfn, if (blk_init_rl(&q->root_rl, q, GFP_KERNEL)) goto fail; + INIT_WORK(&q->timeout_work, blk_timeout_work); q->request_fn = rfn; q->prep_rq_fn = NULL; q->unprep_rq_fn = NULL; diff --git a/block/blk-mq.c b/block/blk-mq.c index 6889d7183a2a..4c0622fae413 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -603,8 +603,6 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, blk_mq_complete_request(rq, -EIO); return; } - if (rq->cmd_flags & REQ_NO_TIMEOUT) - return; if (time_after_eq(jiffies, rq->deadline)) { if (!blk_mark_rq_complete(rq)) @@ -615,15 +613,19 @@ static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, } } -static void blk_mq_rq_timer(unsigned long priv) +static void blk_mq_timeout_work(struct work_struct *work) { - struct request_queue *q = (struct request_queue *)priv; + struct request_queue *q = + container_of(work, struct request_queue, timeout_work); struct blk_mq_timeout_data data = { .next = 0, .next_set = 0, }; int i; + if (blk_queue_enter(q, true)) + return; + blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); if (data.next_set) { @@ -638,6 +640,7 @@ static void blk_mq_rq_timer(unsigned long priv) blk_mq_tag_idle(hctx); } } + blk_queue_exit(q); } /* @@ -2008,7 +2011,7 @@ struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, hctxs[i]->queue_num = i; } - setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); + INIT_WORK(&q->timeout_work, blk_mq_timeout_work); blk_queue_rq_timeout(q, set->timeout ? 
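/*
 * The blk-mq and blk-timeout hunks around this point convert request
 * timeout handling from a raw timer callback into a work item. The timer
 * itself now only kicks the work:
 *
 *   static void blk_rq_timed_out_timer(unsigned long data)
 *   {
 *           struct request_queue *q = (struct request_queue *)data;
 *
 *           kblockd_schedule_work(&q->timeout_work);
 *   }
 *
 * (as added in blk-core.c above), and the actual scan runs in process
 * context, where it can pair blk_queue_enter(q, true) with blk_queue_exit(q)
 * and bail out cleanly while the queue is frozen or being torn down.
 */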
set->timeout : 30 * HZ); q->nr_queues = nr_cpu_ids; diff --git a/block/blk-timeout.c b/block/blk-timeout.c index 3610af561748..a30441a200c0 100644 --- a/block/blk-timeout.c +++ b/block/blk-timeout.c @@ -127,13 +127,16 @@ static void blk_rq_check_expired(struct request *rq, unsigned long *next_timeout } } -void blk_rq_timed_out_timer(unsigned long data) +void blk_timeout_work(struct work_struct *work) { - struct request_queue *q = (struct request_queue *) data; + struct request_queue *q = + container_of(work, struct request_queue, timeout_work); unsigned long flags, next = 0; struct request *rq, *tmp; int next_set = 0; + if (blk_queue_enter(q, true)) + return; spin_lock_irqsave(q->queue_lock, flags); list_for_each_entry_safe(rq, tmp, &q->timeout_list, timeout_list) @@ -143,6 +146,7 @@ void blk_rq_timed_out_timer(unsigned long data) mod_timer(&q->timeout, round_jiffies_up(next)); spin_unlock_irqrestore(q->queue_lock, flags); + blk_queue_exit(q); } /** @@ -193,9 +197,6 @@ void blk_add_timer(struct request *req) struct request_queue *q = req->q; unsigned long expiry; - if (req->cmd_flags & REQ_NO_TIMEOUT) - return; - /* blk-mq has its own handler, so we don't need ->rq_timed_out_fn */ if (!q->mq_ops && !q->rq_timed_out_fn) return; diff --git a/block/blk.h b/block/blk.h index c43926d3d74d..70e4aee9cdcb 100644 --- a/block/blk.h +++ b/block/blk.h @@ -93,7 +93,7 @@ static inline void blk_flush_integrity(void) } #endif -void blk_rq_timed_out_timer(unsigned long data); +void blk_timeout_work(struct work_struct *work); unsigned long blk_rq_timeout(unsigned long timeout); void blk_add_timer(struct request *req); void blk_delete_timer(struct request *); diff --git a/block/ioctl.c b/block/ioctl.c index 2c84683aada5..77f5d17779d6 100644 --- a/block/ioctl.c +++ b/block/ioctl.c @@ -455,12 +455,12 @@ static int blkdev_daxset(struct block_device *bdev, unsigned long argp) if (arg && !blkdev_dax_capable(bdev)) return -ENOTTY; - mutex_lock(&bdev->bd_inode->i_mutex); + inode_lock(bdev->bd_inode); if (bdev->bd_map_count == 0) inode_set_flags(bdev->bd_inode, arg, S_DAX); else rc = -EBUSY; - mutex_unlock(&bdev->bd_inode->i_mutex); + inode_unlock(bdev->bd_inode); return rc; } #else diff --git a/crypto/af_alg.c b/crypto/af_alg.c index a8e7aa3e257b..f5e18c2a4852 100644 --- a/crypto/af_alg.c +++ b/crypto/af_alg.c @@ -76,6 +76,8 @@ int af_alg_register_type(const struct af_alg_type *type) goto unlock; type->ops->owner = THIS_MODULE; + if (type->ops_nokey) + type->ops_nokey->owner = THIS_MODULE; node->type = type; list_add(&node->list, &alg_types); err = 0; @@ -125,6 +127,26 @@ int af_alg_release(struct socket *sock) } EXPORT_SYMBOL_GPL(af_alg_release); +void af_alg_release_parent(struct sock *sk) +{ + struct alg_sock *ask = alg_sk(sk); + unsigned int nokey = ask->nokey_refcnt; + bool last = nokey && !ask->refcnt; + + sk = ask->parent; + ask = alg_sk(sk); + + lock_sock(sk); + ask->nokey_refcnt -= nokey; + if (!last) + last = !--ask->refcnt; + release_sock(sk); + + if (last) + sock_put(sk); +} +EXPORT_SYMBOL_GPL(af_alg_release_parent); + static int alg_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) { const u32 forbidden = CRYPTO_ALG_INTERNAL; @@ -133,6 +155,7 @@ static int alg_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct sockaddr_alg *sa = (void *)uaddr; const struct af_alg_type *type; void *private; + int err; if (sock->state == SS_CONNECTED) return -EINVAL; @@ -160,16 +183,22 @@ static int alg_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) return 
PTR_ERR(private); } + err = -EBUSY; lock_sock(sk); + if (ask->refcnt | ask->nokey_refcnt) + goto unlock; swap(ask->type, type); swap(ask->private, private); + err = 0; + +unlock: release_sock(sk); alg_do_release(type, private); - return 0; + return err; } static int alg_setkey(struct sock *sk, char __user *ukey, @@ -202,11 +231,15 @@ static int alg_setsockopt(struct socket *sock, int level, int optname, struct sock *sk = sock->sk; struct alg_sock *ask = alg_sk(sk); const struct af_alg_type *type; - int err = -ENOPROTOOPT; + int err = -EBUSY; lock_sock(sk); + if (ask->refcnt) + goto unlock; + type = ask->type; + err = -ENOPROTOOPT; if (level != SOL_ALG || !type) goto unlock; @@ -238,6 +271,7 @@ int af_alg_accept(struct sock *sk, struct socket *newsock) struct alg_sock *ask = alg_sk(sk); const struct af_alg_type *type; struct sock *sk2; + unsigned int nokey; int err; lock_sock(sk); @@ -257,20 +291,29 @@ int af_alg_accept(struct sock *sk, struct socket *newsock) security_sk_clone(sk, sk2); err = type->accept(ask->private, sk2); - if (err) { - sk_free(sk2); + + nokey = err == -ENOKEY; + if (nokey && type->accept_nokey) + err = type->accept_nokey(ask->private, sk2); + + if (err) goto unlock; - } sk2->sk_family = PF_ALG; - sock_hold(sk); + if (nokey || !ask->refcnt++) + sock_hold(sk); + ask->nokey_refcnt += nokey; alg_sk(sk2)->parent = sk; alg_sk(sk2)->type = type; + alg_sk(sk2)->nokey_refcnt = nokey; newsock->ops = type->ops; newsock->state = SS_CONNECTED; + if (nokey) + newsock->ops = type->ops_nokey; + err = 0; unlock: diff --git a/crypto/ahash.c b/crypto/ahash.c index 9c1dc8d6106a..d19b52324cf5 100644 --- a/crypto/ahash.c +++ b/crypto/ahash.c @@ -451,6 +451,7 @@ static int crypto_ahash_init_tfm(struct crypto_tfm *tfm) struct ahash_alg *alg = crypto_ahash_alg(hash); hash->setkey = ahash_nosetkey; + hash->has_setkey = false; hash->export = ahash_no_export; hash->import = ahash_no_import; @@ -463,8 +464,10 @@ static int crypto_ahash_init_tfm(struct crypto_tfm *tfm) hash->finup = alg->finup ?: ahash_def_finup; hash->digest = alg->digest; - if (alg->setkey) + if (alg->setkey) { hash->setkey = alg->setkey; + hash->has_setkey = true; + } if (alg->export) hash->export = alg->export; if (alg->import) diff --git a/crypto/algif_hash.c b/crypto/algif_hash.c index b4c24fe3dcfb..608a7562839d 100644 --- a/crypto/algif_hash.c +++ b/crypto/algif_hash.c @@ -34,6 +34,11 @@ struct hash_ctx { struct ahash_request req; }; +struct algif_hash_tfm { + struct crypto_ahash *hash; + bool has_key; +}; + static int hash_sendmsg(struct socket *sock, struct msghdr *msg, size_t ignored) { @@ -235,19 +240,151 @@ static struct proto_ops algif_hash_ops = { .accept = hash_accept, }; +static int hash_check_key(struct socket *sock) +{ + int err = 0; + struct sock *psk; + struct alg_sock *pask; + struct algif_hash_tfm *tfm; + struct sock *sk = sock->sk; + struct alg_sock *ask = alg_sk(sk); + + lock_sock(sk); + if (ask->refcnt) + goto unlock_child; + + psk = ask->parent; + pask = alg_sk(ask->parent); + tfm = pask->private; + + err = -ENOKEY; + lock_sock_nested(psk, SINGLE_DEPTH_NESTING); + if (!tfm->has_key) + goto unlock; + + if (!pask->refcnt++) + sock_hold(psk); + + ask->refcnt = 1; + sock_put(psk); + + err = 0; + +unlock: + release_sock(psk); +unlock_child: + release_sock(sk); + + return err; +} + +static int hash_sendmsg_nokey(struct socket *sock, struct msghdr *msg, + size_t size) +{ + int err; + + err = hash_check_key(sock); + if (err) + return err; + + return hash_sendmsg(sock, msg, size); +} + +static ssize_t 
hash_sendpage_nokey(struct socket *sock, struct page *page, + int offset, size_t size, int flags) +{ + int err; + + err = hash_check_key(sock); + if (err) + return err; + + return hash_sendpage(sock, page, offset, size, flags); +} + +static int hash_recvmsg_nokey(struct socket *sock, struct msghdr *msg, + size_t ignored, int flags) +{ + int err; + + err = hash_check_key(sock); + if (err) + return err; + + return hash_recvmsg(sock, msg, ignored, flags); +} + +static int hash_accept_nokey(struct socket *sock, struct socket *newsock, + int flags) +{ + int err; + + err = hash_check_key(sock); + if (err) + return err; + + return hash_accept(sock, newsock, flags); +} + +static struct proto_ops algif_hash_ops_nokey = { + .family = PF_ALG, + + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .getname = sock_no_getname, + .ioctl = sock_no_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .getsockopt = sock_no_getsockopt, + .mmap = sock_no_mmap, + .bind = sock_no_bind, + .setsockopt = sock_no_setsockopt, + .poll = sock_no_poll, + + .release = af_alg_release, + .sendmsg = hash_sendmsg_nokey, + .sendpage = hash_sendpage_nokey, + .recvmsg = hash_recvmsg_nokey, + .accept = hash_accept_nokey, +}; + static void *hash_bind(const char *name, u32 type, u32 mask) { - return crypto_alloc_ahash(name, type, mask); + struct algif_hash_tfm *tfm; + struct crypto_ahash *hash; + + tfm = kzalloc(sizeof(*tfm), GFP_KERNEL); + if (!tfm) + return ERR_PTR(-ENOMEM); + + hash = crypto_alloc_ahash(name, type, mask); + if (IS_ERR(hash)) { + kfree(tfm); + return ERR_CAST(hash); + } + + tfm->hash = hash; + + return tfm; } static void hash_release(void *private) { - crypto_free_ahash(private); + struct algif_hash_tfm *tfm = private; + + crypto_free_ahash(tfm->hash); + kfree(tfm); } static int hash_setkey(void *private, const u8 *key, unsigned int keylen) { - return crypto_ahash_setkey(private, key, keylen); + struct algif_hash_tfm *tfm = private; + int err; + + err = crypto_ahash_setkey(tfm->hash, key, keylen); + tfm->has_key = !err; + + return err; } static void hash_sock_destruct(struct sock *sk) @@ -261,12 +398,14 @@ static void hash_sock_destruct(struct sock *sk) af_alg_release_parent(sk); } -static int hash_accept_parent(void *private, struct sock *sk) +static int hash_accept_parent_nokey(void *private, struct sock *sk) { struct hash_ctx *ctx; struct alg_sock *ask = alg_sk(sk); - unsigned len = sizeof(*ctx) + crypto_ahash_reqsize(private); - unsigned ds = crypto_ahash_digestsize(private); + struct algif_hash_tfm *tfm = private; + struct crypto_ahash *hash = tfm->hash; + unsigned len = sizeof(*ctx) + crypto_ahash_reqsize(hash); + unsigned ds = crypto_ahash_digestsize(hash); ctx = sock_kmalloc(sk, len, GFP_KERNEL); if (!ctx) @@ -286,7 +425,7 @@ static int hash_accept_parent(void *private, struct sock *sk) ask->private = ctx; - ahash_request_set_tfm(&ctx->req, private); + ahash_request_set_tfm(&ctx->req, hash); ahash_request_set_callback(&ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG, af_alg_complete, &ctx->completion); @@ -295,12 +434,24 @@ static int hash_accept_parent(void *private, struct sock *sk) return 0; } +static int hash_accept_parent(void *private, struct sock *sk) +{ + struct algif_hash_tfm *tfm = private; + + if (!tfm->has_key && crypto_ahash_has_setkey(tfm->hash)) + return -ENOKEY; + + return hash_accept_parent_nokey(private, sk); +} + static const struct af_alg_type algif_type_hash = { .bind = hash_bind, .release = hash_release, .setkey = hash_setkey, .accept = hash_accept_parent, + 
.accept_nokey = hash_accept_parent_nokey, .ops = &algif_hash_ops, + .ops_nokey = &algif_hash_ops_nokey, .name = "hash", .owner = THIS_MODULE }; diff --git a/crypto/algif_skcipher.c b/crypto/algif_skcipher.c index eaa9f9be5b87..38c1aa89d3a0 100644 --- a/crypto/algif_skcipher.c +++ b/crypto/algif_skcipher.c @@ -31,6 +31,11 @@ struct skcipher_sg_list { struct scatterlist sg[0]; }; +struct skcipher_tfm { + struct crypto_skcipher *skcipher; + bool has_key; +}; + struct skcipher_ctx { struct list_head tsgl; struct af_alg_sgl rsgl; @@ -387,7 +392,8 @@ static int skcipher_sendmsg(struct socket *sock, struct msghdr *msg, sgl = list_entry(ctx->tsgl.prev, struct skcipher_sg_list, list); sg = sgl->sg; - sg_unmark_end(sg + sgl->cur); + if (sgl->cur) + sg_unmark_end(sg + sgl->cur - 1); do { i = sgl->cur; plen = min_t(size_t, len, PAGE_SIZE); @@ -642,13 +648,6 @@ static int skcipher_recvmsg_sync(struct socket *sock, struct msghdr *msg, lock_sock(sk); while (msg_data_left(msg)) { - sgl = list_first_entry(&ctx->tsgl, - struct skcipher_sg_list, list); - sg = sgl->sg; - - while (!sg->length) - sg++; - if (!ctx->used) { err = skcipher_wait_for_data(sk, flags); if (err) @@ -669,6 +668,13 @@ static int skcipher_recvmsg_sync(struct socket *sock, struct msghdr *msg, if (!used) goto free; + sgl = list_first_entry(&ctx->tsgl, + struct skcipher_sg_list, list); + sg = sgl->sg; + + while (!sg->length) + sg++; + skcipher_request_set_crypt(&ctx->req, sg, ctx->rsgl.sg, used, ctx->iv); @@ -748,19 +754,139 @@ static struct proto_ops algif_skcipher_ops = { .poll = skcipher_poll, }; +static int skcipher_check_key(struct socket *sock) +{ + int err = 0; + struct sock *psk; + struct alg_sock *pask; + struct skcipher_tfm *tfm; + struct sock *sk = sock->sk; + struct alg_sock *ask = alg_sk(sk); + + lock_sock(sk); + if (ask->refcnt) + goto unlock_child; + + psk = ask->parent; + pask = alg_sk(ask->parent); + tfm = pask->private; + + err = -ENOKEY; + lock_sock_nested(psk, SINGLE_DEPTH_NESTING); + if (!tfm->has_key) + goto unlock; + + if (!pask->refcnt++) + sock_hold(psk); + + ask->refcnt = 1; + sock_put(psk); + + err = 0; + +unlock: + release_sock(psk); +unlock_child: + release_sock(sk); + + return err; +} + +static int skcipher_sendmsg_nokey(struct socket *sock, struct msghdr *msg, + size_t size) +{ + int err; + + err = skcipher_check_key(sock); + if (err) + return err; + + return skcipher_sendmsg(sock, msg, size); +} + +static ssize_t skcipher_sendpage_nokey(struct socket *sock, struct page *page, + int offset, size_t size, int flags) +{ + int err; + + err = skcipher_check_key(sock); + if (err) + return err; + + return skcipher_sendpage(sock, page, offset, size, flags); +} + +static int skcipher_recvmsg_nokey(struct socket *sock, struct msghdr *msg, + size_t ignored, int flags) +{ + int err; + + err = skcipher_check_key(sock); + if (err) + return err; + + return skcipher_recvmsg(sock, msg, ignored, flags); +} + +static struct proto_ops algif_skcipher_ops_nokey = { + .family = PF_ALG, + + .connect = sock_no_connect, + .socketpair = sock_no_socketpair, + .getname = sock_no_getname, + .ioctl = sock_no_ioctl, + .listen = sock_no_listen, + .shutdown = sock_no_shutdown, + .getsockopt = sock_no_getsockopt, + .mmap = sock_no_mmap, + .bind = sock_no_bind, + .accept = sock_no_accept, + .setsockopt = sock_no_setsockopt, + + .release = af_alg_release, + .sendmsg = skcipher_sendmsg_nokey, + .sendpage = skcipher_sendpage_nokey, + .recvmsg = skcipher_recvmsg_nokey, + .poll = skcipher_poll, +}; + static void *skcipher_bind(const char *name, u32 
type, u32 mask) { - return crypto_alloc_skcipher(name, type, mask); + struct skcipher_tfm *tfm; + struct crypto_skcipher *skcipher; + + tfm = kzalloc(sizeof(*tfm), GFP_KERNEL); + if (!tfm) + return ERR_PTR(-ENOMEM); + + skcipher = crypto_alloc_skcipher(name, type, mask); + if (IS_ERR(skcipher)) { + kfree(tfm); + return ERR_CAST(skcipher); + } + + tfm->skcipher = skcipher; + + return tfm; } static void skcipher_release(void *private) { - crypto_free_skcipher(private); + struct skcipher_tfm *tfm = private; + + crypto_free_skcipher(tfm->skcipher); + kfree(tfm); } static int skcipher_setkey(void *private, const u8 *key, unsigned int keylen) { - return crypto_skcipher_setkey(private, key, keylen); + struct skcipher_tfm *tfm = private; + int err; + + err = crypto_skcipher_setkey(tfm->skcipher, key, keylen); + tfm->has_key = !err; + + return err; } static void skcipher_wait(struct sock *sk) @@ -788,24 +914,26 @@ static void skcipher_sock_destruct(struct sock *sk) af_alg_release_parent(sk); } -static int skcipher_accept_parent(void *private, struct sock *sk) +static int skcipher_accept_parent_nokey(void *private, struct sock *sk) { struct skcipher_ctx *ctx; struct alg_sock *ask = alg_sk(sk); - unsigned int len = sizeof(*ctx) + crypto_skcipher_reqsize(private); + struct skcipher_tfm *tfm = private; + struct crypto_skcipher *skcipher = tfm->skcipher; + unsigned int len = sizeof(*ctx) + crypto_skcipher_reqsize(skcipher); ctx = sock_kmalloc(sk, len, GFP_KERNEL); if (!ctx) return -ENOMEM; - ctx->iv = sock_kmalloc(sk, crypto_skcipher_ivsize(private), + ctx->iv = sock_kmalloc(sk, crypto_skcipher_ivsize(skcipher), GFP_KERNEL); if (!ctx->iv) { sock_kfree_s(sk, ctx, len); return -ENOMEM; } - memset(ctx->iv, 0, crypto_skcipher_ivsize(private)); + memset(ctx->iv, 0, crypto_skcipher_ivsize(skcipher)); INIT_LIST_HEAD(&ctx->tsgl); ctx->len = len; @@ -818,7 +946,7 @@ static int skcipher_accept_parent(void *private, struct sock *sk) ask->private = ctx; - skcipher_request_set_tfm(&ctx->req, private); + skcipher_request_set_tfm(&ctx->req, skcipher); skcipher_request_set_callback(&ctx->req, CRYPTO_TFM_REQ_MAY_BACKLOG, af_alg_complete, &ctx->completion); @@ -827,12 +955,24 @@ static int skcipher_accept_parent(void *private, struct sock *sk) return 0; } +static int skcipher_accept_parent(void *private, struct sock *sk) +{ + struct skcipher_tfm *tfm = private; + + if (!tfm->has_key && crypto_skcipher_has_setkey(tfm->skcipher)) + return -ENOKEY; + + return skcipher_accept_parent_nokey(private, sk); +} + static const struct af_alg_type algif_type_skcipher = { .bind = skcipher_bind, .release = skcipher_release, .setkey = skcipher_setkey, .accept = skcipher_accept_parent, + .accept_nokey = skcipher_accept_parent_nokey, .ops = &algif_skcipher_ops, + .ops_nokey = &algif_skcipher_ops_nokey, .name = "skcipher", .owner = THIS_MODULE }; diff --git a/crypto/crc32c_generic.c b/crypto/crc32c_generic.c index 06f1b60f02b2..4c0a0e271876 100644 --- a/crypto/crc32c_generic.c +++ b/crypto/crc32c_generic.c @@ -172,4 +172,3 @@ MODULE_DESCRIPTION("CRC32c (Castagnoli) calculations wrapper for lib/crc32c"); MODULE_LICENSE("GPL"); MODULE_ALIAS_CRYPTO("crc32c"); MODULE_ALIAS_CRYPTO("crc32c-generic"); -MODULE_SOFTDEP("pre: crc32c"); diff --git a/crypto/shash.c b/crypto/shash.c index ecb1e3d39bf0..88a27de79848 100644 --- a/crypto/shash.c +++ b/crypto/shash.c @@ -355,8 +355,10 @@ int crypto_init_shash_ops_async(struct crypto_tfm *tfm) crt->finup = shash_async_finup; crt->digest = shash_async_digest; - if (alg->setkey) + if (alg->setkey) { 
crt->setkey = shash_async_setkey; + crt->has_setkey = true; + } if (alg->export) crt->export = shash_async_export; if (alg->import) diff --git a/crypto/skcipher.c b/crypto/skcipher.c index 7591928be7ca..d199c0b1751c 100644 --- a/crypto/skcipher.c +++ b/crypto/skcipher.c @@ -118,6 +118,7 @@ static int crypto_init_skcipher_ops_blkcipher(struct crypto_tfm *tfm) skcipher->decrypt = skcipher_decrypt_blkcipher; skcipher->ivsize = crypto_blkcipher_ivsize(blkcipher); + skcipher->has_setkey = calg->cra_blkcipher.max_keysize; return 0; } @@ -210,6 +211,7 @@ static int crypto_init_skcipher_ops_ablkcipher(struct crypto_tfm *tfm) skcipher->ivsize = crypto_ablkcipher_ivsize(ablkcipher); skcipher->reqsize = crypto_ablkcipher_reqsize(ablkcipher) + sizeof(struct ablkcipher_request); + skcipher->has_setkey = calg->cra_ablkcipher.max_keysize; return 0; } diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c index 6682c5daf742..6e6bc1059301 100644 --- a/drivers/acpi/apei/erst.c +++ b/drivers/acpi/apei/erst.c @@ -32,6 +32,7 @@ #include <linux/hardirq.h> #include <linux/pstore.h> #include <linux/vmalloc.h> +#include <linux/mm.h> /* kvfree() */ #include <acpi/apei.h> #include "apei-internal.h" @@ -532,10 +533,7 @@ retry: return -ENOMEM; memcpy(new_entries, entries, erst_record_id_cache.len * sizeof(entries[0])); - if (erst_record_id_cache.size < PAGE_SIZE) - kfree(entries); - else - vfree(entries); + kvfree(entries); erst_record_id_cache.entries = entries = new_entries; erst_record_id_cache.size = new_size; } diff --git a/drivers/amba/Kconfig b/drivers/amba/Kconfig index 4a5c9d279059..294ba6f36396 100644 --- a/drivers/amba/Kconfig +++ b/drivers/amba/Kconfig @@ -4,7 +4,7 @@ config ARM_AMBA if ARM_AMBA config TEGRA_AHB - bool "Enable AHB driver for NVIDIA Tegra SoCs" + bool default y if ARCH_TEGRA help Adds AHB configuration functionality for NVIDIA Tegra SoCs, diff --git a/drivers/base/devtmpfs.c b/drivers/base/devtmpfs.c index 68f03141e432..44a74cf1372c 100644 --- a/drivers/base/devtmpfs.c +++ b/drivers/base/devtmpfs.c @@ -215,9 +215,9 @@ static int handle_create(const char *nodename, umode_t mode, kuid_t uid, newattrs.ia_uid = uid; newattrs.ia_gid = gid; newattrs.ia_valid = ATTR_MODE|ATTR_UID|ATTR_GID; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); notify_change(dentry, &newattrs, NULL); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); /* mark as kernel-created inode */ d_inode(dentry)->i_private = &thread; @@ -244,7 +244,7 @@ static int dev_rmdir(const char *name) err = -ENOENT; } dput(dentry); - mutex_unlock(&d_inode(parent.dentry)->i_mutex); + inode_unlock(d_inode(parent.dentry)); path_put(&parent); return err; } @@ -321,9 +321,9 @@ static int handle_remove(const char *nodename, struct device *dev) newattrs.ia_mode = stat.mode & ~0777; newattrs.ia_valid = ATTR_UID|ATTR_GID|ATTR_MODE; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); notify_change(dentry, &newattrs, NULL); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); err = vfs_unlink(d_inode(parent.dentry), dentry, NULL); if (!err || err == -ENOENT) deleted = 1; @@ -332,7 +332,7 @@ static int handle_remove(const char *nodename, struct device *dev) err = -ENOENT; } dput(dentry); - mutex_unlock(&d_inode(parent.dentry)->i_mutex); + inode_unlock(d_inode(parent.dentry)); path_put(&parent); if (deleted && strchr(nodename, '/')) diff --git a/drivers/block/aoe/aoecmd.c b/drivers/block/aoe/aoecmd.c index ad80c85e0857..d048d2009e89 100644 --- 
a/drivers/block/aoe/aoecmd.c +++ b/drivers/block/aoe/aoecmd.c @@ -964,9 +964,9 @@ aoecmd_sleepwork(struct work_struct *work) ssize = get_capacity(d->gd); bd = bdget_disk(d->gd, 0); if (bd) { - mutex_lock(&bd->bd_inode->i_mutex); + inode_lock(bd->bd_inode); i_size_write(bd->bd_inode, (loff_t)ssize<<9); - mutex_unlock(&bd->bd_inode->i_mutex); + inode_unlock(bd->bd_inode); bdput(bd); } spin_lock_irq(&d->lock); diff --git a/drivers/block/drbd/drbd_actlog.c b/drivers/block/drbd/drbd_actlog.c index b3868e7a1ffd..10459a145062 100644 --- a/drivers/block/drbd/drbd_actlog.c +++ b/drivers/block/drbd/drbd_actlog.c @@ -288,7 +288,162 @@ bool drbd_al_begin_io_prepare(struct drbd_device *device, struct drbd_interval * return need_transaction; } -static int al_write_transaction(struct drbd_device *device); +#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT) +/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT + * are still coupled, or assume too much about their relation. + * Code below will not work if this is violated. + * Will be cleaned up with some followup patch. + */ +# error FIXME +#endif + +static unsigned int al_extent_to_bm_page(unsigned int al_enr) +{ + return al_enr >> + /* bit to page */ + ((PAGE_SHIFT + 3) - + /* al extent number to bit */ + (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)); +} + +static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device) +{ + const unsigned int stripes = device->ldev->md.al_stripes; + const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k; + + /* transaction number, modulo on-disk ring buffer wrap around */ + unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k); + + /* ... to aligned 4k on disk block */ + t = ((t % stripes) * stripe_size_4kB) + t/stripes; + + /* ... to 512 byte sector in activity log */ + t *= 8; + + /* ... plus offset to the on disk position */ + return device->ldev->md.md_offset + device->ldev->md.al_offset + t; +} + +static int __al_write_transaction(struct drbd_device *device, struct al_transaction_on_disk *buffer) +{ + struct lc_element *e; + sector_t sector; + int i, mx; + unsigned extent_nr; + unsigned crc = 0; + int err = 0; + + memset(buffer, 0, sizeof(*buffer)); + buffer->magic = cpu_to_be32(DRBD_AL_MAGIC); + buffer->tr_number = cpu_to_be32(device->al_tr_number); + + i = 0; + + /* Even though no one can start to change this list + * once we set the LC_LOCKED -- from drbd_al_begin_io(), + * lc_try_lock_for_transaction() --, someone may still + * be in the process of changing it. 
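+ * That is also why the list walk below runs under al_lock: it takes a
+ * self-consistent snapshot of the update slots before the buffer is
+ * CRC'd and written out.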
*/ + spin_lock_irq(&device->al_lock); + list_for_each_entry(e, &device->act_log->to_be_changed, list) { + if (i == AL_UPDATES_PER_TRANSACTION) { + i++; + break; + } + buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index); + buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number); + if (e->lc_number != LC_FREE) + drbd_bm_mark_for_writeout(device, + al_extent_to_bm_page(e->lc_number)); + i++; + } + spin_unlock_irq(&device->al_lock); + BUG_ON(i > AL_UPDATES_PER_TRANSACTION); + + buffer->n_updates = cpu_to_be16(i); + for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) { + buffer->update_slot_nr[i] = cpu_to_be16(-1); + buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE); + } + + buffer->context_size = cpu_to_be16(device->act_log->nr_elements); + buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle); + + mx = min_t(int, AL_CONTEXT_PER_TRANSACTION, + device->act_log->nr_elements - device->al_tr_cycle); + for (i = 0; i < mx; i++) { + unsigned idx = device->al_tr_cycle + i; + extent_nr = lc_element_by_index(device->act_log, idx)->lc_number; + buffer->context[i] = cpu_to_be32(extent_nr); + } + for (; i < AL_CONTEXT_PER_TRANSACTION; i++) + buffer->context[i] = cpu_to_be32(LC_FREE); + + device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION; + if (device->al_tr_cycle >= device->act_log->nr_elements) + device->al_tr_cycle = 0; + + sector = al_tr_number_to_on_disk_sector(device); + + crc = crc32c(0, buffer, 4096); + buffer->crc32c = cpu_to_be32(crc); + + if (drbd_bm_write_hinted(device)) + err = -EIO; + else { + bool write_al_updates; + rcu_read_lock(); + write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates; + rcu_read_unlock(); + if (write_al_updates) { + if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) { + err = -EIO; + drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR); + } else { + device->al_tr_number++; + device->al_writ_cnt++; + } + } + } + + return err; +} + +static int al_write_transaction(struct drbd_device *device) +{ + struct al_transaction_on_disk *buffer; + int err; + + if (!get_ldev(device)) { + drbd_err(device, "disk is %s, cannot start al transaction\n", + drbd_disk_str(device->state.disk)); + return -EIO; + } + + /* The bitmap write may have failed, causing a state change. */ + if (device->state.disk < D_INCONSISTENT) { + drbd_err(device, + "disk is %s, cannot write al transaction\n", + drbd_disk_str(device->state.disk)); + put_ldev(device); + return -EIO; + } + + /* protects md_io_buffer, al_tr_cycle, ... */ + buffer = drbd_md_get_buffer(device, __func__); + if (!buffer) { + drbd_err(device, "disk failed while waiting for md_io buffer\n"); + put_ldev(device); + return -ENODEV; + } + + err = __al_write_transaction(device, buffer); + + drbd_md_put_buffer(device); + put_ldev(device); + + return err; +} + void drbd_al_begin_io_commit(struct drbd_device *device) { @@ -420,153 +575,6 @@ void drbd_al_complete_io(struct drbd_device *device, struct drbd_interval *i) wake_up(&device->al_wait); } -#if (PAGE_SHIFT + 3) < (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT) -/* Currently BM_BLOCK_SHIFT, BM_EXT_SHIFT and AL_EXTENT_SHIFT - * are still coupled, or assume too much about their relation. - * Code below will not work if this is violated. - * Will be cleaned up with some followup patch. 
- */ -# error FIXME -#endif - -static unsigned int al_extent_to_bm_page(unsigned int al_enr) -{ - return al_enr >> - /* bit to page */ - ((PAGE_SHIFT + 3) - - /* al extent number to bit */ - (AL_EXTENT_SHIFT - BM_BLOCK_SHIFT)); -} - -static sector_t al_tr_number_to_on_disk_sector(struct drbd_device *device) -{ - const unsigned int stripes = device->ldev->md.al_stripes; - const unsigned int stripe_size_4kB = device->ldev->md.al_stripe_size_4k; - - /* transaction number, modulo on-disk ring buffer wrap around */ - unsigned int t = device->al_tr_number % (device->ldev->md.al_size_4k); - - /* ... to aligned 4k on disk block */ - t = ((t % stripes) * stripe_size_4kB) + t/stripes; - - /* ... to 512 byte sector in activity log */ - t *= 8; - - /* ... plus offset to the on disk position */ - return device->ldev->md.md_offset + device->ldev->md.al_offset + t; -} - -int al_write_transaction(struct drbd_device *device) -{ - struct al_transaction_on_disk *buffer; - struct lc_element *e; - sector_t sector; - int i, mx; - unsigned extent_nr; - unsigned crc = 0; - int err = 0; - - if (!get_ldev(device)) { - drbd_err(device, "disk is %s, cannot start al transaction\n", - drbd_disk_str(device->state.disk)); - return -EIO; - } - - /* The bitmap write may have failed, causing a state change. */ - if (device->state.disk < D_INCONSISTENT) { - drbd_err(device, - "disk is %s, cannot write al transaction\n", - drbd_disk_str(device->state.disk)); - put_ldev(device); - return -EIO; - } - - /* protects md_io_buffer, al_tr_cycle, ... */ - buffer = drbd_md_get_buffer(device, __func__); - if (!buffer) { - drbd_err(device, "disk failed while waiting for md_io buffer\n"); - put_ldev(device); - return -ENODEV; - } - - memset(buffer, 0, sizeof(*buffer)); - buffer->magic = cpu_to_be32(DRBD_AL_MAGIC); - buffer->tr_number = cpu_to_be32(device->al_tr_number); - - i = 0; - - /* Even though no one can start to change this list - * once we set the LC_LOCKED -- from drbd_al_begin_io(), - * lc_try_lock_for_transaction() --, someone may still - * be in the process of changing it. 
*/ - spin_lock_irq(&device->al_lock); - list_for_each_entry(e, &device->act_log->to_be_changed, list) { - if (i == AL_UPDATES_PER_TRANSACTION) { - i++; - break; - } - buffer->update_slot_nr[i] = cpu_to_be16(e->lc_index); - buffer->update_extent_nr[i] = cpu_to_be32(e->lc_new_number); - if (e->lc_number != LC_FREE) - drbd_bm_mark_for_writeout(device, - al_extent_to_bm_page(e->lc_number)); - i++; - } - spin_unlock_irq(&device->al_lock); - BUG_ON(i > AL_UPDATES_PER_TRANSACTION); - - buffer->n_updates = cpu_to_be16(i); - for ( ; i < AL_UPDATES_PER_TRANSACTION; i++) { - buffer->update_slot_nr[i] = cpu_to_be16(-1); - buffer->update_extent_nr[i] = cpu_to_be32(LC_FREE); - } - - buffer->context_size = cpu_to_be16(device->act_log->nr_elements); - buffer->context_start_slot_nr = cpu_to_be16(device->al_tr_cycle); - - mx = min_t(int, AL_CONTEXT_PER_TRANSACTION, - device->act_log->nr_elements - device->al_tr_cycle); - for (i = 0; i < mx; i++) { - unsigned idx = device->al_tr_cycle + i; - extent_nr = lc_element_by_index(device->act_log, idx)->lc_number; - buffer->context[i] = cpu_to_be32(extent_nr); - } - for (; i < AL_CONTEXT_PER_TRANSACTION; i++) - buffer->context[i] = cpu_to_be32(LC_FREE); - - device->al_tr_cycle += AL_CONTEXT_PER_TRANSACTION; - if (device->al_tr_cycle >= device->act_log->nr_elements) - device->al_tr_cycle = 0; - - sector = al_tr_number_to_on_disk_sector(device); - - crc = crc32c(0, buffer, 4096); - buffer->crc32c = cpu_to_be32(crc); - - if (drbd_bm_write_hinted(device)) - err = -EIO; - else { - bool write_al_updates; - rcu_read_lock(); - write_al_updates = rcu_dereference(device->ldev->disk_conf)->al_updates; - rcu_read_unlock(); - if (write_al_updates) { - if (drbd_md_sync_page_io(device, device->ldev, sector, WRITE)) { - err = -EIO; - drbd_chk_io_error(device, 1, DRBD_META_IO_ERROR); - } else { - device->al_tr_number++; - device->al_writ_cnt++; - } - } - } - - drbd_md_put_buffer(device); - put_ldev(device); - - return err; -} - static int _try_lc_del(struct drbd_device *device, struct lc_element *al_ext) { int rv; @@ -606,21 +614,24 @@ void drbd_al_shrink(struct drbd_device *device) wake_up(&device->al_wait); } -int drbd_initialize_al(struct drbd_device *device, void *buffer) +int drbd_al_initialize(struct drbd_device *device, void *buffer) { struct al_transaction_on_disk *al = buffer; struct drbd_md *md = &device->ldev->md; - sector_t al_base = md->md_offset + md->al_offset; int al_size_4k = md->al_stripes * md->al_stripe_size_4k; int i; - memset(al, 0, 4096); - al->magic = cpu_to_be32(DRBD_AL_MAGIC); - al->transaction_type = cpu_to_be16(AL_TR_INITIALIZED); - al->crc32c = cpu_to_be32(crc32c(0, al, 4096)); + __al_write_transaction(device, al); + /* There may or may not have been a pending transaction. */ + spin_lock_irq(&device->al_lock); + lc_committed(device->act_log); + spin_unlock_irq(&device->al_lock); - for (i = 0; i < al_size_4k; i++) { - int err = drbd_md_sync_page_io(device, device->ldev, al_base + i * 8, WRITE); + /* The rest of the transactions will have an empty "updates" list, and + * are written out only to provide the context, and to initialize the + * on-disk ring buffer. 
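+ * (Transaction 0, written via __al_write_transaction() above, already
+ * carries the current context and whatever updates were still pending,
+ * hence the loop below starting at i = 1.)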
*/ + for (i = 1; i < al_size_4k; i++) { + int err = __al_write_transaction(device, al); if (err) return err; } diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 9462d2752850..92d6fc020a65 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -24,7 +24,7 @@ #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt -#include <linux/bitops.h> +#include <linux/bitmap.h> #include <linux/vmalloc.h> #include <linux/string.h> #include <linux/drbd.h> @@ -364,12 +364,9 @@ static void bm_free_pages(struct page **pages, unsigned long number) } } -static void bm_vk_free(void *ptr, int v) +static inline void bm_vk_free(void *ptr) { - if (v) - vfree(ptr); - else - kfree(ptr); + kvfree(ptr); } /* @@ -379,7 +376,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) { struct page **old_pages = b->bm_pages; struct page **new_pages, *page; - unsigned int i, bytes, vmalloced = 0; + unsigned int i, bytes; unsigned long have = b->bm_number_of_pages; BUG_ON(have == 0 && old_pages != NULL); @@ -401,7 +398,6 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) PAGE_KERNEL); if (!new_pages) return NULL; - vmalloced = 1; } if (want >= have) { @@ -411,7 +407,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) page = alloc_page(GFP_NOIO | __GFP_HIGHMEM); if (!page) { bm_free_pages(new_pages + have, i - have); - bm_vk_free(new_pages, vmalloced); + bm_vk_free(new_pages); return NULL; } /* we want to know which page it is @@ -427,11 +423,6 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) */ } - if (vmalloced) - b->bm_flags |= BM_P_VMALLOCED; - else - b->bm_flags &= ~BM_P_VMALLOCED; - return new_pages; } @@ -469,7 +460,7 @@ void drbd_bm_cleanup(struct drbd_device *device) if (!expect(device->bitmap)) return; bm_free_pages(device->bitmap->bm_pages, device->bitmap->bm_number_of_pages); - bm_vk_free(device->bitmap->bm_pages, (BM_P_VMALLOCED & device->bitmap->bm_flags)); + bm_vk_free(device->bitmap->bm_pages); kfree(device->bitmap); device->bitmap = NULL; } @@ -479,8 +470,14 @@ void drbd_bm_cleanup(struct drbd_device *device) * this masks out the remaining bits. * Returns the number of bits cleared. 
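 * For example, with the common 4 KiB pages (PAGE_SHIFT == 12), the
 * definition guarded below comes out to BITS_PER_PAGE == 1UL << 15
 * == 32768 bits per bitmap page, and BITS_PER_PAGE_MASK == 0x7fff.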
*/ +#ifndef BITS_PER_PAGE #define BITS_PER_PAGE (1UL << (PAGE_SHIFT + 3)) #define BITS_PER_PAGE_MASK (BITS_PER_PAGE - 1) +#else +# if BITS_PER_PAGE != (1UL << (PAGE_SHIFT + 3)) +# error "ambiguous BITS_PER_PAGE" +# endif +#endif #define BITS_PER_LONG_MASK (BITS_PER_LONG - 1) static int bm_clear_surplus(struct drbd_bitmap *b) { @@ -559,21 +556,19 @@ static unsigned long bm_count_bits(struct drbd_bitmap *b) unsigned long *p_addr; unsigned long bits = 0; unsigned long mask = (1UL << (b->bm_bits & BITS_PER_LONG_MASK)) -1; - int idx, i, last_word; + int idx, last_word; /* all but last page */ for (idx = 0; idx < b->bm_number_of_pages - 1; idx++) { p_addr = __bm_map_pidx(b, idx); - for (i = 0; i < LWPP; i++) - bits += hweight_long(p_addr[i]); + bits += bitmap_weight(p_addr, BITS_PER_PAGE); __bm_unmap(p_addr); cond_resched(); } /* last (or only) page */ last_word = ((b->bm_bits - 1) & BITS_PER_PAGE_MASK) >> LN2_BPL; p_addr = __bm_map_pidx(b, idx); - for (i = 0; i < last_word; i++) - bits += hweight_long(p_addr[i]); + bits += bitmap_weight(p_addr, last_word * BITS_PER_LONG); p_addr[last_word] &= cpu_to_lel(mask); bits += hweight_long(p_addr[last_word]); /* 32bit arch, may have an unused padding long */ @@ -639,7 +634,6 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi unsigned long want, have, onpages; /* number of pages */ struct page **npages, **opages = NULL; int err = 0, growing; - int opages_vmalloced; if (!expect(b)) return -ENOMEM; @@ -652,8 +646,6 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi if (capacity == b->bm_dev_capacity) goto out; - opages_vmalloced = (BM_P_VMALLOCED & b->bm_flags); - if (capacity == 0) { spin_lock_irq(&b->bm_lock); opages = b->bm_pages; @@ -667,7 +659,7 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi b->bm_dev_capacity = 0; spin_unlock_irq(&b->bm_lock); bm_free_pages(opages, onpages); - bm_vk_free(opages, opages_vmalloced); + bm_vk_free(opages); goto out; } bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT)); @@ -740,7 +732,7 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi spin_unlock_irq(&b->bm_lock); if (opages != npages) - bm_vk_free(opages, opages_vmalloced); + bm_vk_free(opages); if (!growing) b->bm_set = bm_count_bits(b); drbd_info(device, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want); @@ -1419,6 +1411,9 @@ static inline void bm_set_full_words_within_one_page(struct drbd_bitmap *b, int bits; int changed = 0; unsigned long *paddr = kmap_atomic(b->bm_pages[page_nr]); + + /* I think it is more cache line friendly to hweight_long then set to ~0UL, + * than to first bitmap_weight() all words, then bitmap_fill() all words */ for (i = first_word; i < last_word; i++) { bits = hweight_long(paddr[i]); paddr[i] = ~0UL; @@ -1628,8 +1623,7 @@ int drbd_bm_e_weight(struct drbd_device *device, unsigned long enr) int n = e-s; p_addr = bm_map_pidx(b, bm_word_to_page_idx(b, s)); bm = p_addr + MLPP(s); - while (n--) - count += hweight_long(*bm++); + count += bitmap_weight(bm, n * BITS_PER_LONG); bm_unmap(p_addr); } else { drbd_err(device, "start offset (%d) too large in drbd_bm_e_weight\n", s); diff --git a/drivers/block/drbd/drbd_debugfs.c b/drivers/block/drbd/drbd_debugfs.c index 6b88a35fb048..4de95bbff486 100644 --- a/drivers/block/drbd/drbd_debugfs.c +++ b/drivers/block/drbd/drbd_debugfs.c @@ -434,12 +434,12 @@ static int drbd_single_open(struct file *file, int (*show)(struct seq_file *, vo if (!parent || 
d_really_is_negative(parent)) goto out; /* serialize with d_delete() */ - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); /* Make sure the object is still alive */ if (simple_positive(file->f_path.dentry) && kref_get_unless_zero(kref)) ret = 0; - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); if (!ret) { ret = single_open(file, show, data); if (ret) @@ -771,6 +771,13 @@ static int device_data_gen_id_show(struct seq_file *m, void *ignored) return 0; } +static int device_ed_gen_id_show(struct seq_file *m, void *ignored) +{ + struct drbd_device *device = m->private; + seq_printf(m, "0x%016llX\n", (unsigned long long)device->ed_uuid); + return 0; +} + #define drbd_debugfs_device_attr(name) \ static int device_ ## name ## _open(struct inode *inode, struct file *file) \ { \ @@ -796,6 +803,7 @@ drbd_debugfs_device_attr(oldest_requests) drbd_debugfs_device_attr(act_log_extents) drbd_debugfs_device_attr(resync_extents) drbd_debugfs_device_attr(data_gen_id) +drbd_debugfs_device_attr(ed_gen_id) void drbd_debugfs_device_add(struct drbd_device *device) { @@ -839,6 +847,7 @@ void drbd_debugfs_device_add(struct drbd_device *device) DCF(act_log_extents); DCF(resync_extents); DCF(data_gen_id); + DCF(ed_gen_id); #undef DCF return; @@ -854,6 +863,7 @@ void drbd_debugfs_device_cleanup(struct drbd_device *device) drbd_debugfs_remove(&device->debugfs_vol_act_log_extents); drbd_debugfs_remove(&device->debugfs_vol_resync_extents); drbd_debugfs_remove(&device->debugfs_vol_data_gen_id); + drbd_debugfs_remove(&device->debugfs_vol_ed_gen_id); drbd_debugfs_remove(&device->debugfs_vol); } diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index e66d453a5f2b..34bc84efc29e 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -77,13 +77,6 @@ extern int fault_devs; extern char usermode_helper[]; -/* I don't remember why XCPU ... - * This is used to wake the asender, - * and to interrupt sending the sending task - * on disconnect. - */ -#define DRBD_SIG SIGXCPU - /* This is used to stop/restart our threads. * Cannot use SIGTERM nor SIGKILL, since these * are sent out by init on runlevel changes @@ -292,6 +285,9 @@ struct drbd_device_work { extern int drbd_wait_misc(struct drbd_device *, struct drbd_interval *); +extern void lock_all_resources(void); +extern void unlock_all_resources(void); + struct drbd_request { struct drbd_work w; struct drbd_device *device; @@ -504,7 +500,6 @@ enum { MD_NO_FUA, /* Users wants us to not use FUA/FLUSH on meta data dev */ - SUSPEND_IO, /* suspend application io */ BITMAP_IO, /* suspend application io; once no more io in flight, start bitmap io */ BITMAP_IO_QUEUED, /* Started bitmap IO */ @@ -541,9 +536,6 @@ struct drbd_bitmap; /* opaque for drbd_device */ /* definition of bits in bm_flags to be used in drbd_bm_lock * and drbd_bitmap_io and friends. */ enum bm_flag { - /* do we need to kfree, or vfree bm_pages? */ - BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */ - /* currently locked for bulk operation */ BM_LOCKED_MASK = 0xf, @@ -632,12 +624,6 @@ struct bm_io_work { void (*done)(struct drbd_device *device, int rv); }; -enum write_ordering_e { - WO_none, - WO_drain_io, - WO_bdev_flush, -}; - struct fifo_buffer { unsigned int head_index; unsigned int size; @@ -650,8 +636,7 @@ extern struct fifo_buffer *fifo_alloc(int fifo_size); enum { NET_CONGESTED, /* The data socket is congested */ RESOLVE_CONFLICTS, /* Set on one node, cleared on the peer! 
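 * (Below, SEND_PING loses its companion SIGNAL_ASENDER: with the
 * ack_receiver split out of the asender, wake_ack_receiver() signals
 * the thread directly whenever it is RUNNING.)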
*/ - SEND_PING, /* whether asender should send a ping asap */ - SIGNAL_ASENDER, /* whether asender wants to be interrupted */ + SEND_PING, GOT_PING_ACK, /* set when we receive a ping_ack packet, ping_wait gets woken */ CONN_WD_ST_CHG_REQ, /* A cluster wide state change on the connection is active */ CONN_WD_ST_CHG_OKAY, @@ -670,6 +655,8 @@ enum { DEVICE_WORK_PENDING, /* tell worker that some device has pending work */ }; +enum which_state { NOW, OLD = NOW, NEW }; + struct drbd_resource { char *name; #ifdef CONFIG_DEBUG_FS @@ -755,7 +742,8 @@ struct drbd_connection { unsigned long last_reconnect_jif; struct drbd_thread receiver; struct drbd_thread worker; - struct drbd_thread asender; + struct drbd_thread ack_receiver; + struct workqueue_struct *ack_sender; /* cached pointers, * so we can look up the oldest pending requests more quickly. @@ -774,6 +762,8 @@ struct drbd_connection { struct drbd_thread_timing_details r_timing_details[DRBD_THREAD_DETAILS_HIST]; struct { + unsigned long last_sent_barrier_jif; + /* whether this sender thread * has processed a single write yet. */ bool seen_any_write_yet; @@ -788,6 +778,17 @@ struct drbd_connection { } send; }; +static inline bool has_net_conf(struct drbd_connection *connection) +{ + bool has_net_conf; + + rcu_read_lock(); + has_net_conf = rcu_dereference(connection->net_conf); + rcu_read_unlock(); + + return has_net_conf; +} + void __update_timing_details( struct drbd_thread_timing_details *tdp, unsigned int *cb_nr, @@ -811,6 +812,7 @@ struct drbd_peer_device { struct list_head peer_devices; struct drbd_device *device; struct drbd_connection *connection; + struct work_struct send_acks_work; #ifdef CONFIG_DEBUG_FS struct dentry *debugfs_peer_dev; #endif @@ -829,6 +831,7 @@ struct drbd_device { struct dentry *debugfs_vol_act_log_extents; struct dentry *debugfs_vol_resync_extents; struct dentry *debugfs_vol_data_gen_id; + struct dentry *debugfs_vol_ed_gen_id; #endif unsigned int vnr; /* volume number within the connection */ @@ -873,6 +876,7 @@ struct drbd_device { atomic_t rs_pending_cnt; /* RS request/data packets on the wire */ atomic_t unacked_cnt; /* Need to send replies for */ atomic_t local_cnt; /* Waiting for local completion */ + atomic_t suspend_cnt; /* Interval tree of pending local requests */ struct rb_root read_requests; @@ -1020,6 +1024,12 @@ static inline struct drbd_peer_device *first_peer_device(struct drbd_device *dev return list_first_entry_or_null(&device->peer_devices, struct drbd_peer_device, peer_devices); } +static inline struct drbd_peer_device * +conn_peer_device(struct drbd_connection *connection, int volume_number) +{ + return idr_find(&connection->peer_devices, volume_number); +} + #define for_each_resource(resource, _resources) \ list_for_each_entry(resource, _resources, resources) @@ -1113,7 +1123,7 @@ extern int drbd_send_ov_request(struct drbd_peer_device *, sector_t sector, int extern int drbd_send_bitmap(struct drbd_device *device); extern void drbd_send_sr_reply(struct drbd_peer_device *, enum drbd_state_rv retcode); extern void conn_send_sr_reply(struct drbd_connection *connection, enum drbd_state_rv retcode); -extern void drbd_free_ldev(struct drbd_backing_dev *ldev); +extern void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev); extern void drbd_device_cleanup(struct drbd_device *device); void drbd_print_uuids(struct drbd_device *device, const char *text); @@ -1424,7 +1434,7 @@ extern struct bio_set *drbd_md_io_bio_set; /* to allocate from that set */ extern struct bio 
*bio_alloc_drbd(gfp_t gfp_mask); -extern rwlock_t global_state_lock; +extern struct mutex resources_mutex; extern int conn_lowest_minor(struct drbd_connection *connection); extern enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsigned int minor); @@ -1454,6 +1464,9 @@ extern int is_valid_ar_handle(struct drbd_request *, sector_t); /* drbd_nl.c */ + +extern struct mutex notification_mutex; + extern void drbd_suspend_io(struct drbd_device *device); extern void drbd_resume_io(struct drbd_device *device); extern char *ppsize(char *buf, unsigned long long size); @@ -1536,7 +1549,9 @@ extern void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req); /* drbd_receiver.c */ extern int drbd_receiver(struct drbd_thread *thi); -extern int drbd_asender(struct drbd_thread *thi); +extern int drbd_ack_receiver(struct drbd_thread *thi); +extern void drbd_send_ping_wf(struct work_struct *ws); +extern void drbd_send_acks_wf(struct work_struct *ws); extern bool drbd_rs_c_min_rate_throttle(struct drbd_device *device); extern bool drbd_rs_should_slow_down(struct drbd_device *device, sector_t sector, bool throttle_if_app_is_waiting); @@ -1649,7 +1664,7 @@ extern int __drbd_change_sync(struct drbd_device *device, sector_t sector, int s #define drbd_rs_failed_io(device, sector, size) \ __drbd_change_sync(device, sector, size, RECORD_RS_FAILED) extern void drbd_al_shrink(struct drbd_device *device); -extern int drbd_initialize_al(struct drbd_device *, void *); +extern int drbd_al_initialize(struct drbd_device *, void *); /* drbd_nl.c */ /* state info broadcast */ @@ -1668,6 +1683,29 @@ struct sib_info { }; void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib); +extern void notify_resource_state(struct sk_buff *, + unsigned int, + struct drbd_resource *, + struct resource_info *, + enum drbd_notification_type); +extern void notify_device_state(struct sk_buff *, + unsigned int, + struct drbd_device *, + struct device_info *, + enum drbd_notification_type); +extern void notify_connection_state(struct sk_buff *, + unsigned int, + struct drbd_connection *, + struct connection_info *, + enum drbd_notification_type); +extern void notify_peer_device_state(struct sk_buff *, + unsigned int, + struct drbd_peer_device *, + struct peer_device_info *, + enum drbd_notification_type); +extern void notify_helper(enum drbd_notification_type, struct drbd_device *, + struct drbd_connection *, const char *, int); + /* * inline helper functions *************************/ @@ -1694,19 +1732,6 @@ static inline int drbd_peer_req_has_active_page(struct drbd_peer_request *peer_r return 0; } -static inline enum drbd_state_rv -_drbd_set_state(struct drbd_device *device, union drbd_state ns, - enum chg_state_flags flags, struct completion *done) -{ - enum drbd_state_rv rv; - - read_lock(&global_state_lock); - rv = __drbd_set_state(device, ns, flags, done); - read_unlock(&global_state_lock); - - return rv; -} - static inline union drbd_state drbd_read_state(struct drbd_device *device) { struct drbd_resource *resource = device->resource; @@ -1937,16 +1962,21 @@ drbd_device_post_work(struct drbd_device *device, int work_bit) extern void drbd_flush_workqueue(struct drbd_work_queue *work_queue); -static inline void wake_asender(struct drbd_connection *connection) +/* To get the ack_receiver out of the blocking network stack, + * so it can change its sk_rcvtimeo from idle- to ping-timeout, + * and send a ping, we need to send a signal. + * Which signal we send is irrelevant. 
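+ * The signal only serves to make the blocking network receive return
+ * early; wake_ack_receiver() below delivers SIGXCPU, and request_ping()
+ * sets SEND_PING before waking the thread.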
*/ +static inline void wake_ack_receiver(struct drbd_connection *connection) { - if (test_bit(SIGNAL_ASENDER, &connection->flags)) - force_sig(DRBD_SIG, connection->asender.task); + struct task_struct *task = connection->ack_receiver.task; + if (task && get_t_state(&connection->ack_receiver) == RUNNING) + force_sig(SIGXCPU, task); } static inline void request_ping(struct drbd_connection *connection) { set_bit(SEND_PING, &connection->flags); - wake_asender(connection); + wake_ack_receiver(connection); } extern void *conn_prepare_command(struct drbd_connection *, struct drbd_socket *); @@ -2230,7 +2260,7 @@ static inline bool may_inc_ap_bio(struct drbd_device *device) if (drbd_suspended(device)) return false; - if (test_bit(SUSPEND_IO, &device->flags)) + if (atomic_read(&device->suspend_cnt)) return false; /* to avoid potential deadlock or bitmap corruption, diff --git a/drivers/block/drbd/drbd_main.c b/drivers/block/drbd/drbd_main.c index 74d97f4bac34..5b43dfb79819 100644 --- a/drivers/block/drbd/drbd_main.c +++ b/drivers/block/drbd/drbd_main.c @@ -117,6 +117,7 @@ module_param_string(usermode_helper, usermode_helper, sizeof(usermode_helper), 0 */ struct idr drbd_devices; struct list_head drbd_resources; +struct mutex resources_mutex; struct kmem_cache *drbd_request_cache; struct kmem_cache *drbd_ee_cache; /* peer requests */ @@ -1435,8 +1436,8 @@ static int we_should_drop_the_connection(struct drbd_connection *connection, str /* long elapsed = (long)(jiffies - device->last_received); */ drop_it = connection->meta.socket == sock - || !connection->asender.task - || get_t_state(&connection->asender) != RUNNING + || !connection->ack_receiver.task + || get_t_state(&connection->ack_receiver) != RUNNING || connection->cstate < C_WF_REPORT_PARAMS; if (drop_it) @@ -1793,15 +1794,6 @@ int drbd_send(struct drbd_connection *connection, struct socket *sock, drbd_update_congested(connection); } do { - /* STRANGE - * tcp_sendmsg does _not_ use its size parameter at all ? - * - * -EAGAIN on timeout, -EINTR on signal. - */ -/* THINK - * do we need to block DRBD_SIG if sock == &meta.socket ?? - * otherwise wake_asender() might interrupt some send_*Ack ! 
- */ rv = kernel_sendmsg(sock, &msg, &iov, 1, size); if (rv == -EAGAIN) { if (we_should_drop_the_connection(connection, sock)) @@ -2000,7 +1992,7 @@ void drbd_device_cleanup(struct drbd_device *device) drbd_bm_cleanup(device); } - drbd_free_ldev(device->ldev); + drbd_backing_dev_free(device, device->ldev); device->ldev = NULL; clear_bit(AL_SUSPENDED, &device->flags); @@ -2179,7 +2171,7 @@ void drbd_destroy_device(struct kref *kref) if (device->this_bdev) bdput(device->this_bdev); - drbd_free_ldev(device->ldev); + drbd_backing_dev_free(device, device->ldev); device->ldev = NULL; drbd_release_all_peer_reqs(device); @@ -2563,7 +2555,7 @@ int set_resource_options(struct drbd_resource *resource, struct res_opts *res_op cpumask_copy(resource->cpu_mask, new_cpu_mask); for_each_connection_rcu(connection, resource) { connection->receiver.reset_cpu_mask = 1; - connection->asender.reset_cpu_mask = 1; + connection->ack_receiver.reset_cpu_mask = 1; connection->worker.reset_cpu_mask = 1; } } @@ -2590,7 +2582,7 @@ struct drbd_resource *drbd_create_resource(const char *name) kref_init(&resource->kref); idr_init(&resource->devices); INIT_LIST_HEAD(&resource->connections); - resource->write_ordering = WO_bdev_flush; + resource->write_ordering = WO_BDEV_FLUSH; list_add_tail_rcu(&resource->resources, &drbd_resources); mutex_init(&resource->conf_update); mutex_init(&resource->adm_mutex); @@ -2652,8 +2644,8 @@ struct drbd_connection *conn_create(const char *name, struct res_opts *res_opts) connection->receiver.connection = connection; drbd_thread_init(resource, &connection->worker, drbd_worker, "worker"); connection->worker.connection = connection; - drbd_thread_init(resource, &connection->asender, drbd_asender, "asender"); - connection->asender.connection = connection; + drbd_thread_init(resource, &connection->ack_receiver, drbd_ack_receiver, "ack_recv"); + connection->ack_receiver.connection = connection; kref_init(&connection->kref); @@ -2702,8 +2694,8 @@ static int init_submitter(struct drbd_device *device) { /* opencoded create_singlethread_workqueue(), * to be able to say "drbd%d", ..., minor */ - device->submit.wq = alloc_workqueue("drbd%u_submit", - WQ_UNBOUND | WQ_MEM_RECLAIM, 1, device->minor); + device->submit.wq = + alloc_ordered_workqueue("drbd%u_submit", WQ_MEM_RECLAIM, device->minor); if (!device->submit.wq) return -ENOMEM; @@ -2820,6 +2812,7 @@ enum drbd_ret_code drbd_create_device(struct drbd_config_context *adm_ctx, unsig goto out_idr_remove_from_resource; } kref_get(&connection->kref); + INIT_WORK(&peer_device->send_acks_work, drbd_send_acks_wf); } if (init_submitter(device)) { @@ -2923,7 +2916,7 @@ static int __init drbd_init(void) drbd_proc = NULL; /* play safe for drbd_cleanup */ idr_init(&drbd_devices); - rwlock_init(&global_state_lock); + mutex_init(&resources_mutex); INIT_LIST_HEAD(&drbd_resources); err = drbd_genl_register(); @@ -2971,18 +2964,6 @@ fail: return err; } -void drbd_free_ldev(struct drbd_backing_dev *ldev) -{ - if (ldev == NULL) - return; - - blkdev_put(ldev->backing_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); - blkdev_put(ldev->md_bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); - - kfree(ldev->disk_conf); - kfree(ldev); -} - static void drbd_free_one_sock(struct drbd_socket *ds) { struct socket *s; @@ -3277,6 +3258,10 @@ int drbd_md_read(struct drbd_device *device, struct drbd_backing_dev *bdev) * and read it. 
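 * (The added lines below first pin md_size_sect to 8 sectors, i.e.
 * 8 * 512 == 4096 bytes, exactly one 4k superblock, so the
 * out-of-range check in drbd_md_sync_page_io() stays confined to the
 * superblock until the real sizes have been read.)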
*/ bdev->md.meta_dev_idx = bdev->disk_conf->meta_dev_idx; bdev->md.md_offset = drbd_md_ss(bdev); + /* Even for (flexible or indexed) external meta data, + * initially restrict us to the 4k superblock for now. + * Affects the paranoia out-of-range access check in drbd_md_sync_page_io(). */ + bdev->md.md_size_sect = 8; if (drbd_md_sync_page_io(device, bdev, bdev->md.md_offset, READ)) { /* NOTE: can't do normal error processing here as this is @@ -3578,7 +3563,9 @@ void drbd_queue_bitmap_io(struct drbd_device *device, spin_lock_irq(&device->resource->req_lock); set_bit(BITMAP_IO, &device->flags); - if (atomic_read(&device->ap_bio_cnt) == 0) { + /* don't wait for pending application IO if the caller indicates that + * application IO does not conflict anyways. */ + if (flags == BM_LOCKED_CHANGE_ALLOWED || atomic_read(&device->ap_bio_cnt) == 0) { if (!test_and_set_bit(BITMAP_IO_QUEUED, &device->flags)) drbd_queue_work(&first_peer_device(device)->connection->sender_work, &device->bm_io_work.w); @@ -3746,6 +3733,27 @@ int drbd_wait_misc(struct drbd_device *device, struct drbd_interval *i) return 0; } +void lock_all_resources(void) +{ + struct drbd_resource *resource; + int __maybe_unused i = 0; + + mutex_lock(&resources_mutex); + local_irq_disable(); + for_each_resource(resource, &drbd_resources) + spin_lock_nested(&resource->req_lock, i++); +} + +void unlock_all_resources(void) +{ + struct drbd_resource *resource; + + for_each_resource(resource, &drbd_resources) + spin_unlock(&resource->req_lock); + local_irq_enable(); + mutex_unlock(&resources_mutex); +} + #ifdef CONFIG_DRBD_FAULT_INJECTION /* Fault insertion support including random number generator shamelessly * stolen from kernel/rcutorture.c */ diff --git a/drivers/block/drbd/drbd_nl.c b/drivers/block/drbd/drbd_nl.c index e80cbefbc2b5..c055c5e12f24 100644 --- a/drivers/block/drbd/drbd_nl.c +++ b/drivers/block/drbd/drbd_nl.c @@ -36,6 +36,7 @@ #include "drbd_int.h" #include "drbd_protocol.h" #include "drbd_req.h" +#include "drbd_state_change.h" #include <asm/unaligned.h> #include <linux/drbd_limits.h> #include <linux/kthread.h> @@ -75,11 +76,24 @@ int drbd_adm_get_status(struct sk_buff *skb, struct genl_info *info); int drbd_adm_get_timeout_type(struct sk_buff *skb, struct genl_info *info); /* .dumpit */ int drbd_adm_get_status_all(struct sk_buff *skb, struct netlink_callback *cb); +int drbd_adm_dump_resources(struct sk_buff *skb, struct netlink_callback *cb); +int drbd_adm_dump_devices(struct sk_buff *skb, struct netlink_callback *cb); +int drbd_adm_dump_devices_done(struct netlink_callback *cb); +int drbd_adm_dump_connections(struct sk_buff *skb, struct netlink_callback *cb); +int drbd_adm_dump_connections_done(struct netlink_callback *cb); +int drbd_adm_dump_peer_devices(struct sk_buff *skb, struct netlink_callback *cb); +int drbd_adm_dump_peer_devices_done(struct netlink_callback *cb); +int drbd_adm_get_initial_state(struct sk_buff *skb, struct netlink_callback *cb); #include <linux/drbd_genl_api.h> #include "drbd_nla.h" #include <linux/genl_magic_func.h> +static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */ +static atomic_t notify_genl_seq = ATOMIC_INIT(2); /* two. */ + +DEFINE_MUTEX(notification_mutex); + /* used blkdev_get_by_path, to claim our meta data device(s) */ static char *drbd_m_holder = "Hands off! 
this is DRBD's meta data device."; @@ -349,6 +363,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd) sib.sib_reason = SIB_HELPER_PRE; sib.helper_name = cmd; drbd_bcast_event(device, &sib); + notify_helper(NOTIFY_CALL, device, connection, cmd, 0); ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); if (ret) drbd_warn(device, "helper command: %s %s %s exit code %u (0x%x)\n", @@ -361,6 +376,7 @@ int drbd_khelper(struct drbd_device *device, char *cmd) sib.sib_reason = SIB_HELPER_POST; sib.helper_exit_code = ret; drbd_bcast_event(device, &sib); + notify_helper(NOTIFY_RESPONSE, device, connection, cmd, ret); if (current == connection->worker.task) clear_bit(CALLBACK_PENDING, &connection->flags); @@ -388,6 +404,7 @@ static int conn_khelper(struct drbd_connection *connection, char *cmd) drbd_info(connection, "helper command: %s %s %s\n", usermode_helper, cmd, resource_name); /* TODO: conn_bcast_event() ?? */ + notify_helper(NOTIFY_CALL, NULL, connection, cmd, 0); ret = call_usermodehelper(usermode_helper, argv, envp, UMH_WAIT_PROC); if (ret) @@ -399,6 +416,7 @@ static int conn_khelper(struct drbd_connection *connection, char *cmd) usermode_helper, cmd, resource_name, (ret >> 8) & 0xff, ret); /* TODO: conn_bcast_event() ?? */ + notify_helper(NOTIFY_RESPONSE, NULL, connection, cmd, ret); if (ret < 0) /* Ignore any ERRNOs we got. */ ret = 0; @@ -847,9 +865,11 @@ char *ppsize(char *buf, unsigned long long size) * and can be long lived. * This changes an device->flag, is triggered by drbd internals, * and should be short-lived. */ +/* It needs to be a counter, since multiple threads might + independently suspend and resume IO. */ void drbd_suspend_io(struct drbd_device *device) { - set_bit(SUSPEND_IO, &device->flags); + atomic_inc(&device->suspend_cnt); if (drbd_suspended(device)) return; wait_event(device->misc_wait, !atomic_read(&device->ap_bio_cnt)); @@ -857,8 +877,8 @@ void drbd_suspend_io(struct drbd_device *device) void drbd_resume_io(struct drbd_device *device) { - clear_bit(SUSPEND_IO, &device->flags); - wake_up(&device->misc_wait); + if (atomic_dec_and_test(&device->suspend_cnt)) + wake_up(&device->misc_wait); } /** @@ -871,27 +891,32 @@ void drbd_resume_io(struct drbd_device *device) enum determine_dev_size drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct resize_parms *rs) __must_hold(local) { - sector_t prev_first_sect, prev_size; /* previous meta location */ - sector_t la_size_sect, u_size; + struct md_offsets_and_sizes { + u64 last_agreed_sect; + u64 md_offset; + s32 al_offset; + s32 bm_offset; + u32 md_size_sect; + + u32 al_stripes; + u32 al_stripe_size_4k; + } prev; + sector_t u_size, size; struct drbd_md *md = &device->ldev->md; - u32 prev_al_stripe_size_4k; - u32 prev_al_stripes; - sector_t size; char ppb[10]; void *buffer; int md_moved, la_size_changed; enum determine_dev_size rv = DS_UNCHANGED; - /* race: - * application request passes inc_ap_bio, - * but then cannot get an AL-reference. - * this function later may wait on ap_bio_cnt == 0. -> deadlock. + /* We may change the on-disk offsets of our meta data below. Lock out + * anything that may cause meta data IO, to avoid acting on incomplete + * layout changes or scribbling over meta data that is in the process + * of being moved. * - * to avoid that: - * Suspend IO right here. - * still lock the act_log to not trigger ASSERTs there. 
- */ + * Move is not exactly correct, btw, currently we have all our meta + * data in core memory, to "move" it we just write it all out, there + * are no reads. */ drbd_suspend_io(device); buffer = drbd_md_get_buffer(device, __func__); /* Lock meta-data IO */ if (!buffer) { @@ -899,19 +924,17 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct return DS_ERROR; } - /* no wait necessary anymore, actually we could assert that */ - wait_event(device->al_wait, lc_try_lock(device->act_log)); - - prev_first_sect = drbd_md_first_sector(device->ldev); - prev_size = device->ldev->md.md_size_sect; - la_size_sect = device->ldev->md.la_size_sect; + /* remember current offset and sizes */ + prev.last_agreed_sect = md->la_size_sect; + prev.md_offset = md->md_offset; + prev.al_offset = md->al_offset; + prev.bm_offset = md->bm_offset; + prev.md_size_sect = md->md_size_sect; + prev.al_stripes = md->al_stripes; + prev.al_stripe_size_4k = md->al_stripe_size_4k; if (rs) { /* rs is non NULL if we should change the AL layout only */ - - prev_al_stripes = md->al_stripes; - prev_al_stripe_size_4k = md->al_stripe_size_4k; - md->al_stripes = rs->al_stripes; md->al_stripe_size_4k = rs->al_stripe_size / 4; md->al_size_4k = (u64)rs->al_stripes * rs->al_stripe_size / 4; @@ -924,7 +947,7 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct rcu_read_unlock(); size = drbd_new_dev_size(device, device->ldev, u_size, flags & DDSF_FORCED); - if (size < la_size_sect) { + if (size < prev.last_agreed_sect) { if (rs && u_size == 0) { /* Remove "rs &&" later. This check should always be active, but right now the receiver expects the permissive behavior */ @@ -945,30 +968,29 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct err = drbd_bm_resize(device, size, !(flags & DDSF_NO_RESYNC)); if (unlikely(err)) { /* currently there is only one error: ENOMEM! */ - size = drbd_bm_capacity(device)>>1; + size = drbd_bm_capacity(device); if (size == 0) { drbd_err(device, "OUT OF MEMORY! " "Could not allocate bitmap!\n"); } else { drbd_err(device, "BM resizing failed. " - "Leaving size unchanged at size = %lu KB\n", - (unsigned long)size); + "Leaving size unchanged\n"); } rv = DS_ERROR; } /* racy, see comments above. */ drbd_set_my_capacity(device, size); - device->ldev->md.la_size_sect = size; + md->la_size_sect = size; drbd_info(device, "size = %s (%llu KB)\n", ppsize(ppb, size>>1), (unsigned long long)size>>1); } if (rv <= DS_ERROR) goto err_out; - la_size_changed = (la_size_sect != device->ldev->md.la_size_sect); + la_size_changed = (prev.last_agreed_sect != md->la_size_sect); - md_moved = prev_first_sect != drbd_md_first_sector(device->ldev) - || prev_size != device->ldev->md.md_size_sect; + md_moved = prev.md_offset != md->md_offset + || prev.md_size_sect != md->md_size_sect; if (la_size_changed || md_moved || rs) { u32 prev_flags; @@ -977,20 +999,29 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct * Clear the timer, to avoid scary "timer expired!" messages, * "Superblock" is written out at least twice below, anyways. */ del_timer(&device->md_sync_timer); - drbd_al_shrink(device); /* All extents inactive. */ + /* We won't change the "al-extents" setting, we just may need + * to move the on-disk location of the activity log ringbuffer. + * Lock for transaction is good enough, it may well be "dirty" + * or even "starving". 
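+ * The flag dance that follows is deliberate: MDF_FULL_SYNC and
+ * MDF_AL_DISABLED are set and synced to disk before bitmap and AL are
+ * rewritten at their new offsets, and the previous flags are restored
+ * only afterwards, so a crash in between forces a full sync rather
+ * than trusting half-moved meta data.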
*/ + wait_event(device->al_wait, lc_try_lock_for_transaction(device->act_log)); + + /* mark current on-disk bitmap and activity log as unreliable */ prev_flags = md->flags; - md->flags &= ~MDF_PRIMARY_IND; + md->flags |= MDF_FULL_SYNC | MDF_AL_DISABLED; drbd_md_write(device, buffer); + drbd_al_initialize(device, buffer); + drbd_info(device, "Writing the whole bitmap, %s\n", la_size_changed && md_moved ? "size changed and md moved" : la_size_changed ? "size changed" : "md moved"); /* next line implicitly does drbd_suspend_io()+drbd_resume_io() */ drbd_bitmap_io(device, md_moved ? &drbd_bm_write_all : &drbd_bm_write, "size changed", BM_LOCKED_MASK); - drbd_initialize_al(device, buffer); + /* on-disk bitmap and activity log is authoritative again + * (unless there was an IO error meanwhile...) */ md->flags = prev_flags; drbd_md_write(device, buffer); @@ -999,20 +1030,22 @@ drbd_determine_dev_size(struct drbd_device *device, enum dds_flags flags, struct md->al_stripes, md->al_stripe_size_4k * 4); } - if (size > la_size_sect) - rv = la_size_sect ? DS_GREW : DS_GREW_FROM_ZERO; - if (size < la_size_sect) + if (size > prev.last_agreed_sect) + rv = prev.last_agreed_sect ? DS_GREW : DS_GREW_FROM_ZERO; + if (size < prev.last_agreed_sect) rv = DS_SHRUNK; if (0) { err_out: - if (rs) { - md->al_stripes = prev_al_stripes; - md->al_stripe_size_4k = prev_al_stripe_size_4k; - md->al_size_4k = (u64)prev_al_stripes * prev_al_stripe_size_4k; - - drbd_md_set_sector_offsets(device, device->ldev); - } + /* restore previous offset and sizes */ + md->la_size_sect = prev.last_agreed_sect; + md->md_offset = prev.md_offset; + md->al_offset = prev.al_offset; + md->bm_offset = prev.bm_offset; + md->md_size_sect = prev.md_size_sect; + md->al_stripes = prev.al_stripes; + md->al_stripe_size_4k = prev.al_stripe_size_4k; + md->al_size_4k = (u64)prev.al_stripes * prev.al_stripe_size_4k; } lc_unlock(device->act_log); wake_up(&device->al_wait); @@ -1115,8 +1148,7 @@ static int drbd_check_al_size(struct drbd_device *device, struct disk_conf *dc) lc_destroy(n); return -EBUSY; } else { - if (t) - lc_destroy(t); + lc_destroy(t); } drbd_md_mark_dirty(device); /* we changed device->act_log->nr_elemens */ return 0; @@ -1151,21 +1183,20 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi if (b) { struct drbd_connection *connection = first_peer_device(device)->connection; + blk_queue_max_discard_sectors(q, DRBD_MAX_DISCARD_SECTORS); + if (blk_queue_discard(b) && (connection->cstate < C_CONNECTED || connection->agreed_features & FF_TRIM)) { - /* For now, don't allow more than one activity log extent worth of data - * to be discarded in one go. We may need to rework drbd_al_begin_io() - * to allow for even larger discard ranges */ - blk_queue_max_discard_sectors(q, DRBD_MAX_DISCARD_SECTORS); - + /* We don't care, stacking below should fix it for the local device. + * Whether or not it is a suitable granularity on the remote device + * is not our problem, really. If you care, you need to + * use devices with similar topology on all peers. */ + q->limits.discard_granularity = 512; queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, q); - /* REALLY? Is stacking secdiscard "legal"? 
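The prev_flags dance above is the crash-safety core of the resize: while the bitmap and activity log are rewritten at their new on-disk location, the superblock must claim they are unusable, so an ill-timed crash forces a full sync instead of trusting half-moved metadata. The pattern in isolation; the flag values and helpers here are illustrative, not DRBD's real ones:

    #include <stdint.h>

    enum { MDF_FULL_SYNC = 1u << 0, MDF_AL_DISABLED = 1u << 1 }; /* illustrative bits */
    struct md { uint32_t flags; };

    static void write_super(struct md *md)           { (void)md; } /* stand-in */
    static void rewrite_bitmap_and_al(struct md *md) { (void)md; } /* stand-in */

    static void move_metadata(struct md *md)
    {
        uint32_t prev_flags = md->flags;

        md->flags |= MDF_FULL_SYNC | MDF_AL_DISABLED;  /* on-disk copies untrusted */
        write_super(md);

        rewrite_bitmap_and_al(md);      /* a crash in here forces a full sync */

        md->flags = prev_flags;         /* bitmap and AL authoritative again */
        write_super(md);
    }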
*/ - if (blk_queue_secdiscard(b)) - queue_flag_set_unlocked(QUEUE_FLAG_SECDISCARD, q); } else { blk_queue_max_discard_sectors(q, 0); queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, q); - queue_flag_clear_unlocked(QUEUE_FLAG_SECDISCARD, q); + q->limits.discard_granularity = 0; } blk_queue_stack_limits(q, b); @@ -1177,6 +1208,12 @@ static void drbd_setup_queue_param(struct drbd_device *device, struct drbd_backi q->backing_dev_info.ra_pages = b->backing_dev_info.ra_pages; } } + /* To avoid confusion, if this queue does not support discard, clear + * max_discard_sectors, which is what lsblk -D reports to the user. */ + if (!blk_queue_discard(q)) { + blk_queue_max_discard_sectors(q, 0); + q->limits.discard_granularity = 0; + } } void drbd_reconsider_max_bio_size(struct drbd_device *device, struct drbd_backing_dev *bdev) @@ -1241,8 +1278,8 @@ static void conn_reconfig_done(struct drbd_connection *connection) connection->cstate == C_STANDALONE; spin_unlock_irq(&connection->resource->req_lock); if (stop_threads) { - /* asender is implicitly stopped by receiver - * in conn_disconnect() */ + /* ack_receiver thread and ack_sender workqueue are implicitly + * stopped by receiver in conn_disconnect() */ drbd_thread_stop(&connection->receiver); drbd_thread_stop(&connection->worker); } @@ -1389,13 +1426,13 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) goto fail_unlock; } - write_lock_irq(&global_state_lock); + lock_all_resources(); retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after); if (retcode == NO_ERROR) { rcu_assign_pointer(device->ldev->disk_conf, new_disk_conf); drbd_resync_after_changed(device); } - write_unlock_irq(&global_state_lock); + unlock_all_resources(); if (retcode != NO_ERROR) goto fail_unlock; @@ -1418,7 +1455,7 @@ int drbd_adm_disk_opts(struct sk_buff *skb, struct genl_info *info) set_bit(MD_NO_FUA, &device->flags); if (write_ordering_changed(old_disk_conf, new_disk_conf)) - drbd_bump_write_ordering(device->resource, NULL, WO_bdev_flush); + drbd_bump_write_ordering(device->resource, NULL, WO_BDEV_FLUSH); drbd_md_sync(device); @@ -1449,6 +1486,88 @@ success: return 0; } +static struct block_device *open_backing_dev(struct drbd_device *device, + const char *bdev_path, void *claim_ptr, bool do_bd_link) +{ + struct block_device *bdev; + int err = 0; + + bdev = blkdev_get_by_path(bdev_path, + FMODE_READ | FMODE_WRITE | FMODE_EXCL, claim_ptr); + if (IS_ERR(bdev)) { + drbd_err(device, "open(\"%s\") failed with %ld\n", + bdev_path, PTR_ERR(bdev)); + return bdev; + } + + if (!do_bd_link) + return bdev; + + err = bd_link_disk_holder(bdev, device->vdisk); + if (err) { + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); + drbd_err(device, "bd_link_disk_holder(\"%s\", ...) failed with %d\n", + bdev_path, err); + bdev = ERR_PTR(err); + } + return bdev; +} + +static int open_backing_devices(struct drbd_device *device, + struct disk_conf *new_disk_conf, + struct drbd_backing_dev *nbc) +{ + struct block_device *bdev; + + bdev = open_backing_dev(device, new_disk_conf->backing_dev, device, true); + if (IS_ERR(bdev)) + return ERR_OPEN_DISK; + nbc->backing_bdev = bdev; + + /* + * meta_dev_idx >= 0: external fixed size, possibly multiple + * drbd sharing one meta device. TODO in that case, paranoia + * check that [md_bdev, meta_dev_idx] is not yet used by some + * other drbd minor! 
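The new tail of drbd_setup_queue_param() enforces a simple invariant: a queue that does not advertise discard support must not advertise discard limits either, because tools such as lsblk -D read max_discard_sectors directly. Stated as a check; this helper is hypothetical, not part of the patch:

    #include <linux/blkdev.h>

    /* True iff the queue's discard limits are consistent with its
     * QUEUE_FLAG_DISCARD setting. */
    static bool discard_limits_coherent(struct request_queue *q)
    {
        if (blk_queue_discard(q))
            return true;
        return q->limits.max_discard_sectors == 0 &&
               q->limits.discard_granularity == 0;
    }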
(if you use drbd.conf + drbdadm, that + * should check it for you already; but if you don't, or + * someone fooled it, we need to double check here) + */ + bdev = open_backing_dev(device, new_disk_conf->meta_dev, + /* claim ptr: device, if claimed exclusively; shared drbd_m_holder, + * if potentially shared with other drbd minors */ + (new_disk_conf->meta_dev_idx < 0) ? (void*)device : (void*)drbd_m_holder, + /* avoid double bd_claim_by_disk() for the same (source,target) tuple, + * as would happen with internal metadata. */ + (new_disk_conf->meta_dev_idx != DRBD_MD_INDEX_FLEX_INT && + new_disk_conf->meta_dev_idx != DRBD_MD_INDEX_INTERNAL)); + if (IS_ERR(bdev)) + return ERR_OPEN_MD_DISK; + nbc->md_bdev = bdev; + return NO_ERROR; +} + +static void close_backing_dev(struct drbd_device *device, struct block_device *bdev, + bool do_bd_unlink) +{ + if (!bdev) + return; + if (do_bd_unlink) + bd_unlink_disk_holder(bdev, device->vdisk); + blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); +} + +void drbd_backing_dev_free(struct drbd_device *device, struct drbd_backing_dev *ldev) +{ + if (ldev == NULL) + return; + + close_backing_dev(device, ldev->md_bdev, ldev->md_bdev != ldev->backing_bdev); + close_backing_dev(device, ldev->backing_bdev, true); + + kfree(ldev->disk_conf); + kfree(ldev); +} + int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) { struct drbd_config_context adm_ctx; @@ -1462,7 +1581,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) sector_t min_md_device_sectors; struct drbd_backing_dev *nbc = NULL; /* new_backing_conf */ struct disk_conf *new_disk_conf = NULL; - struct block_device *bdev; struct lru_cache *resync_lru = NULL; struct fifo_buffer *new_plan = NULL; union drbd_state ns, os; @@ -1478,7 +1596,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) device = adm_ctx.device; mutex_lock(&adm_ctx.resource->adm_mutex); peer_device = first_peer_device(device); - connection = peer_device ? peer_device->connection : NULL; + connection = peer_device->connection; conn_reconfig_start(connection); /* if you want to reconfigure, please tear down first */ @@ -1539,12 +1657,6 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) goto fail; } - write_lock_irq(&global_state_lock); - retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after); - write_unlock_irq(&global_state_lock); - if (retcode != NO_ERROR) - goto fail; - rcu_read_lock(); nc = rcu_dereference(connection->net_conf); if (nc) { @@ -1556,35 +1668,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) } rcu_read_unlock(); - bdev = blkdev_get_by_path(new_disk_conf->backing_dev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL, device); - if (IS_ERR(bdev)) { - drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->backing_dev, - PTR_ERR(bdev)); - retcode = ERR_OPEN_DISK; - goto fail; - } - nbc->backing_bdev = bdev; - - /* - * meta_dev_idx >= 0: external fixed size, possibly multiple - * drbd sharing one meta device. TODO in that case, paranoia - * check that [md_bdev, meta_dev_idx] is not yet used by some - * other drbd minor! (if you use drbd.conf + drbdadm, that - * should check it for you already; but if you don't, or - * someone fooled it, we need to double check here) - */ - bdev = blkdev_get_by_path(new_disk_conf->meta_dev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL, - (new_disk_conf->meta_dev_idx < 0) ? 
- (void *)device : (void *)drbd_m_holder); - if (IS_ERR(bdev)) { - drbd_err(device, "open(\"%s\") failed with %ld\n", new_disk_conf->meta_dev, - PTR_ERR(bdev)); - retcode = ERR_OPEN_MD_DISK; + retcode = open_backing_devices(device, new_disk_conf, nbc); + if (retcode != NO_ERROR) goto fail; - } - nbc->md_bdev = bdev; if ((nbc->backing_bdev == nbc->md_bdev) != (new_disk_conf->meta_dev_idx == DRBD_MD_INDEX_INTERNAL || @@ -1707,6 +1793,13 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) goto force_diskless_dec; } + lock_all_resources(); + retcode = drbd_resync_after_valid(device, new_disk_conf->resync_after); + if (retcode != NO_ERROR) { + unlock_all_resources(); + goto force_diskless_dec; + } + /* Reset the "barriers don't work" bits here, then force meta data to * be written, to ensure we determine if barriers are supported. */ if (new_disk_conf->md_flushes) @@ -1727,7 +1820,9 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) new_disk_conf = NULL; new_plan = NULL; - drbd_bump_write_ordering(device->resource, device->ldev, WO_bdev_flush); + drbd_resync_after_changed(device); + drbd_bump_write_ordering(device->resource, device->ldev, WO_BDEV_FLUSH); + unlock_all_resources(); if (drbd_md_test_flag(device->ldev, MDF_CRASHED_PRIMARY)) set_bit(CRASHED_PRIMARY, &device->flags); @@ -1875,12 +1970,8 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) fail: conn_reconfig_done(connection); if (nbc) { - if (nbc->backing_bdev) - blkdev_put(nbc->backing_bdev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL); - if (nbc->md_bdev) - blkdev_put(nbc->md_bdev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL); + close_backing_dev(device, nbc->md_bdev, nbc->md_bdev != nbc->backing_bdev); + close_backing_dev(device, nbc->backing_bdev, true); kfree(nbc); } kfree(new_disk_conf); @@ -1895,6 +1986,7 @@ int drbd_adm_attach(struct sk_buff *skb, struct genl_info *info) static int adm_detach(struct drbd_device *device, int force) { enum drbd_state_rv retcode; + void *buffer; int ret; if (force) { @@ -1905,13 +1997,16 @@ static int adm_detach(struct drbd_device *device, int force) } drbd_suspend_io(device); /* so no-one is stuck in drbd_al_begin_io */ - drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */ - retcode = drbd_request_state(device, NS(disk, D_FAILED)); - drbd_md_put_buffer(device); + buffer = drbd_md_get_buffer(device, __func__); /* make sure there is no in-flight meta-data IO */ + if (buffer) { + retcode = drbd_request_state(device, NS(disk, D_FAILED)); + drbd_md_put_buffer(device); + } else /* already <= D_FAILED */ + retcode = SS_NOTHING_TO_DO; /* D_FAILED will transition to DISKLESS. 
*/ + drbd_resume_io(device); ret = wait_event_interruptible(device->misc_wait, device->state.disk != D_FAILED); - drbd_resume_io(device); if ((int)retcode == (int)SS_IS_DISKLESS) retcode = SS_NOTHING_TO_DO; if (ret) @@ -2245,8 +2340,31 @@ int drbd_adm_net_opts(struct sk_buff *skb, struct genl_info *info) return 0; } +static void connection_to_info(struct connection_info *info, + struct drbd_connection *connection) +{ + info->conn_connection_state = connection->cstate; + info->conn_role = conn_highest_peer(connection); +} + +static void peer_device_to_info(struct peer_device_info *info, + struct drbd_peer_device *peer_device) +{ + struct drbd_device *device = peer_device->device; + + info->peer_repl_state = + max_t(enum drbd_conns, C_WF_REPORT_PARAMS, device->state.conn); + info->peer_disk_state = device->state.pdsk; + info->peer_resync_susp_user = device->state.user_isp; + info->peer_resync_susp_peer = device->state.peer_isp; + info->peer_resync_susp_dependency = device->state.aftr_isp; +} + int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) { + struct connection_info connection_info; + enum drbd_notification_type flags; + unsigned int peer_devices = 0; struct drbd_config_context adm_ctx; struct drbd_peer_device *peer_device; struct net_conf *old_net_conf, *new_net_conf = NULL; @@ -2347,6 +2465,22 @@ int drbd_adm_connect(struct sk_buff *skb, struct genl_info *info) connection->peer_addr_len = nla_len(adm_ctx.peer_addr); memcpy(&connection->peer_addr, nla_data(adm_ctx.peer_addr), connection->peer_addr_len); + idr_for_each_entry(&connection->peer_devices, peer_device, i) { + peer_devices++; + } + + connection_to_info(&connection_info, connection); + flags = (peer_devices--) ? NOTIFY_CONTINUES : 0; + mutex_lock(¬ification_mutex); + notify_connection_state(NULL, 0, connection, &connection_info, NOTIFY_CREATE | flags); + idr_for_each_entry(&connection->peer_devices, peer_device, i) { + struct peer_device_info peer_device_info; + + peer_device_to_info(&peer_device_info, peer_device); + flags = (peer_devices--) ? NOTIFY_CONTINUES : 0; + notify_peer_device_state(NULL, 0, peer_device, &peer_device_info, NOTIFY_CREATE | flags); + } + mutex_unlock(¬ification_mutex); mutex_unlock(&adm_ctx.resource->conf_update); rcu_read_lock(); @@ -2428,6 +2562,8 @@ static enum drbd_state_rv conn_try_disconnect(struct drbd_connection *connection drbd_err(connection, "unexpected rv2=%d in conn_try_disconnect()\n", rv2); + /* Unlike in DRBD 9, the state engine has generated + * NOTIFY_DESTROY events before clearing connection->net_conf. */ } return rv; } @@ -2585,6 +2721,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) mutex_unlock(&device->resource->conf_update); synchronize_rcu(); kfree(old_disk_conf); + new_disk_conf = NULL; } ddsf = (rs.resize_force ? DDSF_FORCED : 0) | (rs.no_resync ? DDSF_NO_RESYNC : 0); @@ -2618,6 +2755,7 @@ int drbd_adm_resize(struct sk_buff *skb, struct genl_info *info) fail_ldev: put_ldev(device); + kfree(new_disk_conf); goto fail; } @@ -2855,7 +2993,30 @@ int drbd_adm_resume_io(struct sk_buff *skb, struct genl_info *info) mutex_lock(&adm_ctx.resource->adm_mutex); device = adm_ctx.device; if (test_bit(NEW_CUR_UUID, &device->flags)) { - drbd_uuid_new_current(device); + if (get_ldev_if_state(device, D_ATTACHING)) { + drbd_uuid_new_current(device); + put_ldev(device); + } else { + /* This is effectively a multi-stage "forced down". 
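The two small additions in drbd_adm_resize() close a memory leak without risking a double free: once rcu_assign_pointer() has published new_disk_conf, the local pointer is set to NULL, so the shared fail_ldev path can kfree() it unconditionally. The publish-then-NULL idiom in a self-contained sketch; publish() and validate() are stand-ins:

    #include <stdlib.h>

    struct conf { int dummy; };

    static int  validate(struct conf *c)                     { (void)c; return 0; }
    static void publish(struct conf **slot, struct conf *c)  { *slot = c; }

    static int reconfigure(struct conf **slot, struct conf *new_conf)
    {
        if (validate(new_conf) < 0)
            goto fail;
        publish(slot, new_conf);
        new_conf = NULL;        /* ownership transferred to readers */
        /* ... later steps may still fail and jump to fail ... */
        return 0;
    fail:
        free(new_conf);         /* frees only if not yet published */
        return -1;
    }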
+ * The NEW_CUR_UUID bit is supposedly only set, if we + * lost the replication connection, and are configured + * to freeze IO and wait for some fence-peer handler. + * So we still don't have a replication connection. + * And now we don't have a local disk either. After + * resume, we will fail all pending and new IO, because + * we don't have any data anymore. Which means we will + * eventually be able to terminate all users of this + * device, and then take it down. By bumping the + * "effective" data uuid, we make sure that you really + * need to tear down before you reconfigure, we will + * the refuse to re-connect or re-attach (because no + * matching real data uuid exists). + */ + u64 val; + get_random_bytes(&val, sizeof(u64)); + drbd_set_ed_uuid(device, val); + drbd_warn(device, "Resumed without access to data; please tear down before attempting to re-configure.\n"); + } clear_bit(NEW_CUR_UUID, &device->flags); } drbd_suspend_io(device); @@ -2910,6 +3071,486 @@ nla_put_failure: } /* + * The generic netlink dump callbacks are called outside the genl_lock(), so + * they cannot use the simple attribute parsing code which uses global + * attribute tables. + */ +static struct nlattr *find_cfg_context_attr(const struct nlmsghdr *nlh, int attr) +{ + const unsigned hdrlen = GENL_HDRLEN + GENL_MAGIC_FAMILY_HDRSZ; + const int maxtype = ARRAY_SIZE(drbd_cfg_context_nl_policy) - 1; + struct nlattr *nla; + + nla = nla_find(nlmsg_attrdata(nlh, hdrlen), nlmsg_attrlen(nlh, hdrlen), + DRBD_NLA_CFG_CONTEXT); + if (!nla) + return NULL; + return drbd_nla_find_nested(maxtype, nla, __nla_type(attr)); +} + +static void resource_to_info(struct resource_info *, struct drbd_resource *); + +int drbd_adm_dump_resources(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct drbd_genlmsghdr *dh; + struct drbd_resource *resource; + struct resource_info resource_info; + struct resource_statistics resource_statistics; + int err; + + rcu_read_lock(); + if (cb->args[0]) { + for_each_resource_rcu(resource, &drbd_resources) + if (resource == (struct drbd_resource *)cb->args[0]) + goto found_resource; + err = 0; /* resource was probably deleted */ + goto out; + } + resource = list_entry(&drbd_resources, + struct drbd_resource, resources); + +found_resource: + list_for_each_entry_continue_rcu(resource, &drbd_resources, resources) { + goto put_result; + } + err = 0; + goto out; + +put_result: + dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &drbd_genl_family, + NLM_F_MULTI, DRBD_ADM_GET_RESOURCES); + err = -ENOMEM; + if (!dh) + goto out; + dh->minor = -1U; + dh->ret_code = NO_ERROR; + err = nla_put_drbd_cfg_context(skb, resource, NULL, NULL); + if (err) + goto out; + err = res_opts_to_skb(skb, &resource->res_opts, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + resource_to_info(&resource_info, resource); + err = resource_info_to_skb(skb, &resource_info, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + resource_statistics.res_stat_write_ordering = resource->write_ordering; + err = resource_statistics_to_skb(skb, &resource_statistics, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + cb->args[0] = (long)resource; + genlmsg_end(skb, dh); + err = 0; + +out: + rcu_read_unlock(); + if (err) + return err; + return skb->len; +} + +static void device_to_statistics(struct device_statistics *s, + struct drbd_device *device) +{ + memset(s, 0, sizeof(*s)); + s->dev_upper_blocked = !may_inc_ap_bio(device); + if (get_ldev(device)) { + struct drbd_md *md = &device->ldev->md; + u64 *history_uuids = 
(u64 *)s->history_uuids; + struct request_queue *q; + int n; + + spin_lock_irq(&md->uuid_lock); + s->dev_current_uuid = md->uuid[UI_CURRENT]; + BUILD_BUG_ON(sizeof(s->history_uuids) < UI_HISTORY_END - UI_HISTORY_START + 1); + for (n = 0; n < UI_HISTORY_END - UI_HISTORY_START + 1; n++) + history_uuids[n] = md->uuid[UI_HISTORY_START + n]; + for (; n < HISTORY_UUIDS; n++) + history_uuids[n] = 0; + s->history_uuids_len = HISTORY_UUIDS; + spin_unlock_irq(&md->uuid_lock); + + s->dev_disk_flags = md->flags; + q = bdev_get_queue(device->ldev->backing_bdev); + s->dev_lower_blocked = + bdi_congested(&q->backing_dev_info, + (1 << WB_async_congested) | + (1 << WB_sync_congested)); + put_ldev(device); + } + s->dev_size = drbd_get_capacity(device->this_bdev); + s->dev_read = device->read_cnt; + s->dev_write = device->writ_cnt; + s->dev_al_writes = device->al_writ_cnt; + s->dev_bm_writes = device->bm_writ_cnt; + s->dev_upper_pending = atomic_read(&device->ap_bio_cnt); + s->dev_lower_pending = atomic_read(&device->local_cnt); + s->dev_al_suspended = test_bit(AL_SUSPENDED, &device->flags); + s->dev_exposed_data_uuid = device->ed_uuid; +} + +static int put_resource_in_arg0(struct netlink_callback *cb, int holder_nr) +{ + if (cb->args[0]) { + struct drbd_resource *resource = + (struct drbd_resource *)cb->args[0]; + kref_put(&resource->kref, drbd_destroy_resource); + } + + return 0; +} + +int drbd_adm_dump_devices_done(struct netlink_callback *cb) { + return put_resource_in_arg0(cb, 7); +} + +static void device_to_info(struct device_info *, struct drbd_device *); + +int drbd_adm_dump_devices(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *resource_filter; + struct drbd_resource *resource; + struct drbd_device *uninitialized_var(device); + int minor, err, retcode; + struct drbd_genlmsghdr *dh; + struct device_info device_info; + struct device_statistics device_statistics; + struct idr *idr_to_search; + + resource = (struct drbd_resource *)cb->args[0]; + if (!cb->args[0] && !cb->args[1]) { + resource_filter = find_cfg_context_attr(cb->nlh, T_ctx_resource_name); + if (resource_filter) { + retcode = ERR_RES_NOT_KNOWN; + resource = drbd_find_resource(nla_data(resource_filter)); + if (!resource) + goto put_result; + cb->args[0] = (long)resource; + } + } + + rcu_read_lock(); + minor = cb->args[1]; + idr_to_search = resource ? 
&resource->devices : &drbd_devices; + device = idr_get_next(idr_to_search, &minor); + if (!device) { + err = 0; + goto out; + } + idr_for_each_entry_continue(idr_to_search, device, minor) { + retcode = NO_ERROR; + goto put_result; /* only one iteration */ + } + err = 0; + goto out; /* no more devices */ + +put_result: + dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &drbd_genl_family, + NLM_F_MULTI, DRBD_ADM_GET_DEVICES); + err = -ENOMEM; + if (!dh) + goto out; + dh->ret_code = retcode; + dh->minor = -1U; + if (retcode == NO_ERROR) { + dh->minor = device->minor; + err = nla_put_drbd_cfg_context(skb, device->resource, NULL, device); + if (err) + goto out; + if (get_ldev(device)) { + struct disk_conf *disk_conf = + rcu_dereference(device->ldev->disk_conf); + + err = disk_conf_to_skb(skb, disk_conf, !capable(CAP_SYS_ADMIN)); + put_ldev(device); + if (err) + goto out; + } + device_to_info(&device_info, device); + err = device_info_to_skb(skb, &device_info, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + + device_to_statistics(&device_statistics, device); + err = device_statistics_to_skb(skb, &device_statistics, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + cb->args[1] = minor + 1; + } + genlmsg_end(skb, dh); + err = 0; + +out: + rcu_read_unlock(); + if (err) + return err; + return skb->len; +} + +int drbd_adm_dump_connections_done(struct netlink_callback *cb) +{ + return put_resource_in_arg0(cb, 6); +} + +enum { SINGLE_RESOURCE, ITERATE_RESOURCES }; + +int drbd_adm_dump_connections(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *resource_filter; + struct drbd_resource *resource = NULL, *next_resource; + struct drbd_connection *uninitialized_var(connection); + int err = 0, retcode; + struct drbd_genlmsghdr *dh; + struct connection_info connection_info; + struct connection_statistics connection_statistics; + + rcu_read_lock(); + resource = (struct drbd_resource *)cb->args[0]; + if (!cb->args[0]) { + resource_filter = find_cfg_context_attr(cb->nlh, T_ctx_resource_name); + if (resource_filter) { + retcode = ERR_RES_NOT_KNOWN; + resource = drbd_find_resource(nla_data(resource_filter)); + if (!resource) + goto put_result; + cb->args[0] = (long)resource; + cb->args[1] = SINGLE_RESOURCE; + } + } + if (!resource) { + if (list_empty(&drbd_resources)) + goto out; + resource = list_first_entry(&drbd_resources, struct drbd_resource, resources); + kref_get(&resource->kref); + cb->args[0] = (long)resource; + cb->args[1] = ITERATE_RESOURCES; + } + + next_resource: + rcu_read_unlock(); + mutex_lock(&resource->conf_update); + rcu_read_lock(); + if (cb->args[2]) { + for_each_connection_rcu(connection, resource) + if (connection == (struct drbd_connection *)cb->args[2]) + goto found_connection; + /* connection was probably deleted */ + goto no_more_connections; + } + connection = list_entry(&resource->connections, struct drbd_connection, connections); + +found_connection: + list_for_each_entry_continue_rcu(connection, &resource->connections, connections) { + if (!has_net_conf(connection)) + continue; + retcode = NO_ERROR; + goto put_result; /* only one iteration */ + } + +no_more_connections: + if (cb->args[1] == ITERATE_RESOURCES) { + for_each_resource_rcu(next_resource, &drbd_resources) { + if (next_resource == resource) + goto found_resource; + } + /* resource was probably deleted */ + } + goto out; + +found_resource: + list_for_each_entry_continue_rcu(next_resource, &drbd_resources, resources) { + mutex_unlock(&resource->conf_update); + 
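All of the new dump callbacks follow the same generic-netlink protocol: a dump is a sequence of callback invocations, the cursor between them lives in cb->args[], a positive return value (skb->len) asks to be called again, and 0 terminates the dump. DRBD emits one object per invocation, which keeps each message small. A skeleton of that shape; lookup_from() and emit_one() are hypothetical helpers over a flat object table:

    #include <net/netlink.h>

    struct obj { long id; };

    static struct obj *lookup_from(long id)                 { (void)id; return NULL; }
    static int emit_one(struct sk_buff *skb, struct obj *o) { (void)skb; (void)o; return 0; }

    static int my_dump(struct sk_buff *skb, struct netlink_callback *cb)
    {
        /* cb->args[] persists across invocations of the same dump. */
        struct obj *obj = lookup_from(cb->args[1]);

        if (!obj)
            return 0;                   /* dump finished */
        if (emit_one(skb, obj) == 0)
            cb->args[1] = obj->id + 1;  /* advance the cursor only on success */
        return skb->len;                /* netlink core calls us again */
    }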
kref_put(&resource->kref, drbd_destroy_resource); + resource = next_resource; + kref_get(&resource->kref); + cb->args[0] = (long)resource; + cb->args[2] = 0; + goto next_resource; + } + goto out; /* no more resources */ + +put_result: + dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &drbd_genl_family, + NLM_F_MULTI, DRBD_ADM_GET_CONNECTIONS); + err = -ENOMEM; + if (!dh) + goto out; + dh->ret_code = retcode; + dh->minor = -1U; + if (retcode == NO_ERROR) { + struct net_conf *net_conf; + + err = nla_put_drbd_cfg_context(skb, resource, connection, NULL); + if (err) + goto out; + net_conf = rcu_dereference(connection->net_conf); + if (net_conf) { + err = net_conf_to_skb(skb, net_conf, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + } + connection_to_info(&connection_info, connection); + err = connection_info_to_skb(skb, &connection_info, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + connection_statistics.conn_congested = test_bit(NET_CONGESTED, &connection->flags); + err = connection_statistics_to_skb(skb, &connection_statistics, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + cb->args[2] = (long)connection; + } + genlmsg_end(skb, dh); + err = 0; + +out: + rcu_read_unlock(); + if (resource) + mutex_unlock(&resource->conf_update); + if (err) + return err; + return skb->len; +} + +enum mdf_peer_flag { + MDF_PEER_CONNECTED = 1 << 0, + MDF_PEER_OUTDATED = 1 << 1, + MDF_PEER_FENCING = 1 << 2, + MDF_PEER_FULL_SYNC = 1 << 3, +}; + +static void peer_device_to_statistics(struct peer_device_statistics *s, + struct drbd_peer_device *peer_device) +{ + struct drbd_device *device = peer_device->device; + + memset(s, 0, sizeof(*s)); + s->peer_dev_received = device->recv_cnt; + s->peer_dev_sent = device->send_cnt; + s->peer_dev_pending = atomic_read(&device->ap_pending_cnt) + + atomic_read(&device->rs_pending_cnt); + s->peer_dev_unacked = atomic_read(&device->unacked_cnt); + s->peer_dev_out_of_sync = drbd_bm_total_weight(device) << (BM_BLOCK_SHIFT - 9); + s->peer_dev_resync_failed = device->rs_failed << (BM_BLOCK_SHIFT - 9); + if (get_ldev(device)) { + struct drbd_md *md = &device->ldev->md; + + spin_lock_irq(&md->uuid_lock); + s->peer_dev_bitmap_uuid = md->uuid[UI_BITMAP]; + spin_unlock_irq(&md->uuid_lock); + s->peer_dev_flags = + (drbd_md_test_flag(device->ldev, MDF_CONNECTED_IND) ? + MDF_PEER_CONNECTED : 0) + + (drbd_md_test_flag(device->ldev, MDF_CONSISTENT) && + !drbd_md_test_flag(device->ldev, MDF_WAS_UP_TO_DATE) ? + MDF_PEER_OUTDATED : 0) + + /* FIXME: MDF_PEER_FENCING? */ + (drbd_md_test_flag(device->ldev, MDF_FULL_SYNC) ? + MDF_PEER_FULL_SYNC : 0); + put_ldev(device); + } +} + +int drbd_adm_dump_peer_devices_done(struct netlink_callback *cb) +{ + return put_resource_in_arg0(cb, 9); +} + +int drbd_adm_dump_peer_devices(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct nlattr *resource_filter; + struct drbd_resource *resource; + struct drbd_device *uninitialized_var(device); + struct drbd_peer_device *peer_device = NULL; + int minor, err, retcode; + struct drbd_genlmsghdr *dh; + struct idr *idr_to_search; + + resource = (struct drbd_resource *)cb->args[0]; + if (!cb->args[0] && !cb->args[1]) { + resource_filter = find_cfg_context_attr(cb->nlh, T_ctx_resource_name); + if (resource_filter) { + retcode = ERR_RES_NOT_KNOWN; + resource = drbd_find_resource(nla_data(resource_filter)); + if (!resource) + goto put_result; + } + cb->args[0] = (long)resource; + } + + rcu_read_lock(); + minor = cb->args[1]; + idr_to_search = resource ? 
&resource->devices : &drbd_devices; + device = idr_find(idr_to_search, minor); + if (!device) { +next_device: + minor++; + cb->args[2] = 0; + device = idr_get_next(idr_to_search, &minor); + if (!device) { + err = 0; + goto out; + } + } + if (cb->args[2]) { + for_each_peer_device(peer_device, device) + if (peer_device == (struct drbd_peer_device *)cb->args[2]) + goto found_peer_device; + /* peer device was probably deleted */ + goto next_device; + } + /* Make peer_device point to the list head (not the first entry). */ + peer_device = list_entry(&device->peer_devices, struct drbd_peer_device, peer_devices); + +found_peer_device: + list_for_each_entry_continue_rcu(peer_device, &device->peer_devices, peer_devices) { + if (!has_net_conf(peer_device->connection)) + continue; + retcode = NO_ERROR; + goto put_result; /* only one iteration */ + } + goto next_device; + +put_result: + dh = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, &drbd_genl_family, + NLM_F_MULTI, DRBD_ADM_GET_PEER_DEVICES); + err = -ENOMEM; + if (!dh) + goto out; + dh->ret_code = retcode; + dh->minor = -1U; + if (retcode == NO_ERROR) { + struct peer_device_info peer_device_info; + struct peer_device_statistics peer_device_statistics; + + dh->minor = minor; + err = nla_put_drbd_cfg_context(skb, device->resource, peer_device->connection, device); + if (err) + goto out; + peer_device_to_info(&peer_device_info, peer_device); + err = peer_device_info_to_skb(skb, &peer_device_info, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + peer_device_to_statistics(&peer_device_statistics, peer_device); + err = peer_device_statistics_to_skb(skb, &peer_device_statistics, !capable(CAP_SYS_ADMIN)); + if (err) + goto out; + cb->args[1] = minor; + cb->args[2] = (long)peer_device; + } + genlmsg_end(skb, dh); + err = 0; + +out: + rcu_read_unlock(); + if (err) + return err; + return skb->len; +} +/* * Return the connection of @resource if @resource has exactly one connection. 
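One detail above is easy to misread: peer_device is deliberately aimed at the list head itself, which is not a valid entry, because list_for_each_entry_continue_rcu() starts at the element after the cursor. Aiming at the head therefore starts the walk at the first real entry. The idiom on its own:

    #include <linux/list.h>

    struct item {
        struct list_head node;
        int value;
    };

    static int sum(struct list_head *items)
    {
        /* The cursor points at the head; never dereference it as an
         * item, only use it as the "continue after this" position. */
        struct item *it = list_entry(items, struct item, node);
        int s = 0;

        list_for_each_entry_continue(it, items, node)
            s += it->value;     /* first iteration sees the first element */
        return s;
    }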
*/ static struct drbd_connection *the_only_connection(struct drbd_resource *resource) @@ -3414,8 +4055,18 @@ drbd_check_resource_name(struct drbd_config_context *adm_ctx) return NO_ERROR; } +static void resource_to_info(struct resource_info *info, + struct drbd_resource *resource) +{ + info->res_role = conn_highest_role(first_connection(resource)); + info->res_susp = resource->susp; + info->res_susp_nod = resource->susp_nod; + info->res_susp_fen = resource->susp_fen; +} + int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) { + struct drbd_connection *connection; struct drbd_config_context adm_ctx; enum drbd_ret_code retcode; struct res_opts res_opts; @@ -3449,13 +4100,33 @@ int drbd_adm_new_resource(struct sk_buff *skb, struct genl_info *info) } /* not yet safe for genl_family.parallel_ops */ - if (!conn_create(adm_ctx.resource_name, &res_opts)) + mutex_lock(&resources_mutex); + connection = conn_create(adm_ctx.resource_name, &res_opts); + mutex_unlock(&resources_mutex); + + if (connection) { + struct resource_info resource_info; + + mutex_lock(¬ification_mutex); + resource_to_info(&resource_info, connection->resource); + notify_resource_state(NULL, 0, connection->resource, + &resource_info, NOTIFY_CREATE); + mutex_unlock(¬ification_mutex); + } else retcode = ERR_NOMEM; + out: drbd_adm_finish(&adm_ctx, info, retcode); return 0; } +static void device_to_info(struct device_info *info, + struct drbd_device *device) +{ + info->dev_disk_state = device->state.disk; +} + + int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) { struct drbd_config_context adm_ctx; @@ -3490,6 +4161,36 @@ int drbd_adm_new_minor(struct sk_buff *skb, struct genl_info *info) mutex_lock(&adm_ctx.resource->adm_mutex); retcode = drbd_create_device(&adm_ctx, dh->minor); + if (retcode == NO_ERROR) { + struct drbd_device *device; + struct drbd_peer_device *peer_device; + struct device_info info; + unsigned int peer_devices = 0; + enum drbd_notification_type flags; + + device = minor_to_device(dh->minor); + for_each_peer_device(peer_device, device) { + if (!has_net_conf(peer_device->connection)) + continue; + peer_devices++; + } + + device_to_info(&info, device); + mutex_lock(¬ification_mutex); + flags = (peer_devices--) ? NOTIFY_CONTINUES : 0; + notify_device_state(NULL, 0, device, &info, NOTIFY_CREATE | flags); + for_each_peer_device(peer_device, device) { + struct peer_device_info peer_device_info; + + if (!has_net_conf(peer_device->connection)) + continue; + peer_device_to_info(&peer_device_info, peer_device); + flags = (peer_devices--) ? NOTIFY_CONTINUES : 0; + notify_peer_device_state(NULL, 0, peer_device, &peer_device_info, + NOTIFY_CREATE | flags); + } + mutex_unlock(¬ification_mutex); + } mutex_unlock(&adm_ctx.resource->adm_mutex); out: drbd_adm_finish(&adm_ctx, info, retcode); @@ -3498,13 +4199,35 @@ out: static enum drbd_ret_code adm_del_minor(struct drbd_device *device) { + struct drbd_peer_device *peer_device; + if (device->state.disk == D_DISKLESS && /* no need to be device->state.conn == C_STANDALONE && * we may want to delete a minor from a live replication group. */ device->state.role == R_SECONDARY) { + struct drbd_connection *connection = + first_connection(device->resource); + _drbd_request_state(device, NS(conn, C_WF_REPORT_PARAMS), CS_VERBOSE + CS_WAIT_COMPLETE); + + /* If the state engine hasn't stopped the sender thread yet, we + * need to flush the sender work queue before generating the + * DESTROY events here. 
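Both drbd_adm_connect() earlier and drbd_adm_new_minor() above chain their NOTIFY_CREATE events with the same countdown idiom: every event except the last carries NOTIFY_CONTINUES, so a listener can treat the whole group as one consistent update. A self-contained sketch; the flag values are illustrative and emit() stands in for the notify_*_state() calls:

    #include <stdio.h>

    enum { NOTIFY_CREATE = 0x1, NOTIFY_CONTINUES = 0x8 };  /* illustrative values */

    static void emit(const char *what, unsigned int flags)
    {
        printf("%s%s\n", what, (flags & NOTIFY_CONTINUES) ? " (continues)" : "");
    }

    /* One header event followed by 'count' member events. */
    static void emit_group(const char *members[], unsigned int count)
    {
        unsigned int remaining = count;
        unsigned int flags;

        /* Post-decrement: the test sees the old value, so the header
         * continues iff at least one member event follows. */
        flags = remaining-- ? NOTIFY_CONTINUES : 0;
        emit("connection", NOTIFY_CREATE | flags);

        for (unsigned int i = 0; i < count; i++) {
            flags = remaining-- ? NOTIFY_CONTINUES : 0;  /* last member gets 0 */
            emit(members[i], NOTIFY_CREATE | flags);
        }
    }

With two members this emits three events where only the last one lacks the continuation flag, which is exactly the shape the patch produces for a connection and its peer devices.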
*/ + if (get_t_state(&connection->worker) == RUNNING) + drbd_flush_workqueue(&connection->sender_work); + + mutex_lock(¬ification_mutex); + for_each_peer_device(peer_device, device) { + if (!has_net_conf(peer_device->connection)) + continue; + notify_peer_device_state(NULL, 0, peer_device, NULL, + NOTIFY_DESTROY | NOTIFY_CONTINUES); + } + notify_device_state(NULL, 0, device, NULL, NOTIFY_DESTROY); + mutex_unlock(¬ification_mutex); + drbd_delete_device(device); return NO_ERROR; } else @@ -3541,7 +4264,16 @@ static int adm_del_resource(struct drbd_resource *resource) if (!idr_is_empty(&resource->devices)) return ERR_RES_IN_USE; + /* The state engine has stopped the sender thread, so we don't + * need to flush the sender work queue before generating the + * DESTROY event here. */ + mutex_lock(¬ification_mutex); + notify_resource_state(NULL, 0, resource, NULL, NOTIFY_DESTROY); + mutex_unlock(¬ification_mutex); + + mutex_lock(&resources_mutex); list_del_rcu(&resource->resources); + mutex_unlock(&resources_mutex); /* Make sure all threads have actually stopped: state handling only * does drbd_thread_stop_nowait(). */ list_for_each_entry(connection, &resource->connections, connections) @@ -3637,7 +4369,6 @@ finish: void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib) { - static atomic_t drbd_genl_seq = ATOMIC_INIT(2); /* two. */ struct sk_buff *msg; struct drbd_genlmsghdr *d_out; unsigned seq; @@ -3658,7 +4389,7 @@ void drbd_bcast_event(struct drbd_device *device, const struct sib_info *sib) if (nla_put_status_info(msg, device, sib)) goto nla_put_failure; genlmsg_end(msg, d_out); - err = drbd_genl_multicast_events(msg, 0); + err = drbd_genl_multicast_events(msg, GFP_NOWAIT); /* msg has been consumed or freed in netlink_broadcast() */ if (err && err != -ESRCH) goto failed; @@ -3672,3 +4403,405 @@ failed: "Event seq:%u sib_reason:%u\n", err, seq, sib->sib_reason); } + +static int nla_put_notification_header(struct sk_buff *msg, + enum drbd_notification_type type) +{ + struct drbd_notification_header nh = { + .nh_type = type, + }; + + return drbd_notification_header_to_skb(msg, &nh, true); +} + +void notify_resource_state(struct sk_buff *skb, + unsigned int seq, + struct drbd_resource *resource, + struct resource_info *resource_info, + enum drbd_notification_type type) +{ + struct resource_statistics resource_statistics; + struct drbd_genlmsghdr *dh; + bool multicast = false; + int err; + + if (!skb) { + seq = atomic_inc_return(¬ify_genl_seq); + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO); + err = -ENOMEM; + if (!skb) + goto failed; + multicast = true; + } + + err = -EMSGSIZE; + dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_RESOURCE_STATE); + if (!dh) + goto nla_put_failure; + dh->minor = -1U; + dh->ret_code = NO_ERROR; + if (nla_put_drbd_cfg_context(skb, resource, NULL, NULL) || + nla_put_notification_header(skb, type) || + ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY && + resource_info_to_skb(skb, resource_info, true))) + goto nla_put_failure; + resource_statistics.res_stat_write_ordering = resource->write_ordering; + err = resource_statistics_to_skb(skb, &resource_statistics, !capable(CAP_SYS_ADMIN)); + if (err) + goto nla_put_failure; + genlmsg_end(skb, dh); + if (multicast) { + err = drbd_genl_multicast_events(skb, GFP_NOWAIT); + /* skb has been consumed or freed in netlink_broadcast() */ + if (err && err != -ESRCH) + goto failed; + } + return; + +nla_put_failure: + nlmsg_free(skb); +failed: + drbd_err(resource, "Error %d while broadcasting event. 
Event seq:%u\n", + err, seq); +} + +void notify_device_state(struct sk_buff *skb, + unsigned int seq, + struct drbd_device *device, + struct device_info *device_info, + enum drbd_notification_type type) +{ + struct device_statistics device_statistics; + struct drbd_genlmsghdr *dh; + bool multicast = false; + int err; + + if (!skb) { + seq = atomic_inc_return(¬ify_genl_seq); + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO); + err = -ENOMEM; + if (!skb) + goto failed; + multicast = true; + } + + err = -EMSGSIZE; + dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_DEVICE_STATE); + if (!dh) + goto nla_put_failure; + dh->minor = device->minor; + dh->ret_code = NO_ERROR; + if (nla_put_drbd_cfg_context(skb, device->resource, NULL, device) || + nla_put_notification_header(skb, type) || + ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY && + device_info_to_skb(skb, device_info, true))) + goto nla_put_failure; + device_to_statistics(&device_statistics, device); + device_statistics_to_skb(skb, &device_statistics, !capable(CAP_SYS_ADMIN)); + genlmsg_end(skb, dh); + if (multicast) { + err = drbd_genl_multicast_events(skb, GFP_NOWAIT); + /* skb has been consumed or freed in netlink_broadcast() */ + if (err && err != -ESRCH) + goto failed; + } + return; + +nla_put_failure: + nlmsg_free(skb); +failed: + drbd_err(device, "Error %d while broadcasting event. Event seq:%u\n", + err, seq); +} + +void notify_connection_state(struct sk_buff *skb, + unsigned int seq, + struct drbd_connection *connection, + struct connection_info *connection_info, + enum drbd_notification_type type) +{ + struct connection_statistics connection_statistics; + struct drbd_genlmsghdr *dh; + bool multicast = false; + int err; + + if (!skb) { + seq = atomic_inc_return(¬ify_genl_seq); + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO); + err = -ENOMEM; + if (!skb) + goto failed; + multicast = true; + } + + err = -EMSGSIZE; + dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_CONNECTION_STATE); + if (!dh) + goto nla_put_failure; + dh->minor = -1U; + dh->ret_code = NO_ERROR; + if (nla_put_drbd_cfg_context(skb, connection->resource, connection, NULL) || + nla_put_notification_header(skb, type) || + ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY && + connection_info_to_skb(skb, connection_info, true))) + goto nla_put_failure; + connection_statistics.conn_congested = test_bit(NET_CONGESTED, &connection->flags); + connection_statistics_to_skb(skb, &connection_statistics, !capable(CAP_SYS_ADMIN)); + genlmsg_end(skb, dh); + if (multicast) { + err = drbd_genl_multicast_events(skb, GFP_NOWAIT); + /* skb has been consumed or freed in netlink_broadcast() */ + if (err && err != -ESRCH) + goto failed; + } + return; + +nla_put_failure: + nlmsg_free(skb); +failed: + drbd_err(connection, "Error %d while broadcasting event. 
Event seq:%u\n", + err, seq); +} + +void notify_peer_device_state(struct sk_buff *skb, + unsigned int seq, + struct drbd_peer_device *peer_device, + struct peer_device_info *peer_device_info, + enum drbd_notification_type type) +{ + struct peer_device_statistics peer_device_statistics; + struct drbd_resource *resource = peer_device->device->resource; + struct drbd_genlmsghdr *dh; + bool multicast = false; + int err; + + if (!skb) { + seq = atomic_inc_return(¬ify_genl_seq); + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO); + err = -ENOMEM; + if (!skb) + goto failed; + multicast = true; + } + + err = -EMSGSIZE; + dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_PEER_DEVICE_STATE); + if (!dh) + goto nla_put_failure; + dh->minor = -1U; + dh->ret_code = NO_ERROR; + if (nla_put_drbd_cfg_context(skb, resource, peer_device->connection, peer_device->device) || + nla_put_notification_header(skb, type) || + ((type & ~NOTIFY_FLAGS) != NOTIFY_DESTROY && + peer_device_info_to_skb(skb, peer_device_info, true))) + goto nla_put_failure; + peer_device_to_statistics(&peer_device_statistics, peer_device); + peer_device_statistics_to_skb(skb, &peer_device_statistics, !capable(CAP_SYS_ADMIN)); + genlmsg_end(skb, dh); + if (multicast) { + err = drbd_genl_multicast_events(skb, GFP_NOWAIT); + /* skb has been consumed or freed in netlink_broadcast() */ + if (err && err != -ESRCH) + goto failed; + } + return; + +nla_put_failure: + nlmsg_free(skb); +failed: + drbd_err(peer_device, "Error %d while broadcasting event. Event seq:%u\n", + err, seq); +} + +void notify_helper(enum drbd_notification_type type, + struct drbd_device *device, struct drbd_connection *connection, + const char *name, int status) +{ + struct drbd_resource *resource = device ? device->resource : connection->resource; + struct drbd_helper_info helper_info; + unsigned int seq = atomic_inc_return(¬ify_genl_seq); + struct sk_buff *skb = NULL; + struct drbd_genlmsghdr *dh; + int err; + + strlcpy(helper_info.helper_name, name, sizeof(helper_info.helper_name)); + helper_info.helper_name_len = min(strlen(name), sizeof(helper_info.helper_name)); + helper_info.helper_status = status; + + skb = genlmsg_new(NLMSG_GOODSIZE, GFP_NOIO); + err = -ENOMEM; + if (!skb) + goto fail; + + err = -EMSGSIZE; + dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_HELPER); + if (!dh) + goto fail; + dh->minor = device ? device->minor : -1; + dh->ret_code = NO_ERROR; + mutex_lock(¬ification_mutex); + if (nla_put_drbd_cfg_context(skb, resource, connection, device) || + nla_put_notification_header(skb, type) || + drbd_helper_info_to_skb(skb, &helper_info, true)) + goto unlock_fail; + genlmsg_end(skb, dh); + err = drbd_genl_multicast_events(skb, GFP_NOWAIT); + skb = NULL; + /* skb has been consumed or freed in netlink_broadcast() */ + if (err && err != -ESRCH) + goto unlock_fail; + mutex_unlock(¬ification_mutex); + return; + +unlock_fail: + mutex_unlock(¬ification_mutex); +fail: + nlmsg_free(skb); + drbd_err(resource, "Error %d while broadcasting event. Event seq:%u\n", + err, seq); +} + +static void notify_initial_state_done(struct sk_buff *skb, unsigned int seq) +{ + struct drbd_genlmsghdr *dh; + int err; + + err = -EMSGSIZE; + dh = genlmsg_put(skb, 0, seq, &drbd_genl_family, 0, DRBD_INITIAL_STATE_DONE); + if (!dh) + goto nla_put_failure; + dh->minor = -1U; + dh->ret_code = NO_ERROR; + if (nla_put_notification_header(skb, NOTIFY_EXISTS)) + goto nla_put_failure; + genlmsg_end(skb, dh); + return; + +nla_put_failure: + nlmsg_free(skb); + pr_err("Error %d sending event. 
Event seq:%u\n", err, seq); +} + +static void free_state_changes(struct list_head *list) +{ + while (!list_empty(list)) { + struct drbd_state_change *state_change = + list_first_entry(list, struct drbd_state_change, list); + list_del(&state_change->list); + forget_state_change(state_change); + } +} + +static unsigned int notifications_for_state_change(struct drbd_state_change *state_change) +{ + return 1 + + state_change->n_connections + + state_change->n_devices + + state_change->n_devices * state_change->n_connections; +} + +static int get_initial_state(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct drbd_state_change *state_change = (struct drbd_state_change *)cb->args[0]; + unsigned int seq = cb->args[2]; + unsigned int n; + enum drbd_notification_type flags = 0; + + /* There is no need for taking notification_mutex here: it doesn't + matter if the initial state events mix with later state chage + events; we can always tell the events apart by the NOTIFY_EXISTS + flag. */ + + cb->args[5]--; + if (cb->args[5] == 1) { + notify_initial_state_done(skb, seq); + goto out; + } + n = cb->args[4]++; + if (cb->args[4] < cb->args[3]) + flags |= NOTIFY_CONTINUES; + if (n < 1) { + notify_resource_state_change(skb, seq, state_change->resource, + NOTIFY_EXISTS | flags); + goto next; + } + n--; + if (n < state_change->n_connections) { + notify_connection_state_change(skb, seq, &state_change->connections[n], + NOTIFY_EXISTS | flags); + goto next; + } + n -= state_change->n_connections; + if (n < state_change->n_devices) { + notify_device_state_change(skb, seq, &state_change->devices[n], + NOTIFY_EXISTS | flags); + goto next; + } + n -= state_change->n_devices; + if (n < state_change->n_devices * state_change->n_connections) { + notify_peer_device_state_change(skb, seq, &state_change->peer_devices[n], + NOTIFY_EXISTS | flags); + goto next; + } + +next: + if (cb->args[4] == cb->args[3]) { + struct drbd_state_change *next_state_change = + list_entry(state_change->list.next, + struct drbd_state_change, list); + cb->args[0] = (long)next_state_change; + cb->args[3] = notifications_for_state_change(next_state_change); + cb->args[4] = 0; + } +out: + return skb->len; +} + +int drbd_adm_get_initial_state(struct sk_buff *skb, struct netlink_callback *cb) +{ + struct drbd_resource *resource; + LIST_HEAD(head); + + if (cb->args[5] >= 1) { + if (cb->args[5] > 1) + return get_initial_state(skb, cb); + if (cb->args[0]) { + struct drbd_state_change *state_change = + (struct drbd_state_change *)cb->args[0]; + + /* connect list to head */ + list_add(&head, &state_change->list); + free_state_changes(&head); + } + return 0; + } + + cb->args[5] = 2; /* number of iterations */ + mutex_lock(&resources_mutex); + for_each_resource(resource, &drbd_resources) { + struct drbd_state_change *state_change; + + state_change = remember_old_state(resource, GFP_KERNEL); + if (!state_change) { + if (!list_empty(&head)) + free_state_changes(&head); + mutex_unlock(&resources_mutex); + return -ENOMEM; + } + copy_old_to_new_state_change(state_change); + list_add_tail(&state_change->list, &head); + cb->args[5] += notifications_for_state_change(state_change); + } + mutex_unlock(&resources_mutex); + + if (!list_empty(&head)) { + struct drbd_state_change *state_change = + list_entry(head.next, struct drbd_state_change, list); + cb->args[0] = (long)state_change; + cb->args[3] = notifications_for_state_change(state_change); + list_del(&head); /* detach list from head */ + } + + cb->args[2] = cb->nlh->nlmsg_seq; + return 
get_initial_state(skb, cb); +} diff --git a/drivers/block/drbd/drbd_proc.c b/drivers/block/drbd/drbd_proc.c index 3b10fa6cb039..6537b25db9c1 100644 --- a/drivers/block/drbd/drbd_proc.c +++ b/drivers/block/drbd/drbd_proc.c @@ -245,9 +245,9 @@ static int drbd_seq_show(struct seq_file *seq, void *v) char wp; static char write_ordering_chars[] = { - [WO_none] = 'n', - [WO_drain_io] = 'd', - [WO_bdev_flush] = 'f', + [WO_NONE] = 'n', + [WO_DRAIN_IO] = 'd', + [WO_BDEV_FLUSH] = 'f', }; seq_printf(seq, "version: " REL_VERSION " (api:%d/proto:%d-%d)\n%s\n", diff --git a/drivers/block/drbd/drbd_protocol.h b/drivers/block/drbd/drbd_protocol.h index 2da9104a3851..ef9245363dcc 100644 --- a/drivers/block/drbd/drbd_protocol.h +++ b/drivers/block/drbd/drbd_protocol.h @@ -23,7 +23,7 @@ enum drbd_packet { P_AUTH_RESPONSE = 0x11, P_STATE_CHG_REQ = 0x12, - /* asender (meta socket */ + /* (meta socket) */ P_PING = 0x13, P_PING_ACK = 0x14, P_RECV_ACK = 0x15, /* Used in protocol B */ diff --git a/drivers/block/drbd/drbd_receiver.c b/drivers/block/drbd/drbd_receiver.c index b4b5680ac6ad..1957fe8601dc 100644 --- a/drivers/block/drbd/drbd_receiver.c +++ b/drivers/block/drbd/drbd_receiver.c @@ -215,7 +215,7 @@ static void reclaim_finished_net_peer_reqs(struct drbd_device *device, } } -static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device) +static void drbd_reclaim_net_peer_reqs(struct drbd_device *device) { LIST_HEAD(reclaimed); struct drbd_peer_request *peer_req, *t; @@ -223,11 +223,30 @@ static void drbd_kick_lo_and_reclaim_net(struct drbd_device *device) spin_lock_irq(&device->resource->req_lock); reclaim_finished_net_peer_reqs(device, &reclaimed); spin_unlock_irq(&device->resource->req_lock); - list_for_each_entry_safe(peer_req, t, &reclaimed, w.list) drbd_free_net_peer_req(device, peer_req); } +static void conn_reclaim_net_peer_reqs(struct drbd_connection *connection) +{ + struct drbd_peer_device *peer_device; + int vnr; + + rcu_read_lock(); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { + struct drbd_device *device = peer_device->device; + if (!atomic_read(&device->pp_in_use_by_net)) + continue; + + kref_get(&device->kref); + rcu_read_unlock(); + drbd_reclaim_net_peer_reqs(device); + kref_put(&device->kref, drbd_destroy_device); + rcu_read_lock(); + } + rcu_read_unlock(); +} + /** * drbd_alloc_pages() - Returns @number pages, retries forever (or until signalled) * @device: DRBD device. @@ -265,10 +284,15 @@ struct page *drbd_alloc_pages(struct drbd_peer_device *peer_device, unsigned int if (atomic_read(&device->pp_in_use) < mxb) page = __drbd_alloc_pages(device, number); + /* Try to keep the fast path fast, but occasionally we need + * to reclaim the pages we lended to the network stack. 
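conn_reclaim_net_peer_reqs() above uses the standard recipe for calling a possibly-sleeping function on every element of an RCU-protected IDR: pin the current element with a kref, drop the RCU read lock while working, then re-take it and continue the walk. The skeleton, with release() standing in for drbd_destroy_device():

    #include <linux/idr.h>
    #include <linux/kref.h>
    #include <linux/rcupdate.h>

    struct my_device {
        struct kref kref;
    };

    static void release(struct kref *kref)        { (void)kref; /* free container_of(...) */ }
    static void do_work(struct my_device *device) { (void)device; /* may sleep */ }

    static void walk_devices(struct idr *devices)
    {
        struct my_device *device;
        int id;

        rcu_read_lock();
        idr_for_each_entry(devices, device, id) {
            kref_get(&device->kref);    /* keep it alive across the unlock */
            rcu_read_unlock();

            do_work(device);            /* sleeping is fine here */

            kref_put(&device->kref, release);
            rcu_read_lock();            /* resume the RCU-protected walk */
        }
        rcu_read_unlock();
    }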
*/ + if (page && atomic_read(&device->pp_in_use_by_net) > 512) + drbd_reclaim_net_peer_reqs(device); + while (page == NULL) { prepare_to_wait(&drbd_pp_wait, &wait, TASK_INTERRUPTIBLE); - drbd_kick_lo_and_reclaim_net(device); + drbd_reclaim_net_peer_reqs(device); if (atomic_read(&device->pp_in_use) < mxb) { page = __drbd_alloc_pages(device, number); @@ -1099,7 +1123,15 @@ randomize: return 0; } - drbd_thread_start(&connection->asender); + drbd_thread_start(&connection->ack_receiver); + /* opencoded create_singlethread_workqueue(), + * to be able to use format string arguments */ + connection->ack_sender = + alloc_ordered_workqueue("drbd_as_%s", WQ_MEM_RECLAIM, connection->resource->name); + if (!connection->ack_sender) { + drbd_err(connection, "Failed to create workqueue ack_sender\n"); + return 0; + } mutex_lock(&connection->resource->conf_update); /* The discard_my_data flag is a single-shot modifier to the next @@ -1178,7 +1210,7 @@ static void drbd_flush(struct drbd_connection *connection) struct drbd_peer_device *peer_device; int vnr; - if (connection->resource->write_ordering >= WO_bdev_flush) { + if (connection->resource->write_ordering >= WO_BDEV_FLUSH) { rcu_read_lock(); idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { struct drbd_device *device = peer_device->device; @@ -1203,7 +1235,7 @@ static void drbd_flush(struct drbd_connection *connection) /* would rather check on EOPNOTSUPP, but that is not reliable. * don't try again for ANY return value != 0 * if (rv == -EOPNOTSUPP) */ - drbd_bump_write_ordering(connection->resource, NULL, WO_drain_io); + drbd_bump_write_ordering(connection->resource, NULL, WO_DRAIN_IO); } put_ldev(device); kref_put(&device->kref, drbd_destroy_device); @@ -1299,10 +1331,10 @@ max_allowed_wo(struct drbd_backing_dev *bdev, enum write_ordering_e wo) dc = rcu_dereference(bdev->disk_conf); - if (wo == WO_bdev_flush && !dc->disk_flushes) - wo = WO_drain_io; - if (wo == WO_drain_io && !dc->disk_drain) - wo = WO_none; + if (wo == WO_BDEV_FLUSH && !dc->disk_flushes) + wo = WO_DRAIN_IO; + if (wo == WO_DRAIN_IO && !dc->disk_drain) + wo = WO_NONE; return wo; } @@ -1319,13 +1351,13 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin enum write_ordering_e pwo; int vnr; static char *write_ordering_str[] = { - [WO_none] = "none", - [WO_drain_io] = "drain", - [WO_bdev_flush] = "flush", + [WO_NONE] = "none", + [WO_DRAIN_IO] = "drain", + [WO_BDEV_FLUSH] = "flush", }; pwo = resource->write_ordering; - if (wo != WO_bdev_flush) + if (wo != WO_BDEV_FLUSH) wo = min(pwo, wo); rcu_read_lock(); idr_for_each_entry(&resource->devices, device, vnr) { @@ -1343,7 +1375,7 @@ void drbd_bump_write_ordering(struct drbd_resource *resource, struct drbd_backin rcu_read_unlock(); resource->write_ordering = wo; - if (pwo != resource->write_ordering || wo == WO_bdev_flush) + if (pwo != resource->write_ordering || wo == WO_BDEV_FLUSH) drbd_info(resource, "Method to ensure write ordering: %s\n", write_ordering_str[resource->write_ordering]); } @@ -1380,7 +1412,7 @@ int drbd_submit_peer_request(struct drbd_device *device, if (peer_req->flags & EE_IS_TRIM_USE_ZEROOUT) { /* wait for all pending IO completions, before we start * zeroing things out. 
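The renamed write-ordering constants form an ordered scale, and drbd_bump_write_ordering() only ever moves down it: per-disk settings (disk_flushes, disk_drain) or a failed flush may weaken the method from flush to drain to none, while only an explicit WO_BDEV_FLUSH request may raise it again. The policy, condensed into a self-contained sketch:

    /* Mirrors max_allowed_wo() plus the min() step in
     * drbd_bump_write_ordering(); the two ints mirror the
     * disk_flushes / disk_drain options. */
    enum write_ordering_e { WO_NONE, WO_DRAIN_IO, WO_BDEV_FLUSH };

    static enum write_ordering_e
    max_allowed(enum write_ordering_e wo, int flushes, int drain)
    {
        if (wo == WO_BDEV_FLUSH && !flushes)
            wo = WO_DRAIN_IO;
        if (wo == WO_DRAIN_IO && !drain)
            wo = WO_NONE;
        return wo;
    }

    static enum write_ordering_e
    bump(enum write_ordering_e cur, enum write_ordering_e wanted,
         int flushes, int drain)
    {
        if (wanted != WO_BDEV_FLUSH)        /* non-flush requests only lower */
            wanted = cur < wanted ? cur : wanted;
        return max_allowed(wanted, flushes, drain);
    }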
*/ - conn_wait_active_ee_empty(first_peer_device(device)->connection); + conn_wait_active_ee_empty(peer_req->peer_device->connection); /* add it to the active list now, * so we can find it to present it in debugfs */ peer_req->submit_jif = jiffies; @@ -1508,12 +1540,6 @@ static void conn_wait_active_ee_empty(struct drbd_connection *connection) rcu_read_unlock(); } -static struct drbd_peer_device * -conn_peer_device(struct drbd_connection *connection, int volume_number) -{ - return idr_find(&connection->peer_devices, volume_number); -} - static int receive_Barrier(struct drbd_connection *connection, struct packet_info *pi) { int rv; @@ -1533,7 +1559,7 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf * Therefore we must send the barrier_ack after the barrier request was * completed. */ switch (connection->resource->write_ordering) { - case WO_none: + case WO_NONE: if (rv == FE_RECYCLED) return 0; @@ -1546,8 +1572,8 @@ static int receive_Barrier(struct drbd_connection *connection, struct packet_inf drbd_warn(connection, "Allocation of an epoch failed, slowing down\n"); /* Fall through */ - case WO_bdev_flush: - case WO_drain_io: + case WO_BDEV_FLUSH: + case WO_DRAIN_IO: conn_wait_active_ee_empty(connection); drbd_flush(connection); @@ -1752,7 +1778,7 @@ static int recv_dless_read(struct drbd_peer_device *peer_device, struct drbd_req } /* - * e_end_resync_block() is called in asender context via + * e_end_resync_block() is called in ack_sender context via * drbd_finish_peer_reqs(). */ static int e_end_resync_block(struct drbd_work *w, int unused) @@ -1926,7 +1952,7 @@ static void restart_conflicting_writes(struct drbd_device *device, } /* - * e_end_block() is called in asender context via drbd_finish_peer_reqs(). + * e_end_block() is called in ack_sender context via drbd_finish_peer_reqs(). */ static int e_end_block(struct drbd_work *w, int cancel) { @@ -1966,7 +1992,7 @@ static int e_end_block(struct drbd_work *w, int cancel) } else D_ASSERT(device, drbd_interval_empty(&peer_req->i)); - drbd_may_finish_epoch(first_peer_device(device)->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); + drbd_may_finish_epoch(peer_device->connection, peer_req->epoch, EV_PUT + (cancel ? EV_CLEANUP : 0)); return err; } @@ -2098,7 +2124,7 @@ static int wait_for_and_update_peer_seq(struct drbd_peer_device *peer_device, co } rcu_read_lock(); - tp = rcu_dereference(first_peer_device(device)->connection->net_conf)->two_primaries; + tp = rcu_dereference(peer_device->connection->net_conf)->two_primaries; rcu_read_unlock(); if (!tp) @@ -2217,7 +2243,7 @@ static int handle_write_conflicts(struct drbd_device *device, peer_req->w.cb = superseded ? 
e_send_superseded : e_send_retry_write; list_add_tail(&peer_req->w.list, &device->done_ee); - wake_asender(connection); + queue_work(connection->ack_sender, &peer_req->peer_device->send_acks_work); err = -ENOENT; goto out; @@ -2364,7 +2390,7 @@ static int receive_Data(struct drbd_connection *connection, struct packet_info * if (dp_flags & DP_SEND_RECEIVE_ACK) { /* I really don't like it that the receiver thread * sends on the msock, but anyways */ - drbd_send_ack(first_peer_device(device), P_RECV_ACK, peer_req); + drbd_send_ack(peer_device, P_RECV_ACK, peer_req); } if (tp) { @@ -4056,7 +4082,7 @@ static int receive_state(struct drbd_connection *connection, struct packet_info os = ns = drbd_read_state(device); spin_unlock_irq(&device->resource->req_lock); - /* If some other part of the code (asender thread, timeout) + /* If some other part of the code (ack_receiver thread, timeout) * already decided to close the connection again, * we must not "re-establish" it here. */ if (os.conn <= C_TEAR_DOWN) @@ -4661,8 +4687,12 @@ static void conn_disconnect(struct drbd_connection *connection) */ conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD); - /* asender does not clean up anything. it must not interfere, either */ - drbd_thread_stop(&connection->asender); + /* ack_receiver does not clean up anything. it must not interfere, either */ + drbd_thread_stop(&connection->ack_receiver); + if (connection->ack_sender) { + destroy_workqueue(connection->ack_sender); + connection->ack_sender = NULL; + } drbd_free_sock(connection); rcu_read_lock(); @@ -5431,49 +5461,39 @@ static int got_skip(struct drbd_connection *connection, struct packet_info *pi) return 0; } -static int connection_finish_peer_reqs(struct drbd_connection *connection) +struct meta_sock_cmd { + size_t pkt_size; + int (*fn)(struct drbd_connection *connection, struct packet_info *); +}; + +static void set_rcvtimeo(struct drbd_connection *connection, bool ping_timeout) { - struct drbd_peer_device *peer_device; - int vnr, not_empty = 0; + long t; + struct net_conf *nc; - do { - clear_bit(SIGNAL_ASENDER, &connection->flags); - flush_signals(current); + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + t = ping_timeout ? 
nc->ping_timeo : nc->ping_int; + rcu_read_unlock(); - rcu_read_lock(); - idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { - struct drbd_device *device = peer_device->device; - kref_get(&device->kref); - rcu_read_unlock(); - if (drbd_finish_peer_reqs(device)) { - kref_put(&device->kref, drbd_destroy_device); - return 1; - } - kref_put(&device->kref, drbd_destroy_device); - rcu_read_lock(); - } - set_bit(SIGNAL_ASENDER, &connection->flags); + t *= HZ; + if (ping_timeout) + t /= 10; - spin_lock_irq(&connection->resource->req_lock); - idr_for_each_entry(&connection->peer_devices, peer_device, vnr) { - struct drbd_device *device = peer_device->device; - not_empty = !list_empty(&device->done_ee); - if (not_empty) - break; - } - spin_unlock_irq(&connection->resource->req_lock); - rcu_read_unlock(); - } while (not_empty); + connection->meta.socket->sk->sk_rcvtimeo = t; +} - return 0; +static void set_ping_timeout(struct drbd_connection *connection) +{ + set_rcvtimeo(connection, 1); } -struct asender_cmd { - size_t pkt_size; - int (*fn)(struct drbd_connection *connection, struct packet_info *); -}; +static void set_idle_timeout(struct drbd_connection *connection) +{ + set_rcvtimeo(connection, 0); +} -static struct asender_cmd asender_tbl[] = { +static struct meta_sock_cmd ack_receiver_tbl[] = { [P_PING] = { 0, got_Ping }, [P_PING_ACK] = { 0, got_PingAck }, [P_RECV_ACK] = { sizeof(struct p_block_ack), got_BlockAck }, @@ -5493,64 +5513,40 @@ static struct asender_cmd asender_tbl[] = { [P_RETRY_WRITE] = { sizeof(struct p_block_ack), got_BlockAck }, }; -int drbd_asender(struct drbd_thread *thi) +int drbd_ack_receiver(struct drbd_thread *thi) { struct drbd_connection *connection = thi->connection; - struct asender_cmd *cmd = NULL; + struct meta_sock_cmd *cmd = NULL; struct packet_info pi; + unsigned long pre_recv_jif; int rv; void *buf = connection->meta.rbuf; int received = 0; unsigned int header_size = drbd_header_size(connection); int expect = header_size; bool ping_timeout_active = false; - struct net_conf *nc; - int ping_timeo, tcp_cork, ping_int; struct sched_param param = { .sched_priority = 2 }; rv = sched_setscheduler(current, SCHED_RR, &param); if (rv < 0) - drbd_err(connection, "drbd_asender: ERROR set priority, ret=%d\n", rv); + drbd_err(connection, "drbd_ack_receiver: ERROR set priority, ret=%d\n", rv); while (get_t_state(thi) == RUNNING) { drbd_thread_current_set_cpu(thi); - rcu_read_lock(); - nc = rcu_dereference(connection->net_conf); - ping_timeo = nc->ping_timeo; - tcp_cork = nc->tcp_cork; - ping_int = nc->ping_int; - rcu_read_unlock(); + conn_reclaim_net_peer_reqs(connection); if (test_and_clear_bit(SEND_PING, &connection->flags)) { if (drbd_send_ping(connection)) { drbd_err(connection, "drbd_send_ping has failed\n"); goto reconnect; } - connection->meta.socket->sk->sk_rcvtimeo = ping_timeo * HZ / 10; + set_ping_timeout(connection); ping_timeout_active = true; } - /* TODO: conditionally cork; it may hurt latency if we cork without - much to send */ - if (tcp_cork) - drbd_tcp_cork(connection->meta.socket); - if (connection_finish_peer_reqs(connection)) { - drbd_err(connection, "connection_finish_peer_reqs() failed\n"); - goto reconnect; - } - /* but unconditionally uncork unless disabled */ - if (tcp_cork) - drbd_tcp_uncork(connection->meta.socket); - - /* short circuit, recv_msg would return EINTR anyways.
*/ - if (signal_pending(current)) - continue; - + pre_recv_jif = jiffies; rv = drbd_recv_short(connection->meta.socket, buf, expect-received, 0); - clear_bit(SIGNAL_ASENDER, &connection->flags); - - flush_signals(current); /* Note: * -EINTR (on meta) we got a signal @@ -5562,7 +5558,6 @@ int drbd_asender(struct drbd_thread *thi) * rv < expected: "woken" by signal during receive * rv == 0 : "connection shut down by peer" */ -received_more: if (likely(rv > 0)) { received += rv; buf += rv; @@ -5584,8 +5579,7 @@ received_more: } else if (rv == -EAGAIN) { /* If the data socket received something meanwhile, * that is good enough: peer is still alive. */ - if (time_after(connection->last_received, - jiffies - connection->meta.socket->sk->sk_rcvtimeo)) + if (time_after(connection->last_received, pre_recv_jif)) continue; if (ping_timeout_active) { drbd_err(connection, "PingAck did not arrive in time.\n"); @@ -5594,6 +5588,10 @@ received_more: set_bit(SEND_PING, &connection->flags); continue; } else if (rv == -EINTR) { + /* maybe drbd_thread_stop(): the while condition will notice. + * maybe woken for send_ping: we'll send a ping above, + * and change the rcvtimeo */ + flush_signals(current); continue; } else { drbd_err(connection, "sock_recvmsg returned %d\n", rv); @@ -5603,8 +5601,8 @@ received_more: if (received == expect && cmd == NULL) { if (decode_header(connection, connection->meta.rbuf, &pi)) goto reconnect; - cmd = &asender_tbl[pi.cmd]; - if (pi.cmd >= ARRAY_SIZE(asender_tbl) || !cmd->fn) { + cmd = &ack_receiver_tbl[pi.cmd]; + if (pi.cmd >= ARRAY_SIZE(ack_receiver_tbl) || !cmd->fn) { drbd_err(connection, "Unexpected meta packet %s (0x%04x)\n", cmdname(pi.cmd), pi.cmd); goto disconnect; @@ -5627,9 +5625,8 @@ received_more: connection->last_received = jiffies; - if (cmd == &asender_tbl[P_PING_ACK]) { - /* restore idle timeout */ - connection->meta.socket->sk->sk_rcvtimeo = ping_int * HZ; + if (cmd == &ack_receiver_tbl[P_PING_ACK]) { + set_idle_timeout(connection); ping_timeout_active = false; } @@ -5638,11 +5635,6 @@ received_more: expect = header_size; cmd = NULL; } - if (test_bit(SEND_PING, &connection->flags)) - continue; - rv = drbd_recv_short(connection->meta.socket, buf, expect-received, MSG_DONTWAIT); - if (rv > 0) - goto received_more; } if (0) { @@ -5654,9 +5646,41 @@ reconnect: disconnect: conn_request_state(connection, NS(conn, C_DISCONNECTING), CS_HARD); } - clear_bit(SIGNAL_ASENDER, &connection->flags); - drbd_info(connection, "asender terminated\n"); + drbd_info(connection, "ack_receiver terminated\n"); return 0; } + +void drbd_send_acks_wf(struct work_struct *ws) +{ + struct drbd_peer_device *peer_device = + container_of(ws, struct drbd_peer_device, send_acks_work); + struct drbd_connection *connection = peer_device->connection; + struct drbd_device *device = peer_device->device; + struct net_conf *nc; + int tcp_cork, err; + + rcu_read_lock(); + nc = rcu_dereference(connection->net_conf); + tcp_cork = nc->tcp_cork; + rcu_read_unlock(); + + if (tcp_cork) + drbd_tcp_cork(connection->meta.socket); + + err = drbd_finish_peer_reqs(device); + kref_put(&device->kref, drbd_destroy_device); + /* get is in drbd_endio_write_sec_final(). 
That is necessary to keep the + struct work_struct send_acks_work alive, which is in the peer_device object */ + + if (err) { + conn_request_state(connection, NS(conn, C_NETWORK_FAILURE), CS_HARD); + return; + } + + if (tcp_cork) + drbd_tcp_uncork(connection->meta.socket); + + return; +} diff --git a/drivers/block/drbd/drbd_req.c b/drivers/block/drbd/drbd_req.c index 3ae2c0086563..2255dcfebd2b 100644 --- a/drivers/block/drbd/drbd_req.c +++ b/drivers/block/drbd/drbd_req.c @@ -453,12 +453,12 @@ static void mod_rq_state(struct drbd_request *req, struct bio_and_error *m, kref_get(&req->kref); /* wait for the DONE */ if (!(s & RQ_NET_SENT) && (set & RQ_NET_SENT)) { - /* potentially already completed in the asender thread */ + /* potentially already completed in the ack_receiver thread */ if (!(s & RQ_NET_DONE)) { atomic_add(req->i.size >> 9, &device->ap_in_flight); set_if_null_req_not_net_done(peer_device, req); } - if (s & RQ_NET_PENDING) + if (req->rq_state & RQ_NET_PENDING) set_if_null_req_ack_pending(peer_device, req); } @@ -1095,6 +1095,24 @@ static bool do_remote_read(struct drbd_request *req) return false; } +bool drbd_should_do_remote(union drbd_dev_state s) +{ + return s.pdsk == D_UP_TO_DATE || + (s.pdsk >= D_INCONSISTENT && + s.conn >= C_WF_BITMAP_T && + s.conn < C_AHEAD); + /* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T. + That is equivalent since before 96 IO was frozen in the C_WF_BITMAP* + states. */ +} + +static bool drbd_should_send_out_of_sync(union drbd_dev_state s) +{ + return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S; + /* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary + since we enter state C_AHEAD only if proto >= 96 */ +} + /* returns number of connections (== 1, for drbd 8.4) * expected to actually write this data, * which does NOT include those that we are L_AHEAD for. */ @@ -1149,7 +1167,6 @@ drbd_submit_req_private_bio(struct drbd_request *req) * stable storage, and this is a WRITE, we may not even submit * this bio. */ if (get_ldev(device)) { - req->pre_submit_jif = jiffies; if (drbd_insert_fault(device, rw == WRITE ? DRBD_FAULT_DT_WR : rw == READ ? DRBD_FAULT_DT_RD @@ -1293,6 +1310,7 @@ static void drbd_send_and_submit(struct drbd_device *device, struct drbd_request &device->pending_master_completion[rw == WRITE]); if (req->private_bio) { /* needs to be marked within the same spinlock */ + req->pre_submit_jif = jiffies; list_add_tail(&req->req_pending_local, &device->pending_completion[rw == WRITE]); _req_mod(req, TO_BE_SUBMITTED); @@ -1513,6 +1531,78 @@ blk_qc_t drbd_make_request(struct request_queue *q, struct bio *bio) return BLK_QC_T_NONE; } +static bool net_timeout_reached(struct drbd_request *net_req, + struct drbd_connection *connection, + unsigned long now, unsigned long ent, + unsigned int ko_count, unsigned int timeout) +{ + struct drbd_device *device = net_req->device; + + if (!time_after(now, net_req->pre_send_jif + ent)) + return false; + + if (time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) + return false; + + if (net_req->rq_state & RQ_NET_PENDING) { + drbd_warn(device, "Remote failed to finish a request within %ums > ko-count (%u) * timeout (%u * 0.1s)\n", + jiffies_to_msecs(now - net_req->pre_send_jif), ko_count, timeout); + return true; + } + + /* We received an ACK already (or are using protocol A), + * but are waiting for the epoch closing barrier ack. + * Check if we sent the barrier already. 
We should not blame the peer + * for being unresponsive, if we did not even ask it yet. */ + if (net_req->epoch == connection->send.current_epoch_nr) { + drbd_warn(device, + "We did not send a P_BARRIER for %ums > ko-count (%u) * timeout (%u * 0.1s); drbd kernel thread blocked?\n", + jiffies_to_msecs(now - net_req->pre_send_jif), ko_count, timeout); + return false; + } + + /* Worst case: we may have been blocked for whatever reason, then + * suddenly are able to send a lot of requests (and epoch separating + * barriers) in quick succession. + * The timestamp of the net_req may be much too old and not correspond + * to the sending time of the relevant unack'ed barrier packet, so + * would trigger a spurious timeout. The latest barrier packet may + * have too recent a timestamp to trigger the timeout, so we could + * miss a timeout. Right now we don't have a place to conveniently store + * these timestamps. + * But in this particular situation, the application requests are still + * completed to upper layers, DRBD should still "feel" responsive. + * No need yet to kill this connection, it may still recover. + * If not, eventually we will have queued enough into the network for + * us to block. From that point of view, the timestamp of the last sent + * barrier packet is relevant enough. + */ + if (time_after(now, connection->send.last_sent_barrier_jif + ent)) { + drbd_warn(device, "Remote failed to answer a P_BARRIER (sent at %lu jif; now=%lu jif) within %ums > ko-count (%u) * timeout (%u * 0.1s)\n", + connection->send.last_sent_barrier_jif, now, + jiffies_to_msecs(now - connection->send.last_sent_barrier_jif), ko_count, timeout); + return true; + } + return false; +} + +/* A request is considered timed out, if + * - we have some effective timeout from the configuration, + * with some state restrictions applied, + * - the oldest request is waiting for a response from the network + * resp. the local disk, + * - the oldest request is in fact older than the effective timeout, + * - the connection was established (resp. disk was attached) + * for longer than the timeout already. + * Note that for 32bit jiffies and very stable connections/disks, + * we may have a wrap around, which is caught by + * !time_in_range(now, last_..._jif, last_..._jif + timeout). + * + * Side effect: once per 32bit wrap-around interval, which means every + * ~198 days with 250 HZ, we have a window where the timeout would need + * to expire twice (worst case) to become effective. Good enough. + */ +
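For reference, the effective network timeout checked above is simply ko-count times the configured timeout, with the timeout kept in tenths of a second. A minimal sketch of that conversion, assuming a kernel context where HZ is defined (effective_net_timeout() is an illustrative name, not a function from this patch):

static inline unsigned long effective_net_timeout(unsigned int timeout_ds,
						  unsigned int ko_count)
{
	/* net_conf->timeout is configured in deciseconds (0.1s units),
	 * so scale to jiffies first, then multiply by ko-count. */
	return (unsigned long)timeout_ds * HZ / 10 * ko_count;
}

This mirrors the "ent = timeout * HZ/10 * ko_count" computation in request_timer_fn() below.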
void request_timer_fn(unsigned long data) { struct drbd_device *device = (struct drbd_device *) data; @@ -1522,11 +1612,14 @@ void request_timer_fn(unsigned long data) unsigned long oldest_submit_jif; unsigned long ent = 0, dt = 0, et, nt; /* effective timeout = ko_count * timeout */ unsigned long now; + unsigned int ko_count = 0, timeout = 0; rcu_read_lock(); nc = rcu_dereference(connection->net_conf); - if (nc && device->state.conn >= C_WF_REPORT_PARAMS) - ent = nc->timeout * HZ/10 * nc->ko_count; + if (nc && device->state.conn >= C_WF_REPORT_PARAMS) { + ko_count = nc->ko_count; + timeout = nc->timeout; + } if (get_ldev(device)) { /* implicit state.disk >= D_INCONSISTENT */ dt = rcu_dereference(device->ldev->disk_conf)->disk_timeout * HZ / 10; @@ -1534,6 +1627,8 @@ void request_timer_fn(unsigned long data) } rcu_read_unlock(); + + ent = timeout * HZ/10 * ko_count; et = min_not_zero(dt, ent); if (!et) @@ -1545,11 +1640,22 @@ void request_timer_fn(unsigned long data) spin_lock_irq(&device->resource->req_lock); req_read = list_first_entry_or_null(&device->pending_completion[0], struct drbd_request, req_pending_local); req_write = list_first_entry_or_null(&device->pending_completion[1], struct drbd_request, req_pending_local); - req_peer = connection->req_not_net_done; + /* maybe the oldest request waiting for the peer is in fact still - * blocking in tcp sendmsg */ - if (!req_peer && connection->req_next && connection->req_next->pre_send_jif) - req_peer = connection->req_next; + * blocking in tcp sendmsg. That's ok, though, that's handled via the + * socket send timeout, requesting a ping, and bumping ko-count in + * we_should_drop_the_connection(). + */ + + /* check the oldest request we successfully sent, + * but which is still waiting for an ACK. */ + req_peer = connection->req_ack_pending; + + /* if we don't have such a request (e.g. protocol A) + * check the oldest request which is still waiting on its epoch + * closing barrier ack. */ + if (!req_peer) + req_peer = connection->req_not_net_done; /* evaluate the oldest peer request only in one timer! */ if (req_peer && req_peer->device != device) @@ -1566,28 +1672,9 @@ void request_timer_fn(unsigned long data) : req_write ? req_write->pre_submit_jif : req_read ? req_read->pre_submit_jif : now; - /* The request is considered timed out, if - * - we have some effective timeout from the configuration, - * with above state restrictions applied, - * - the oldest request is waiting for a response from the network - * resp. the local disk, - * - the oldest request is in fact older than the effective timeout, - * - the connection was established (resp. disk was attached) - * for longer than the timeout already. - * Note that for 32bit jiffies and very stable connections/disks, - * we may have a wrap around, which is catched by - * !time_in_range(now, last_..._jif, last_..._jif + timeout). - * - * Side effect: once per 32bit wrap-around interval, which means every - * ~198 days with 250 HZ, we have a window where the timeout would need - * to expire twice (worst case) to become effective. Good enough.
- */ - if (ent && req_peer && - time_after(now, req_peer->pre_send_jif + ent) && - !time_in_range(now, connection->last_reconnect_jif, connection->last_reconnect_jif + ent)) { - drbd_warn(device, "Remote failed to finish a request within ko-count * timeout\n"); + if (ent && req_peer && net_timeout_reached(req_peer, connection, now, ent, ko_count, timeout)) _conn_request_state(connection, NS(conn, C_TIMEOUT), CS_VERBOSE | CS_HARD); - } + if (dt && oldest_submit_jif != now && time_after(now, oldest_submit_jif + dt) && !time_in_range(now, device->last_reattach_jif, device->last_reattach_jif + dt)) { diff --git a/drivers/block/drbd/drbd_req.h b/drivers/block/drbd/drbd_req.h index 9f6a04080e9f..bb2ef78165e5 100644 --- a/drivers/block/drbd/drbd_req.h +++ b/drivers/block/drbd/drbd_req.h @@ -331,21 +331,6 @@ static inline int req_mod(struct drbd_request *req, return rv; } -static inline bool drbd_should_do_remote(union drbd_dev_state s) -{ - return s.pdsk == D_UP_TO_DATE || - (s.pdsk >= D_INCONSISTENT && - s.conn >= C_WF_BITMAP_T && - s.conn < C_AHEAD); - /* Before proto 96 that was >= CONNECTED instead of >= C_WF_BITMAP_T. - That is equivalent since before 96 IO was frozen in the C_WF_BITMAP* - states. */ -} -static inline bool drbd_should_send_out_of_sync(union drbd_dev_state s) -{ - return s.conn == C_AHEAD || s.conn == C_WF_BITMAP_S; - /* pdsk = D_INCONSISTENT as a consequence. Protocol 96 check not necessary - since we enter state C_AHEAD only if proto >= 96 */ -} +extern bool drbd_should_do_remote(union drbd_dev_state); #endif diff --git a/drivers/block/drbd/drbd_state.c b/drivers/block/drbd/drbd_state.c index 2d7dd269b6a8..5a7ef7873b67 100644 --- a/drivers/block/drbd/drbd_state.c +++ b/drivers/block/drbd/drbd_state.c @@ -29,6 +29,7 @@ #include "drbd_int.h" #include "drbd_protocol.h" #include "drbd_req.h" +#include "drbd_state_change.h" struct after_state_chg_work { struct drbd_work w; @@ -37,6 +38,7 @@ struct after_state_chg_work { union drbd_state ns; enum chg_state_flags flags; struct completion *done; + struct drbd_state_change *state_change; }; enum sanitize_state_warnings { @@ -48,9 +50,248 @@ enum sanitize_state_warnings { IMPLICITLY_UPGRADED_PDSK, }; +static void count_objects(struct drbd_resource *resource, + unsigned int *n_devices, + unsigned int *n_connections) +{ + struct drbd_device *device; + struct drbd_connection *connection; + int vnr; + + *n_devices = 0; + *n_connections = 0; + + idr_for_each_entry(&resource->devices, device, vnr) + (*n_devices)++; + for_each_connection(connection, resource) + (*n_connections)++; +} + +static struct drbd_state_change *alloc_state_change(unsigned int n_devices, unsigned int n_connections, gfp_t gfp) +{ + struct drbd_state_change *state_change; + unsigned int size, n; + + size = sizeof(struct drbd_state_change) + + n_devices * sizeof(struct drbd_device_state_change) + + n_connections * sizeof(struct drbd_connection_state_change) + + n_devices * n_connections * sizeof(struct drbd_peer_device_state_change); + state_change = kmalloc(size, gfp); + if (!state_change) + return NULL; + state_change->n_devices = n_devices; + state_change->n_connections = n_connections; + state_change->devices = (void *)(state_change + 1); + state_change->connections = (void *)&state_change->devices[n_devices]; + state_change->peer_devices = (void *)&state_change->connections[n_connections]; + state_change->resource->resource = NULL; + for (n = 0; n < n_devices; n++) + state_change->devices[n].device = NULL; + for (n = 0; n < n_connections; n++) + 
state_change->connections[n].connection = NULL; + return state_change; +} + +struct drbd_state_change *remember_old_state(struct drbd_resource *resource, gfp_t gfp) +{ + struct drbd_state_change *state_change; + struct drbd_device *device; + unsigned int n_devices; + struct drbd_connection *connection; + unsigned int n_connections; + int vnr; + + struct drbd_device_state_change *device_state_change; + struct drbd_peer_device_state_change *peer_device_state_change; + struct drbd_connection_state_change *connection_state_change; + + /* Caller holds req_lock spinlock. + * No state, no device IDR, no connections lists can change. */ + count_objects(resource, &n_devices, &n_connections); + state_change = alloc_state_change(n_devices, n_connections, gfp); + if (!state_change) + return NULL; + + kref_get(&resource->kref); + state_change->resource->resource = resource; + state_change->resource->role[OLD] = + conn_highest_role(first_connection(resource)); + state_change->resource->susp[OLD] = resource->susp; + state_change->resource->susp_nod[OLD] = resource->susp_nod; + state_change->resource->susp_fen[OLD] = resource->susp_fen; + + connection_state_change = state_change->connections; + for_each_connection(connection, resource) { + kref_get(&connection->kref); + connection_state_change->connection = connection; + connection_state_change->cstate[OLD] = + connection->cstate; + connection_state_change->peer_role[OLD] = + conn_highest_peer(connection); + connection_state_change++; + } + + device_state_change = state_change->devices; + peer_device_state_change = state_change->peer_devices; + idr_for_each_entry(&resource->devices, device, vnr) { + kref_get(&device->kref); + device_state_change->device = device; + device_state_change->disk_state[OLD] = device->state.disk; + + /* The peer_devices for each device have to be enumerated in + the order of the connections. We may not use for_each_peer_device() here. 
*/ + for_each_connection(connection, resource) { + struct drbd_peer_device *peer_device; + + peer_device = conn_peer_device(connection, device->vnr); + peer_device_state_change->peer_device = peer_device; + peer_device_state_change->disk_state[OLD] = + device->state.pdsk; + peer_device_state_change->repl_state[OLD] = + max_t(enum drbd_conns, + C_WF_REPORT_PARAMS, device->state.conn); + peer_device_state_change->resync_susp_user[OLD] = + device->state.user_isp; + peer_device_state_change->resync_susp_peer[OLD] = + device->state.peer_isp; + peer_device_state_change->resync_susp_dependency[OLD] = + device->state.aftr_isp; + peer_device_state_change++; + } + device_state_change++; + } + + return state_change; +} + +static void remember_new_state(struct drbd_state_change *state_change) +{ + struct drbd_resource_state_change *resource_state_change; + struct drbd_resource *resource; + unsigned int n; + + if (!state_change) + return; + + resource_state_change = &state_change->resource[0]; + resource = resource_state_change->resource; + + resource_state_change->role[NEW] = + conn_highest_role(first_connection(resource)); + resource_state_change->susp[NEW] = resource->susp; + resource_state_change->susp_nod[NEW] = resource->susp_nod; + resource_state_change->susp_fen[NEW] = resource->susp_fen; + + for (n = 0; n < state_change->n_devices; n++) { + struct drbd_device_state_change *device_state_change = + &state_change->devices[n]; + struct drbd_device *device = device_state_change->device; + + device_state_change->disk_state[NEW] = device->state.disk; + } + + for (n = 0; n < state_change->n_connections; n++) { + struct drbd_connection_state_change *connection_state_change = + &state_change->connections[n]; + struct drbd_connection *connection = + connection_state_change->connection; + + connection_state_change->cstate[NEW] = connection->cstate; + connection_state_change->peer_role[NEW] = + conn_highest_peer(connection); + } + + for (n = 0; n < state_change->n_devices * state_change->n_connections; n++) { + struct drbd_peer_device_state_change *peer_device_state_change = + &state_change->peer_devices[n]; + struct drbd_device *device = + peer_device_state_change->peer_device->device; + union drbd_dev_state state = device->state; + + peer_device_state_change->disk_state[NEW] = state.pdsk; + peer_device_state_change->repl_state[NEW] = + max_t(enum drbd_conns, C_WF_REPORT_PARAMS, state.conn); + peer_device_state_change->resync_susp_user[NEW] = + state.user_isp; + peer_device_state_change->resync_susp_peer[NEW] = + state.peer_isp; + peer_device_state_change->resync_susp_dependency[NEW] = + state.aftr_isp; + } +} + +void copy_old_to_new_state_change(struct drbd_state_change *state_change) +{ + struct drbd_resource_state_change *resource_state_change = &state_change->resource[0]; + unsigned int n_device, n_connection, n_peer_device, n_peer_devices; + +#define OLD_TO_NEW(x) \ + (x[NEW] = x[OLD]) + + OLD_TO_NEW(resource_state_change->role); + OLD_TO_NEW(resource_state_change->susp); + OLD_TO_NEW(resource_state_change->susp_nod); + OLD_TO_NEW(resource_state_change->susp_fen); + + for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) { + struct drbd_connection_state_change *connection_state_change = + &state_change->connections[n_connection]; + + OLD_TO_NEW(connection_state_change->peer_role); + OLD_TO_NEW(connection_state_change->cstate); + } + + for (n_device = 0; n_device < state_change->n_devices; n_device++) { + struct drbd_device_state_change *device_state_change = + 
&state_change->devices[n_device]; + + OLD_TO_NEW(device_state_change->disk_state); + } + + n_peer_devices = state_change->n_devices * state_change->n_connections; + for (n_peer_device = 0; n_peer_device < n_peer_devices; n_peer_device++) { + struct drbd_peer_device_state_change *p = + &state_change->peer_devices[n_peer_device]; + + OLD_TO_NEW(p->disk_state); + OLD_TO_NEW(p->repl_state); + OLD_TO_NEW(p->resync_susp_user); + OLD_TO_NEW(p->resync_susp_peer); + OLD_TO_NEW(p->resync_susp_dependency); + } + +#undef OLD_TO_NEW +} + +void forget_state_change(struct drbd_state_change *state_change) +{ + unsigned int n; + + if (!state_change) + return; + + if (state_change->resource->resource) + kref_put(&state_change->resource->resource->kref, drbd_destroy_resource); + for (n = 0; n < state_change->n_devices; n++) { + struct drbd_device *device = state_change->devices[n].device; + + if (device) + kref_put(&device->kref, drbd_destroy_device); + } + for (n = 0; n < state_change->n_connections; n++) { + struct drbd_connection *connection = + state_change->connections[n].connection; + + if (connection) + kref_put(&connection->kref, drbd_destroy_connection); + } + kfree(state_change); +} + static int w_after_state_ch(struct drbd_work *w, int unused); static void after_state_ch(struct drbd_device *device, union drbd_state os, - union drbd_state ns, enum chg_state_flags flags); + union drbd_state ns, enum chg_state_flags flags, + struct drbd_state_change *); static enum drbd_state_rv is_valid_state(struct drbd_device *, union drbd_state); static enum drbd_state_rv is_valid_soft_transition(union drbd_state, union drbd_state, struct drbd_connection *); static enum drbd_state_rv is_valid_transition(union drbd_state os, union drbd_state ns); @@ -93,6 +334,7 @@ static enum drbd_role max_role(enum drbd_role role1, enum drbd_role role2) return R_SECONDARY; return R_UNKNOWN; } + static enum drbd_role min_role(enum drbd_role role1, enum drbd_role role2) { if (role1 == R_UNKNOWN || role2 == R_UNKNOWN) @@ -937,7 +1179,7 @@ void drbd_resume_al(struct drbd_device *device) drbd_info(device, "Resumed AL updates\n"); } -/* helper for __drbd_set_state */ +/* helper for _drbd_set_state */ static void set_ov_position(struct drbd_device *device, enum drbd_conns cs) { if (first_peer_device(device)->connection->agreed_pro_version < 90) @@ -965,17 +1207,17 @@ static void set_ov_position(struct drbd_device *device, enum drbd_conns cs) } /** - * __drbd_set_state() - Set a new DRBD state + * _drbd_set_state() - Set a new DRBD state * @device: DRBD device. * @ns: new state. * @flags: Flags * @done: Optional completion, that will get completed after the after_state_ch() finished * - * Caller needs to hold req_lock, and global_state_lock. Do not call directly. + * Caller needs to hold req_lock. Do not call directly. */ enum drbd_state_rv -__drbd_set_state(struct drbd_device *device, union drbd_state ns, - enum chg_state_flags flags, struct completion *done) +_drbd_set_state(struct drbd_device *device, union drbd_state ns, + enum chg_state_flags flags, struct completion *done) { struct drbd_peer_device *peer_device = first_peer_device(device); struct drbd_connection *connection = peer_device ? 
peer_device->connection : NULL; @@ -983,6 +1225,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, enum drbd_state_rv rv = SS_SUCCESS; enum sanitize_state_warnings ssw; struct after_state_chg_work *ascw; + struct drbd_state_change *state_change; os = drbd_read_state(device); @@ -1037,6 +1280,9 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, if (!is_sync_state(os.conn) && is_sync_state(ns.conn)) clear_bit(RS_DONE, &device->flags); + /* FIXME: Have any flags been set earlier in this function already? */ + state_change = remember_old_state(device->resource, GFP_ATOMIC); + /* changes to local_cnt and device flags should be visible before * changes to state, which again should be visible before anything else * depending on that change happens. */ @@ -1047,6 +1293,8 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, device->resource->susp_fen = ns.susp_fen; smp_wmb(); + remember_new_state(state_change); + /* put replicated vs not-replicated requests in separate epochs */ if (drbd_should_do_remote((union drbd_dev_state)os.i) != drbd_should_do_remote((union drbd_dev_state)ns.i)) @@ -1184,6 +1432,7 @@ __drbd_set_state(struct drbd_device *device, union drbd_state ns, ascw->w.cb = w_after_state_ch; ascw->device = device; ascw->done = done; + ascw->state_change = state_change; drbd_queue_work(&connection->sender_work, &ascw->w); } else { @@ -1199,7 +1448,8 @@ static int w_after_state_ch(struct drbd_work *w, int unused) container_of(w, struct after_state_chg_work, w); struct drbd_device *device = ascw->device; - after_state_ch(device, ascw->os, ascw->ns, ascw->flags); + after_state_ch(device, ascw->os, ascw->ns, ascw->flags, ascw->state_change); + forget_state_change(ascw->state_change); if (ascw->flags & CS_WAIT_COMPLETE) complete(ascw->done); kfree(ascw); @@ -1234,7 +1484,7 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device, D_ASSERT(device, current == first_peer_device(device)->connection->worker.task); /* open coded non-blocking drbd_suspend_io(device); */ - set_bit(SUSPEND_IO, &device->flags); + atomic_inc(&device->suspend_cnt); drbd_bm_lock(device, why, flags); rv = io_fn(device); @@ -1245,6 +1495,139 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device, return rv; } +void notify_resource_state_change(struct sk_buff *skb, + unsigned int seq, + struct drbd_resource_state_change *resource_state_change, + enum drbd_notification_type type) +{ + struct drbd_resource *resource = resource_state_change->resource; + struct resource_info resource_info = { + .res_role = resource_state_change->role[NEW], + .res_susp = resource_state_change->susp[NEW], + .res_susp_nod = resource_state_change->susp_nod[NEW], + .res_susp_fen = resource_state_change->susp_fen[NEW], + }; + + notify_resource_state(skb, seq, resource, &resource_info, type); +} + +void notify_connection_state_change(struct sk_buff *skb, + unsigned int seq, + struct drbd_connection_state_change *connection_state_change, + enum drbd_notification_type type) +{ + struct drbd_connection *connection = connection_state_change->connection; + struct connection_info connection_info = { + .conn_connection_state = connection_state_change->cstate[NEW], + .conn_role = connection_state_change->peer_role[NEW], + }; + + notify_connection_state(skb, seq, connection, &connection_info, type); +} +
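Each of these notify_* wrappers reads only the [NEW] half of a snapshot record; every record keeps its values in a two-element array indexed by OLD and NEW. A self-contained sketch of that pairing, using stand-in names (example_*, and a local OLD/NEW definition) rather than the driver's real types:

enum { OLD, NEW };	/* index into the per-field state pairs */

struct example_device_state_change {
	int disk_state[2];	/* [OLD] captured before the transition, [NEW] after */
};

static int example_has_changed(const struct example_device_state_change *sc)
{
	return sc->disk_state[OLD] != sc->disk_state[NEW];
}

broadcast_state_change() below uses exactly this kind of comparison (its HAS_CHANGED() macro) to skip objects whose state did not move.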
+void notify_device_state_change(struct sk_buff *skb, + unsigned int seq, + struct drbd_device_state_change *device_state_change, + enum drbd_notification_type type) +{ + struct drbd_device *device = device_state_change->device; + struct device_info device_info = { + .dev_disk_state = device_state_change->disk_state[NEW], + }; + + notify_device_state(skb, seq, device, &device_info, type); +} + +void notify_peer_device_state_change(struct sk_buff *skb, + unsigned int seq, + struct drbd_peer_device_state_change *p, + enum drbd_notification_type type) +{ + struct drbd_peer_device *peer_device = p->peer_device; + struct peer_device_info peer_device_info = { + .peer_repl_state = p->repl_state[NEW], + .peer_disk_state = p->disk_state[NEW], + .peer_resync_susp_user = p->resync_susp_user[NEW], + .peer_resync_susp_peer = p->resync_susp_peer[NEW], + .peer_resync_susp_dependency = p->resync_susp_dependency[NEW], + }; + + notify_peer_device_state(skb, seq, peer_device, &peer_device_info, type); +} + +static void broadcast_state_change(struct drbd_state_change *state_change) +{ + struct drbd_resource_state_change *resource_state_change = &state_change->resource[0]; + bool resource_state_has_changed; + unsigned int n_device, n_connection, n_peer_device, n_peer_devices; + void (*last_func)(struct sk_buff *, unsigned int, void *, + enum drbd_notification_type) = NULL; + void *uninitialized_var(last_arg); + +#define HAS_CHANGED(state) ((state)[OLD] != (state)[NEW]) +#define FINAL_STATE_CHANGE(type) \ + ({ if (last_func) \ + last_func(NULL, 0, last_arg, type); \ + }) +#define REMEMBER_STATE_CHANGE(func, arg, type) \ + ({ FINAL_STATE_CHANGE(type | NOTIFY_CONTINUES); \ + last_func = (typeof(last_func))func; \ + last_arg = arg; \ + }) + + mutex_lock(&notification_mutex); + + resource_state_has_changed = + HAS_CHANGED(resource_state_change->role) || + HAS_CHANGED(resource_state_change->susp) || + HAS_CHANGED(resource_state_change->susp_nod) || + HAS_CHANGED(resource_state_change->susp_fen); + + if (resource_state_has_changed) + REMEMBER_STATE_CHANGE(notify_resource_state_change, + resource_state_change, NOTIFY_CHANGE); + + for (n_connection = 0; n_connection < state_change->n_connections; n_connection++) { + struct drbd_connection_state_change *connection_state_change = + &state_change->connections[n_connection]; + + if (HAS_CHANGED(connection_state_change->peer_role) || + HAS_CHANGED(connection_state_change->cstate)) + REMEMBER_STATE_CHANGE(notify_connection_state_change, + connection_state_change, NOTIFY_CHANGE); + } + + for (n_device = 0; n_device < state_change->n_devices; n_device++) { + struct drbd_device_state_change *device_state_change = + &state_change->devices[n_device]; + + if (HAS_CHANGED(device_state_change->disk_state)) + REMEMBER_STATE_CHANGE(notify_device_state_change, + device_state_change, NOTIFY_CHANGE); + } + + n_peer_devices = state_change->n_devices * state_change->n_connections; + for (n_peer_device = 0; n_peer_device < n_peer_devices; n_peer_device++) { + struct drbd_peer_device_state_change *p = + &state_change->peer_devices[n_peer_device]; + + if (HAS_CHANGED(p->disk_state) || + HAS_CHANGED(p->repl_state) || + HAS_CHANGED(p->resync_susp_user) || + HAS_CHANGED(p->resync_susp_peer) || + HAS_CHANGED(p->resync_susp_dependency)) + REMEMBER_STATE_CHANGE(notify_peer_device_state_change, + p, NOTIFY_CHANGE); + } + + FINAL_STATE_CHANGE(NOTIFY_CHANGE); + mutex_unlock(&notification_mutex); + +#undef HAS_CHANGED +#undef FINAL_STATE_CHANGE +#undef REMEMBER_STATE_CHANGE +} + /** * after_state_ch() - Perform after state change actions that may sleep * @device: DRBD device.
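The REMEMBER_STATE_CHANGE()/FINAL_STATE_CHANGE() pair above delays each notification by one step: an event is only emitted once the next one is known to follow (with NOTIFY_CONTINUES set), or at the very end without the flag, so userspace can tell which events belong to one atomic state transition. A stripped-down, runnable sketch of the same hold-back pattern (EXAMPLE_CONTINUES and all example_* names are stand-ins, not part of the patch):

#include <stdio.h>

#define EXAMPLE_CONTINUES 1

static void (*last_func)(const char *, int);
static const char *last_arg;

static void example_emit(const char *what, int flags)
{
	printf("%s%s\n", what, (flags & EXAMPLE_CONTINUES) ? " (continues)" : "");
}

static void example_remember(const char *what)
{
	if (last_func)
		last_func(last_arg, EXAMPLE_CONTINUES);	/* flush the held-back event */
	last_func = example_emit;
	last_arg = what;
}

static void example_final(void)
{
	if (last_func)
		last_func(last_arg, 0);	/* last event of the group: no flag */
}

int main(void)
{
	example_remember("resource changed");
	example_remember("device changed");
	example_final();	/* prints: resource changed (continues), then device changed */
	return 0;
}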
@@ -1253,13 +1636,16 @@ int drbd_bitmap_io_from_worker(struct drbd_device *device, * @flags: Flags */ static void after_state_ch(struct drbd_device *device, union drbd_state os, - union drbd_state ns, enum chg_state_flags flags) + union drbd_state ns, enum chg_state_flags flags, + struct drbd_state_change *state_change) { struct drbd_resource *resource = device->resource; struct drbd_peer_device *peer_device = first_peer_device(device); struct drbd_connection *connection = peer_device ? peer_device->connection : NULL; struct sib_info sib; + broadcast_state_change(state_change); + sib.sib_reason = SIB_STATE_CHANGE; sib.os = os; sib.ns = ns; @@ -1377,7 +1763,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, } if (ns.pdsk < D_INCONSISTENT && get_ldev(device)) { - if (os.peer == R_SECONDARY && ns.peer == R_PRIMARY && + if (os.peer != R_PRIMARY && ns.peer == R_PRIMARY && device->ldev->md.uuid[UI_BITMAP] == 0 && ns.disk >= D_UP_TO_DATE) { drbd_uuid_new_current(device); drbd_send_uuids(peer_device); @@ -1444,7 +1830,7 @@ static void after_state_ch(struct drbd_device *device, union drbd_state os, if (os.disk != D_FAILED && ns.disk == D_FAILED) { enum drbd_io_error_p eh = EP_PASS_ON; int was_io_error = 0; - /* corresponding get_ldev was in __drbd_set_state, to serialize + /* corresponding get_ldev was in _drbd_set_state, to serialize * our cleanup here with the transition to D_DISKLESS. * But it is still not safe to dereference ldev here, since * we might come from a failed Attach before ldev was set. */ was_io_error = test_and_clear_bit(WAS_IO_ERROR, &device->flags); + /* Intentionally call this handler first, before drbd_send_state(). + * See: 2932204 drbd: call local-io-error handler early + * People may choose to hard-reset the box from this handler. + * It is useful if this looks like a "regular node crash".
*/ if (was_io_error && eh == EP_CALL_HELPER) drbd_khelper(device, "local-io-error"); @@ -1572,6 +1962,7 @@ struct after_conn_state_chg_work { union drbd_state ns_max; /* new, max state, over all devices */ enum chg_state_flags flags; struct drbd_connection *connection; + struct drbd_state_change *state_change; }; static int w_after_conn_state_ch(struct drbd_work *w, int unused) @@ -1584,6 +1975,8 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused) struct drbd_peer_device *peer_device; int vnr; + broadcast_state_change(acscw->state_change); + forget_state_change(acscw->state_change); kfree(acscw); /* Upon network configuration, we need to start the receiver */ @@ -1593,6 +1986,13 @@ static int w_after_conn_state_ch(struct drbd_work *w, int unused) if (oc == C_DISCONNECTING && ns_max.conn == C_STANDALONE) { struct net_conf *old_conf; + mutex_lock(&notification_mutex); + idr_for_each_entry(&connection->peer_devices, peer_device, vnr) + notify_peer_device_state(NULL, 0, peer_device, NULL, + NOTIFY_DESTROY | NOTIFY_CONTINUES); + notify_connection_state(NULL, 0, connection, NULL, NOTIFY_DESTROY); + mutex_unlock(&notification_mutex); + mutex_lock(&connection->resource->conf_update); old_conf = connection->net_conf; connection->my_addr_len = 0; @@ -1759,7 +2159,7 @@ conn_set_state(struct drbd_connection *connection, union drbd_state mask, union if (flags & CS_IGN_OUTD_FAIL && ns.disk == D_OUTDATED && os.disk < D_OUTDATED) ns.disk = os.disk; - rv = __drbd_set_state(device, ns, flags, NULL); + rv = _drbd_set_state(device, ns, flags, NULL); if (rv < SS_SUCCESS) BUG(); @@ -1823,6 +2223,7 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u enum drbd_conns oc = connection->cstate; union drbd_state ns_max, ns_min, os; bool have_mutex = false; + struct drbd_state_change *state_change; if (mask.conn) { rv = is_valid_conn_transition(oc, val.conn); @@ -1868,10 +2269,12 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u goto abort; } + state_change = remember_old_state(connection->resource, GFP_ATOMIC); conn_old_common_state(connection, &os, &flags); flags |= CS_DC_SUSP; conn_set_state(connection, mask, val, &ns_min, &ns_max, flags); conn_pr_state_change(connection, os, ns_max, flags); + remember_new_state(state_change); acscw = kmalloc(sizeof(*acscw), GFP_ATOMIC); if (acscw) { @@ -1882,6 +2285,7 @@ _conn_request_state(struct drbd_connection *connection, union drbd_state mask, u acscw->w.cb = w_after_conn_state_ch; kref_get(&connection->kref); acscw->connection = connection; + acscw->state_change = state_change; drbd_queue_work(&connection->sender_work, &acscw->w); } else { drbd_err(connection, "Could not kmalloc an acscw\n"); diff --git a/drivers/block/drbd/drbd_state.h b/drivers/block/drbd/drbd_state.h index 7f53c40823cd..bd989536f888 100644 --- a/drivers/block/drbd/drbd_state.h +++ b/drivers/block/drbd/drbd_state.h @@ -122,9 +122,9 @@ extern enum drbd_state_rv _drbd_request_state_holding_state_mutex(struct drbd_device *, union drbd_state, union drbd_state, enum chg_state_flags); -extern enum drbd_state_rv __drbd_set_state(struct drbd_device *, union drbd_state, - enum chg_state_flags, - struct completion *done); +extern enum drbd_state_rv _drbd_set_state(struct drbd_device *, union drbd_state, + enum chg_state_flags, + struct completion *done); extern void print_st_err(struct drbd_device *, union drbd_state, union drbd_state, int);
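The new drbd_state_change.h below declares the snapshot container whose devices, connections, and peer_devices pointers all refer into one single allocation, carved up by alloc_state_change() earlier in this patch. A compact, self-contained sketch of that one-block layout technique (the example_* names are illustrative only):

#include <stdlib.h>

struct example_dev_change  { int state[2]; };
struct example_conn_change { int state[2]; };

struct example_state_change {
	unsigned int n_devs, n_conns;
	struct example_dev_change *devs;	/* points into this same allocation */
	struct example_conn_change *conns;	/* likewise */
};

static struct example_state_change *example_alloc(unsigned int n_devs,
						  unsigned int n_conns)
{
	struct example_state_change *sc;
	size_t size = sizeof(*sc)
		    + n_devs * sizeof(struct example_dev_change)
		    + n_conns * sizeof(struct example_conn_change);

	sc = malloc(size);	/* the kernel code uses kmalloc(size, gfp) */
	if (!sc)
		return NULL;
	sc->n_devs = n_devs;
	sc->n_conns = n_conns;
	sc->devs = (void *)(sc + 1);		/* first array right after the header */
	sc->conns = (void *)&sc->devs[n_devs];	/* second array after the first */
	return sc;
}

One free() (or kfree()) releases the header and both arrays together, which is why forget_state_change() can end in a single kfree(state_change) after dropping its krefs.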
diff --git a/drivers/block/drbd/drbd_state_change.h b/drivers/block/drbd/drbd_state_change.h new file mode 100644 index 000000000000..9e503a1a0bfb --- /dev/null +++ b/drivers/block/drbd/drbd_state_change.h @@ -0,0 +1,63 @@ +#ifndef DRBD_STATE_CHANGE_H +#define DRBD_STATE_CHANGE_H + +struct drbd_resource_state_change { + struct drbd_resource *resource; + enum drbd_role role[2]; + bool susp[2]; + bool susp_nod[2]; + bool susp_fen[2]; +}; + +struct drbd_device_state_change { + struct drbd_device *device; + enum drbd_disk_state disk_state[2]; +}; + +struct drbd_connection_state_change { + struct drbd_connection *connection; + enum drbd_conns cstate[2]; /* drbd9: enum drbd_conn_state */ + enum drbd_role peer_role[2]; +}; + +struct drbd_peer_device_state_change { + struct drbd_peer_device *peer_device; + enum drbd_disk_state disk_state[2]; + enum drbd_conns repl_state[2]; /* drbd9: enum drbd_repl_state */ + bool resync_susp_user[2]; + bool resync_susp_peer[2]; + bool resync_susp_dependency[2]; +}; + +struct drbd_state_change { + struct list_head list; + unsigned int n_devices; + unsigned int n_connections; + struct drbd_resource_state_change resource[1]; + struct drbd_device_state_change *devices; + struct drbd_connection_state_change *connections; + struct drbd_peer_device_state_change *peer_devices; +}; + +extern struct drbd_state_change *remember_old_state(struct drbd_resource *, gfp_t); +extern void copy_old_to_new_state_change(struct drbd_state_change *); +extern void forget_state_change(struct drbd_state_change *); + +extern void notify_resource_state_change(struct sk_buff *, + unsigned int, + struct drbd_resource_state_change *, + enum drbd_notification_type type); +extern void notify_connection_state_change(struct sk_buff *, + unsigned int, + struct drbd_connection_state_change *, + enum drbd_notification_type type); +extern void notify_device_state_change(struct sk_buff *, + unsigned int, + struct drbd_device_state_change *, + enum drbd_notification_type type); +extern void notify_peer_device_state_change(struct sk_buff *, + unsigned int, + struct drbd_peer_device_state_change *, + enum drbd_notification_type type); + +#endif /* DRBD_STATE_CHANGE_H */ diff --git a/drivers/block/drbd/drbd_worker.c b/drivers/block/drbd/drbd_worker.c index 5578c1477ba6..eff716c27b1f 100644 --- a/drivers/block/drbd/drbd_worker.c +++ b/drivers/block/drbd/drbd_worker.c @@ -55,13 +55,6 @@ static int make_resync_request(struct drbd_device *, int); * */ - -/* About the global_state_lock Each state transition on an device holds a read lock. In case we have - to evaluate the resync after dependencies, we grab a write lock, because - we need stable states on all devices for that.
*/ -rwlock_t global_state_lock; - /* used for synchronous meta data and bitmap IO * submitted by drbd_md_sync_page_io() */ @@ -120,6 +113,7 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l unsigned long flags = 0; struct drbd_peer_device *peer_device = peer_req->peer_device; struct drbd_device *device = peer_device->device; + struct drbd_connection *connection = peer_device->connection; struct drbd_interval i; int do_wake; u64 block_id; @@ -152,6 +146,12 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l * ((peer_req->flags & (EE_WAS_ERROR|EE_IS_TRIM)) == EE_WAS_ERROR) */ if (peer_req->flags & EE_WAS_ERROR) __drbd_chk_io_error(device, DRBD_WRITE_ERROR); + + if (connection->cstate >= C_WF_REPORT_PARAMS) { + kref_get(&device->kref); /* put is in drbd_send_acks_wf() */ + if (!queue_work(connection->ack_sender, &peer_device->send_acks_work)) + kref_put(&device->kref, drbd_destroy_device); + } spin_unlock_irqrestore(&device->resource->req_lock, flags); if (block_id == ID_SYNCER) @@ -163,7 +163,6 @@ void drbd_endio_write_sec_final(struct drbd_peer_request *peer_req) __releases(l if (do_al_complete_io) drbd_al_complete_io(device, &i); - wake_asender(peer_device->connection); put_ldev(device); } @@ -195,6 +194,12 @@ void drbd_peer_request_endio(struct bio *bio) } } +void drbd_panic_after_delayed_completion_of_aborted_request(struct drbd_device *device) +{ + panic("drbd%u %s/%u potential random memory corruption caused by delayed completion of aborted local request\n", + device->minor, device->resource->name, device->vnr); +} + /* read, readA or write requests on R_PRIMARY coming from drbd_make_request */ void drbd_request_endio(struct bio *bio) @@ -238,7 +243,7 @@ void drbd_request_endio(struct bio *bio) drbd_emerg(device, "delayed completion of aborted local request; disk-timeout may be too aggressive\n"); if (!bio->bi_error) - panic("possible random memory corruption caused by delayed completion of aborted local request\n"); + drbd_panic_after_delayed_completion_of_aborted_request(device); } /* to avoid recursion in __req_mod */ @@ -1291,6 +1296,7 @@ static int drbd_send_barrier(struct drbd_connection *connection) p->barrier = connection->send.current_epoch_nr; p->pad = 0; connection->send.current_epoch_writes = 0; + connection->send.last_sent_barrier_jif = jiffies; return conn_send_command(connection, sock, P_BARRIER, sizeof(*p), NULL, 0); } @@ -1315,6 +1321,7 @@ static void re_init_if_first_write(struct drbd_connection *connection, unsigned connection->send.seen_any_write_yet = true; connection->send.current_epoch_nr = epoch; connection->send.current_epoch_writes = 0; + connection->send.last_sent_barrier_jif = jiffies; } } @@ -1456,70 +1463,73 @@ static int _drbd_may_sync_now(struct drbd_device *device) } /** - * _drbd_pause_after() - Pause resync on all devices that may not resync now + * drbd_pause_after() - Pause resync on all devices that may not resync now * @device: DRBD device. * * Called from process context only (admin command and after_state_ch). 
*/ -static int _drbd_pause_after(struct drbd_device *device) +static bool drbd_pause_after(struct drbd_device *device) { + bool changed = false; struct drbd_device *odev; - int i, rv = 0; + int i; rcu_read_lock(); idr_for_each_entry(&drbd_devices, odev, i) { if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) continue; - if (!_drbd_may_sync_now(odev)) - rv |= (__drbd_set_state(_NS(odev, aftr_isp, 1), CS_HARD, NULL) - != SS_NOTHING_TO_DO); + if (!_drbd_may_sync_now(odev) && + _drbd_set_state(_NS(odev, aftr_isp, 1), + CS_HARD, NULL) != SS_NOTHING_TO_DO) + changed = true; } rcu_read_unlock(); - return rv; + return changed; } /** - * _drbd_resume_next() - Resume resync on all devices that may resync now + * drbd_resume_next() - Resume resync on all devices that may resync now * @device: DRBD device. * * Called from process context only (admin command and worker). */ -static int _drbd_resume_next(struct drbd_device *device) +static bool drbd_resume_next(struct drbd_device *device) { + bool changed = false; struct drbd_device *odev; - int i, rv = 0; + int i; rcu_read_lock(); idr_for_each_entry(&drbd_devices, odev, i) { if (odev->state.conn == C_STANDALONE && odev->state.disk == D_DISKLESS) continue; if (odev->state.aftr_isp) { - if (_drbd_may_sync_now(odev)) - rv |= (__drbd_set_state(_NS(odev, aftr_isp, 0), - CS_HARD, NULL) - != SS_NOTHING_TO_DO) ; + if (_drbd_may_sync_now(odev) && + _drbd_set_state(_NS(odev, aftr_isp, 0), + CS_HARD, NULL) != SS_NOTHING_TO_DO) + changed = true; } } rcu_read_unlock(); - return rv; + return changed; } void resume_next_sg(struct drbd_device *device) { - write_lock_irq(&global_state_lock); - _drbd_resume_next(device); - write_unlock_irq(&global_state_lock); + lock_all_resources(); + drbd_resume_next(device); + unlock_all_resources(); } void suspend_other_sg(struct drbd_device *device) { - write_lock_irq(&global_state_lock); - _drbd_pause_after(device); - write_unlock_irq(&global_state_lock); + lock_all_resources(); + drbd_pause_after(device); + unlock_all_resources(); } -/* caller must hold global_state_lock */ +/* caller must lock_all_resources() */ enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_minor) { struct drbd_device *odev; @@ -1557,15 +1567,15 @@ enum drbd_ret_code drbd_resync_after_valid(struct drbd_device *device, int o_min } } -/* caller must hold global_state_lock */ +/* caller must lock_all_resources() */ void drbd_resync_after_changed(struct drbd_device *device) { - int changes; + int changed; do { - changes = _drbd_pause_after(device); - changes |= _drbd_resume_next(device); - } while (changes); + changed = drbd_pause_after(device); + changed |= drbd_resume_next(device); + } while (changed); } void drbd_rs_controller_reset(struct drbd_device *device) @@ -1685,19 +1695,14 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) } else { mutex_lock(device->state_mutex); } - clear_bit(B_RS_H_DONE, &device->flags); - /* req_lock: serialize with drbd_send_and_submit() and others - * global_state_lock: for stable sync-after dependencies */ - spin_lock_irq(&device->resource->req_lock); - write_lock(&global_state_lock); + lock_all_resources(); + clear_bit(B_RS_H_DONE, &device->flags); /* Did some connection breakage or IO error race with us? 
*/ if (device->state.conn < C_CONNECTED || !get_ldev_if_state(device, D_NEGOTIATING)) { - write_unlock(&global_state_lock); - spin_unlock_irq(&device->resource->req_lock); - mutex_unlock(device->state_mutex); - return; + unlock_all_resources(); + goto out; } ns = drbd_read_state(device); @@ -1711,7 +1716,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) else /* side == C_SYNC_SOURCE */ ns.pdsk = D_INCONSISTENT; - r = __drbd_set_state(device, ns, CS_VERBOSE, NULL); + r = _drbd_set_state(device, ns, CS_VERBOSE, NULL); ns = drbd_read_state(device); if (ns.conn < C_CONNECTED) @@ -1732,7 +1737,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) device->rs_mark_left[i] = tw; device->rs_mark_time[i] = now; } - _drbd_pause_after(device); + drbd_pause_after(device); /* Forget potentially stale cached per resync extent bit-counts. * Open coded drbd_rs_cancel_all(device), we already have IRQs * disabled, and know the disk state is ok. */ @@ -1742,8 +1747,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) device->resync_wenr = LC_FREE; spin_unlock(&device->al_lock); } - write_unlock(&global_state_lock); - spin_unlock_irq(&device->resource->req_lock); + unlock_all_resources(); if (r == SS_SUCCESS) { wake_up(&device->al_wait); /* for lc_reset() above */ @@ -1807,6 +1811,7 @@ void drbd_start_resync(struct drbd_device *device, enum drbd_conns side) drbd_md_sync(device); } put_ldev(device); +out: mutex_unlock(device->state_mutex); } @@ -1836,7 +1841,7 @@ static void drbd_ldev_destroy(struct drbd_device *device) device->act_log = NULL; __acquire(local); - drbd_free_ldev(device->ldev); + drbd_backing_dev_free(device, device->ldev); device->ldev = NULL; __release(local); diff --git a/drivers/block/mtip32xx/mtip32xx.c b/drivers/block/mtip32xx/mtip32xx.c index 15bec407ac37..9b180dbbd03c 100644 --- a/drivers/block/mtip32xx/mtip32xx.c +++ b/drivers/block/mtip32xx/mtip32xx.c @@ -104,9 +104,9 @@ /* Device instance number, incremented each time a device is probed. 
*/ static int instance; -struct list_head online_list; -struct list_head removing_list; -spinlock_t dev_lock; +static struct list_head online_list; +static struct list_head removing_list; +static spinlock_t dev_lock; /* * Global variable used to hold the major block device number diff --git a/drivers/block/null_blk.c b/drivers/block/null_blk.c index 95dff91135ad..8ba1e97d573c 100644 --- a/drivers/block/null_blk.c +++ b/drivers/block/null_blk.c @@ -436,9 +436,8 @@ static void null_del_dev(struct nullb *nullb) static void null_lnvm_end_io(struct request *rq, int error) { struct nvm_rq *rqd = rq->end_io_data; - struct nvm_dev *dev = rqd->dev; - dev->mt->end_io(rqd, error); + nvm_end_io(rqd, error); blk_put_request(rq); } @@ -495,17 +494,17 @@ static int null_lnvm_id(struct nvm_dev *dev, struct nvm_id *id) id->ppaf.ch_offset = 56; id->ppaf.ch_len = 8; - do_div(size, bs); /* convert size to pages */ - do_div(size, 256); /* concert size to pgs pr blk */ + sector_div(size, bs); /* convert size to pages */ + size >>= 8; /* convert size to pgs per blk */ grp = &id->groups[0]; grp->mtype = 0; grp->fmtype = 0; grp->num_ch = 1; grp->num_pg = 256; blksize = size; - do_div(size, (1 << 16)); + size >>= 16; grp->num_lun = size + 1; - do_div(blksize, grp->num_lun); + sector_div(blksize, grp->num_lun); grp->num_blk = blksize; grp->num_pln = 1; diff --git a/drivers/block/rbd.c b/drivers/block/rbd.c index 81ea69fee7ca..4a876785b68c 100644 --- a/drivers/block/rbd.c +++ b/drivers/block/rbd.c @@ -5185,8 +5185,7 @@ static int rbd_dev_probe_parent(struct rbd_device *rbd_dev, int depth) out_err: rbd_dev_unparent(rbd_dev); - if (parent) - rbd_dev_destroy(parent); + rbd_dev_destroy(parent); return ret; } diff --git a/drivers/block/sx8.c b/drivers/block/sx8.c index 59c91d49b14b..ba4bfe933276 100644 --- a/drivers/block/sx8.c +++ b/drivers/block/sx8.c @@ -23,7 +23,7 @@ #include <linux/workqueue.h> #include <linux/bitops.h> #include <linux/delay.h> -#include <linux/time.h> +#include <linux/ktime.h> #include <linux/hdreg.h> #include <linux/dma-mapping.h> #include <linux/completion.h> @@ -671,16 +671,15 @@ static int carm_send_special (struct carm_host *host, carm_sspc_t func) static unsigned int carm_fill_sync_time(struct carm_host *host, unsigned int idx, void *mem) { - struct timeval tv; struct carm_msg_sync_time *st = mem; - do_gettimeofday(&tv); + time64_t tv = ktime_get_real_seconds(); memset(st, 0, sizeof(*st)); st->type = CARM_MSG_MISC; st->subtype = MISC_SET_TIME; st->handle = cpu_to_le32(TAG_ENCODE(idx)); - st->timestamp = cpu_to_le32(tv.tv_sec); + st->timestamp = cpu_to_le32(tv); return sizeof(struct carm_msg_sync_time); } diff --git a/drivers/block/xen-blkback/blkback.c b/drivers/block/xen-blkback/blkback.c index 41fb1a917b17..4809c1501d7e 100644 --- a/drivers/block/xen-blkback/blkback.c +++ b/drivers/block/xen-blkback/blkback.c @@ -84,6 +84,16 @@ MODULE_PARM_DESC(max_persistent_grants, "Maximum number of grants to map persistently"); /* + * Maximum number of rings/queues blkback supports, allow as many queues as there + * are CPUs if the user has not specified a value. + */ +unsigned int xenblk_max_queues; +module_param_named(max_queues, xenblk_max_queues, uint, 0644); +MODULE_PARM_DESC(max_queues, + "Maximum number of hardware queues per virtual disk. " \ + "By default it is the number of online CPUs."); + +/* * Maximum order of pages to be used for the shared ring between front and * backend, 4KB page granularity is used.
*/ @@ -113,71 +123,71 @@ module_param(log_stats, int, 0644); /* Number of free pages to remove on each call to gnttab_free_pages */ #define NUM_BATCH_FREE_PAGES 10 -static inline int get_free_page(struct xen_blkif *blkif, struct page **page) +static inline int get_free_page(struct xen_blkif_ring *ring, struct page **page) { unsigned long flags; - spin_lock_irqsave(&blkif->free_pages_lock, flags); - if (list_empty(&blkif->free_pages)) { - BUG_ON(blkif->free_pages_num != 0); - spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + spin_lock_irqsave(&ring->free_pages_lock, flags); + if (list_empty(&ring->free_pages)) { + BUG_ON(ring->free_pages_num != 0); + spin_unlock_irqrestore(&ring->free_pages_lock, flags); return gnttab_alloc_pages(1, page); } - BUG_ON(blkif->free_pages_num == 0); - page[0] = list_first_entry(&blkif->free_pages, struct page, lru); + BUG_ON(ring->free_pages_num == 0); + page[0] = list_first_entry(&ring->free_pages, struct page, lru); list_del(&page[0]->lru); - blkif->free_pages_num--; - spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + ring->free_pages_num--; + spin_unlock_irqrestore(&ring->free_pages_lock, flags); return 0; } -static inline void put_free_pages(struct xen_blkif *blkif, struct page **page, +static inline void put_free_pages(struct xen_blkif_ring *ring, struct page **page, int num) { unsigned long flags; int i; - spin_lock_irqsave(&blkif->free_pages_lock, flags); + spin_lock_irqsave(&ring->free_pages_lock, flags); for (i = 0; i < num; i++) - list_add(&page[i]->lru, &blkif->free_pages); - blkif->free_pages_num += num; - spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + list_add(&page[i]->lru, &ring->free_pages); + ring->free_pages_num += num; + spin_unlock_irqrestore(&ring->free_pages_lock, flags); } -static inline void shrink_free_pagepool(struct xen_blkif *blkif, int num) +static inline void shrink_free_pagepool(struct xen_blkif_ring *ring, int num) { /* Remove requested pages in batches of NUM_BATCH_FREE_PAGES */ struct page *page[NUM_BATCH_FREE_PAGES]; unsigned int num_pages = 0; unsigned long flags; - spin_lock_irqsave(&blkif->free_pages_lock, flags); - while (blkif->free_pages_num > num) { - BUG_ON(list_empty(&blkif->free_pages)); - page[num_pages] = list_first_entry(&blkif->free_pages, + spin_lock_irqsave(&ring->free_pages_lock, flags); + while (ring->free_pages_num > num) { + BUG_ON(list_empty(&ring->free_pages)); + page[num_pages] = list_first_entry(&ring->free_pages, struct page, lru); list_del(&page[num_pages]->lru); - blkif->free_pages_num--; + ring->free_pages_num--; if (++num_pages == NUM_BATCH_FREE_PAGES) { - spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + spin_unlock_irqrestore(&ring->free_pages_lock, flags); gnttab_free_pages(num_pages, page); - spin_lock_irqsave(&blkif->free_pages_lock, flags); + spin_lock_irqsave(&ring->free_pages_lock, flags); num_pages = 0; } } - spin_unlock_irqrestore(&blkif->free_pages_lock, flags); + spin_unlock_irqrestore(&ring->free_pages_lock, flags); if (num_pages != 0) gnttab_free_pages(num_pages, page); } #define vaddr(page) ((unsigned long)pfn_to_kaddr(page_to_pfn(page))) -static int do_block_io_op(struct xen_blkif *blkif); -static int dispatch_rw_block_io(struct xen_blkif *blkif, +static int do_block_io_op(struct xen_blkif_ring *ring); +static int dispatch_rw_block_io(struct xen_blkif_ring *ring, struct blkif_request *req, struct pending_req *pending_req); -static void make_response(struct xen_blkif *blkif, u64 id, +static void make_response(struct xen_blkif_ring *ring, u64 id, 
unsigned short op, int st); #define foreach_grant_safe(pos, n, rbtree, node) \ @@ -190,7 +200,7 @@ static void make_response(struct xen_blkif *blkif, u64 id, /* * We don't need locking around the persistent grant helpers - * because blkback uses a single-thread for each backed, so we + * because blkback uses a single-thread for each backend, so we * can be sure that this functions will never be called recursively. * * The only exception to that is put_persistent_grant, that can be called @@ -198,19 +208,20 @@ static void make_response(struct xen_blkif *blkif, u64 id, * bit operations to modify the flags of a persistent grant and to count * the number of used grants. */ -static int add_persistent_gnt(struct xen_blkif *blkif, +static int add_persistent_gnt(struct xen_blkif_ring *ring, struct persistent_gnt *persistent_gnt) { struct rb_node **new = NULL, *parent = NULL; struct persistent_gnt *this; + struct xen_blkif *blkif = ring->blkif; - if (blkif->persistent_gnt_c >= xen_blkif_max_pgrants) { + if (ring->persistent_gnt_c >= xen_blkif_max_pgrants) { if (!blkif->vbd.overflow_max_grants) blkif->vbd.overflow_max_grants = 1; return -EBUSY; } /* Figure out where to put new node */ - new = &blkif->persistent_gnts.rb_node; + new = &ring->persistent_gnts.rb_node; while (*new) { this = container_of(*new, struct persistent_gnt, node); @@ -229,19 +240,19 @@ static int add_persistent_gnt(struct xen_blkif *blkif, set_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); /* Add new node and rebalance tree. */ rb_link_node(&(persistent_gnt->node), parent, new); - rb_insert_color(&(persistent_gnt->node), &blkif->persistent_gnts); - blkif->persistent_gnt_c++; - atomic_inc(&blkif->persistent_gnt_in_use); + rb_insert_color(&(persistent_gnt->node), &ring->persistent_gnts); + ring->persistent_gnt_c++; + atomic_inc(&ring->persistent_gnt_in_use); return 0; } -static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif, +static struct persistent_gnt *get_persistent_gnt(struct xen_blkif_ring *ring, grant_ref_t gref) { struct persistent_gnt *data; struct rb_node *node = NULL; - node = blkif->persistent_gnts.rb_node; + node = ring->persistent_gnts.rb_node; while (node) { data = container_of(node, struct persistent_gnt, node); @@ -255,24 +266,24 @@ static struct persistent_gnt *get_persistent_gnt(struct xen_blkif *blkif, return NULL; } set_bit(PERSISTENT_GNT_ACTIVE, data->flags); - atomic_inc(&blkif->persistent_gnt_in_use); + atomic_inc(&ring->persistent_gnt_in_use); return data; } } return NULL; } -static void put_persistent_gnt(struct xen_blkif *blkif, +static void put_persistent_gnt(struct xen_blkif_ring *ring, struct persistent_gnt *persistent_gnt) { if(!test_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags)) pr_alert_ratelimited("freeing a grant already unused\n"); set_bit(PERSISTENT_GNT_WAS_ACTIVE, persistent_gnt->flags); clear_bit(PERSISTENT_GNT_ACTIVE, persistent_gnt->flags); - atomic_dec(&blkif->persistent_gnt_in_use); + atomic_dec(&ring->persistent_gnt_in_use); } -static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root, +static void free_persistent_gnts(struct xen_blkif_ring *ring, struct rb_root *root, unsigned int num) { struct gnttab_unmap_grant_ref unmap[BLKIF_MAX_SEGMENTS_PER_REQUEST]; @@ -303,7 +314,7 @@ static void free_persistent_gnts(struct xen_blkif *blkif, struct rb_root *root, unmap_data.count = segs_to_unmap; BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); - put_free_pages(blkif, pages, segs_to_unmap); + put_free_pages(ring, pages, segs_to_unmap); segs_to_unmap = 0; 
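		/*
		 * Note: the unmaps above are flushed in batches of
		 * BLKIF_MAX_SEGMENTS_PER_REQUEST, which is why segs_to_unmap
		 * restarts from zero before the next batch is gathered.
		 */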
} @@ -320,15 +331,15 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work) struct page *pages[BLKIF_MAX_SEGMENTS_PER_REQUEST]; struct persistent_gnt *persistent_gnt; int segs_to_unmap = 0; - struct xen_blkif *blkif = container_of(work, typeof(*blkif), persistent_purge_work); + struct xen_blkif_ring *ring = container_of(work, typeof(*ring), persistent_purge_work); struct gntab_unmap_queue_data unmap_data; unmap_data.pages = pages; unmap_data.unmap_ops = unmap; unmap_data.kunmap_ops = NULL; - while(!list_empty(&blkif->persistent_purge_list)) { - persistent_gnt = list_first_entry(&blkif->persistent_purge_list, + while(!list_empty(&ring->persistent_purge_list)) { + persistent_gnt = list_first_entry(&ring->persistent_purge_list, struct persistent_gnt, remove_node); list_del(&persistent_gnt->remove_node); @@ -343,7 +354,7 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work) if (++segs_to_unmap == BLKIF_MAX_SEGMENTS_PER_REQUEST) { unmap_data.count = segs_to_unmap; BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); - put_free_pages(blkif, pages, segs_to_unmap); + put_free_pages(ring, pages, segs_to_unmap); segs_to_unmap = 0; } kfree(persistent_gnt); @@ -351,11 +362,11 @@ void xen_blkbk_unmap_purged_grants(struct work_struct *work) if (segs_to_unmap > 0) { unmap_data.count = segs_to_unmap; BUG_ON(gnttab_unmap_refs_sync(&unmap_data)); - put_free_pages(blkif, pages, segs_to_unmap); + put_free_pages(ring, pages, segs_to_unmap); } } -static void purge_persistent_gnt(struct xen_blkif *blkif) +static void purge_persistent_gnt(struct xen_blkif_ring *ring) { struct persistent_gnt *persistent_gnt; struct rb_node *n; @@ -363,23 +374,23 @@ static void purge_persistent_gnt(struct xen_blkif *blkif) bool scan_used = false, clean_used = false; struct rb_root *root; - if (blkif->persistent_gnt_c < xen_blkif_max_pgrants || - (blkif->persistent_gnt_c == xen_blkif_max_pgrants && - !blkif->vbd.overflow_max_grants)) { - return; + if (ring->persistent_gnt_c < xen_blkif_max_pgrants || + (ring->persistent_gnt_c == xen_blkif_max_pgrants && + !ring->blkif->vbd.overflow_max_grants)) { + goto out; } - if (work_busy(&blkif->persistent_purge_work)) { + if (work_busy(&ring->persistent_purge_work)) { pr_alert_ratelimited("Scheduled work from previous purge is still busy, cannot purge list\n"); - return; + goto out; } num_clean = (xen_blkif_max_pgrants / 100) * LRU_PERCENT_CLEAN; - num_clean = blkif->persistent_gnt_c - xen_blkif_max_pgrants + num_clean; - num_clean = min(blkif->persistent_gnt_c, num_clean); + num_clean = ring->persistent_gnt_c - xen_blkif_max_pgrants + num_clean; + num_clean = min(ring->persistent_gnt_c, num_clean); if ((num_clean == 0) || - (num_clean > (blkif->persistent_gnt_c - atomic_read(&blkif->persistent_gnt_in_use)))) - return; + (num_clean > (ring->persistent_gnt_c - atomic_read(&ring->persistent_gnt_in_use)))) + goto out; /* * At this point, we can assure that there will be no calls @@ -394,8 +405,8 @@ static void purge_persistent_gnt(struct xen_blkif *blkif) pr_debug("Going to purge %u persistent grants\n", num_clean); - BUG_ON(!list_empty(&blkif->persistent_purge_list)); - root = &blkif->persistent_gnts; + BUG_ON(!list_empty(&ring->persistent_purge_list)); + root = &ring->persistent_gnts; purge_list: foreach_grant_safe(persistent_gnt, n, root, node) { BUG_ON(persistent_gnt->handle == @@ -414,7 +425,7 @@ purge_list: rb_erase(&persistent_gnt->node, root); list_add(&persistent_gnt->remove_node, - &blkif->persistent_purge_list); + &ring->persistent_purge_list); if (--num_clean == 0) goto 
finished; } @@ -435,30 +446,32 @@ finished: goto purge_list; } - blkif->persistent_gnt_c -= (total - num_clean); - blkif->vbd.overflow_max_grants = 0; + ring->persistent_gnt_c -= (total - num_clean); + ring->blkif->vbd.overflow_max_grants = 0; /* We can defer this work */ - schedule_work(&blkif->persistent_purge_work); + schedule_work(&ring->persistent_purge_work); pr_debug("Purged %u/%u\n", (total - num_clean), total); + +out: return; } /* * Retrieve from the 'pending_reqs' a free pending_req structure to be used. */ -static struct pending_req *alloc_req(struct xen_blkif *blkif) +static struct pending_req *alloc_req(struct xen_blkif_ring *ring) { struct pending_req *req = NULL; unsigned long flags; - spin_lock_irqsave(&blkif->pending_free_lock, flags); - if (!list_empty(&blkif->pending_free)) { - req = list_entry(blkif->pending_free.next, struct pending_req, + spin_lock_irqsave(&ring->pending_free_lock, flags); + if (!list_empty(&ring->pending_free)) { + req = list_entry(ring->pending_free.next, struct pending_req, free_list); list_del(&req->free_list); } - spin_unlock_irqrestore(&blkif->pending_free_lock, flags); + spin_unlock_irqrestore(&ring->pending_free_lock, flags); return req; } @@ -466,17 +479,17 @@ static struct pending_req *alloc_req(struct xen_blkif *blkif) * Return the 'pending_req' structure back to the freepool. We also * wake up the thread if it was waiting for a free page. */ -static void free_req(struct xen_blkif *blkif, struct pending_req *req) +static void free_req(struct xen_blkif_ring *ring, struct pending_req *req) { unsigned long flags; int was_empty; - spin_lock_irqsave(&blkif->pending_free_lock, flags); - was_empty = list_empty(&blkif->pending_free); - list_add(&req->free_list, &blkif->pending_free); - spin_unlock_irqrestore(&blkif->pending_free_lock, flags); + spin_lock_irqsave(&ring->pending_free_lock, flags); + was_empty = list_empty(&ring->pending_free); + list_add(&req->free_list, &ring->pending_free); + spin_unlock_irqrestore(&ring->pending_free_lock, flags); if (was_empty) - wake_up(&blkif->pending_free_wq); + wake_up(&ring->pending_free_wq); } /* @@ -556,10 +569,10 @@ abort: /* * Notification from the guest OS. 
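 *
 * The event-channel interrupt only marks the ring as having work and
 * wakes its kthread; the actual request processing happens in
 * xen_blkif_schedule() below.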
*/ -static void blkif_notify_work(struct xen_blkif *blkif) +static void blkif_notify_work(struct xen_blkif_ring *ring) { - blkif->waiting_reqs = 1; - wake_up(&blkif->wq); + ring->waiting_reqs = 1; + wake_up(&ring->wq); } irqreturn_t xen_blkif_be_int(int irq, void *dev_id) @@ -572,31 +585,33 @@ irqreturn_t xen_blkif_be_int(int irq, void *dev_id) * SCHEDULER FUNCTIONS */ -static void print_stats(struct xen_blkif *blkif) +static void print_stats(struct xen_blkif_ring *ring) { pr_info("(%s): oo %3llu | rd %4llu | wr %4llu | f %4llu" " | ds %4llu | pg: %4u/%4d\n", - current->comm, blkif->st_oo_req, - blkif->st_rd_req, blkif->st_wr_req, - blkif->st_f_req, blkif->st_ds_req, - blkif->persistent_gnt_c, + current->comm, ring->st_oo_req, + ring->st_rd_req, ring->st_wr_req, + ring->st_f_req, ring->st_ds_req, + ring->persistent_gnt_c, xen_blkif_max_pgrants); - blkif->st_print = jiffies + msecs_to_jiffies(10 * 1000); - blkif->st_rd_req = 0; - blkif->st_wr_req = 0; - blkif->st_oo_req = 0; - blkif->st_ds_req = 0; + ring->st_print = jiffies + msecs_to_jiffies(10 * 1000); + ring->st_rd_req = 0; + ring->st_wr_req = 0; + ring->st_oo_req = 0; + ring->st_ds_req = 0; } int xen_blkif_schedule(void *arg) { - struct xen_blkif *blkif = arg; + struct xen_blkif_ring *ring = arg; + struct xen_blkif *blkif = ring->blkif; struct xen_vbd *vbd = &blkif->vbd; unsigned long timeout; int ret; xen_blkif_get(blkif); + set_freezable(); while (!kthread_should_stop()) { if (try_to_freeze()) continue; @@ -606,50 +621,50 @@ int xen_blkif_schedule(void *arg) timeout = msecs_to_jiffies(LRU_INTERVAL); timeout = wait_event_interruptible_timeout( - blkif->wq, - blkif->waiting_reqs || kthread_should_stop(), + ring->wq, + ring->waiting_reqs || kthread_should_stop(), timeout); if (timeout == 0) goto purge_gnt_list; timeout = wait_event_interruptible_timeout( - blkif->pending_free_wq, - !list_empty(&blkif->pending_free) || + ring->pending_free_wq, + !list_empty(&ring->pending_free) || kthread_should_stop(), timeout); if (timeout == 0) goto purge_gnt_list; - blkif->waiting_reqs = 0; + ring->waiting_reqs = 0; smp_mb(); /* clear flag *before* checking for work */ - ret = do_block_io_op(blkif); + ret = do_block_io_op(ring); if (ret > 0) - blkif->waiting_reqs = 1; + ring->waiting_reqs = 1; if (ret == -EACCES) - wait_event_interruptible(blkif->shutdown_wq, + wait_event_interruptible(ring->shutdown_wq, kthread_should_stop()); purge_gnt_list: if (blkif->vbd.feature_gnt_persistent && - time_after(jiffies, blkif->next_lru)) { - purge_persistent_gnt(blkif); - blkif->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL); + time_after(jiffies, ring->next_lru)) { + purge_persistent_gnt(ring); + ring->next_lru = jiffies + msecs_to_jiffies(LRU_INTERVAL); } /* Shrink if we have more than xen_blkif_max_buffer_pages */ - shrink_free_pagepool(blkif, xen_blkif_max_buffer_pages); + shrink_free_pagepool(ring, xen_blkif_max_buffer_pages); - if (log_stats && time_after(jiffies, blkif->st_print)) - print_stats(blkif); + if (log_stats && time_after(jiffies, ring->st_print)) + print_stats(ring); } /* Drain pending purge work */ - flush_work(&blkif->persistent_purge_work); + flush_work(&ring->persistent_purge_work); if (log_stats) - print_stats(blkif); + print_stats(ring); - blkif->xenblkd = NULL; + ring->xenblkd = NULL; xen_blkif_put(blkif); return 0; @@ -658,22 +673,22 @@ purge_gnt_list: /* * Remove persistent grants and empty the pool of free pages */ -void xen_blkbk_free_caches(struct xen_blkif *blkif) +void xen_blkbk_free_caches(struct xen_blkif_ring *ring) { /* Free 
all persistent grant pages */ - if (!RB_EMPTY_ROOT(&blkif->persistent_gnts)) - free_persistent_gnts(blkif, &blkif->persistent_gnts, - blkif->persistent_gnt_c); + if (!RB_EMPTY_ROOT(&ring->persistent_gnts)) + free_persistent_gnts(ring, &ring->persistent_gnts, + ring->persistent_gnt_c); - BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); - blkif->persistent_gnt_c = 0; + BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts)); + ring->persistent_gnt_c = 0; /* Since we are shutting down remove all pages from the buffer */ - shrink_free_pagepool(blkif, 0 /* All */); + shrink_free_pagepool(ring, 0 /* All */); } static unsigned int xen_blkbk_unmap_prepare( - struct xen_blkif *blkif, + struct xen_blkif_ring *ring, struct grant_page **pages, unsigned int num, struct gnttab_unmap_grant_ref *unmap_ops, @@ -683,7 +698,7 @@ static unsigned int xen_blkbk_unmap_prepare( for (i = 0; i < num; i++) { if (pages[i]->persistent_gnt != NULL) { - put_persistent_gnt(blkif, pages[i]->persistent_gnt); + put_persistent_gnt(ring, pages[i]->persistent_gnt); continue; } if (pages[i]->handle == BLKBACK_INVALID_HANDLE) @@ -700,17 +715,18 @@ static unsigned int xen_blkbk_unmap_prepare( static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_queue_data *data) { - struct pending_req* pending_req = (struct pending_req*) (data->data); - struct xen_blkif *blkif = pending_req->blkif; + struct pending_req *pending_req = (struct pending_req *)(data->data); + struct xen_blkif_ring *ring = pending_req->ring; + struct xen_blkif *blkif = ring->blkif; /* BUG_ON used to reproduce existing behaviour, but is this the best way to deal with this? */ BUG_ON(result); - put_free_pages(blkif, data->pages, data->count); - make_response(blkif, pending_req->id, + put_free_pages(ring, data->pages, data->count); + make_response(ring, pending_req->id, pending_req->operation, pending_req->status); - free_req(blkif, pending_req); + free_req(ring, pending_req); /* * Make sure the request is freed before releasing blkif, * or there could be a race between free_req and the @@ -723,7 +739,7 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_ * pending_free_wq if there's a drain going on, but it has * to be taken into account if the current model is changed. */ - if (atomic_dec_and_test(&blkif->inflight) && atomic_read(&blkif->drain)) { + if (atomic_dec_and_test(&ring->inflight) && atomic_read(&blkif->drain)) { complete(&blkif->drain_complete); } xen_blkif_put(blkif); @@ -732,11 +748,11 @@ static void xen_blkbk_unmap_and_respond_callback(int result, struct gntab_unmap_ static void xen_blkbk_unmap_and_respond(struct pending_req *req) { struct gntab_unmap_queue_data* work = &req->gnttab_unmap_data; - struct xen_blkif *blkif = req->blkif; + struct xen_blkif_ring *ring = req->ring; struct grant_page **pages = req->segments; unsigned int invcount; - invcount = xen_blkbk_unmap_prepare(blkif, pages, req->nr_segs, + invcount = xen_blkbk_unmap_prepare(ring, pages, req->nr_segs, req->unmap, req->unmap_pages); work->data = req; @@ -757,7 +773,7 @@ static void xen_blkbk_unmap_and_respond(struct pending_req *req) * of hypercalls, but since this is only used in error paths there's * no real need. 
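 * (The normal completion path uses the asynchronous
 * xen_blkbk_unmap_and_respond() variant above instead.)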
*/ -static void xen_blkbk_unmap(struct xen_blkif *blkif, +static void xen_blkbk_unmap(struct xen_blkif_ring *ring, struct grant_page *pages[], int num) { @@ -768,20 +784,20 @@ static void xen_blkbk_unmap(struct xen_blkif *blkif, while (num) { unsigned int batch = min(num, BLKIF_MAX_SEGMENTS_PER_REQUEST); - - invcount = xen_blkbk_unmap_prepare(blkif, pages, batch, + + invcount = xen_blkbk_unmap_prepare(ring, pages, batch, unmap, unmap_pages); if (invcount) { ret = gnttab_unmap_refs(unmap, NULL, unmap_pages, invcount); BUG_ON(ret); - put_free_pages(blkif, unmap_pages, invcount); + put_free_pages(ring, unmap_pages, invcount); } pages += batch; num -= batch; } } -static int xen_blkbk_map(struct xen_blkif *blkif, +static int xen_blkbk_map(struct xen_blkif_ring *ring, struct grant_page *pages[], int num, bool ro) { @@ -794,6 +810,7 @@ static int xen_blkbk_map(struct xen_blkif *blkif, int ret = 0; int last_map = 0, map_until = 0; int use_persistent_gnts; + struct xen_blkif *blkif = ring->blkif; use_persistent_gnts = (blkif->vbd.feature_gnt_persistent); @@ -806,10 +823,11 @@ again: for (i = map_until; i < num; i++) { uint32_t flags; - if (use_persistent_gnts) + if (use_persistent_gnts) { persistent_gnt = get_persistent_gnt( - blkif, + ring, pages[i]->gref); + } if (persistent_gnt) { /* @@ -819,7 +837,7 @@ again: pages[i]->page = persistent_gnt->page; pages[i]->persistent_gnt = persistent_gnt; } else { - if (get_free_page(blkif, &pages[i]->page)) + if (get_free_page(ring, &pages[i]->page)) goto out_of_memory; addr = vaddr(pages[i]->page); pages_to_gnt[segs_to_map] = pages[i]->page; @@ -852,7 +870,7 @@ again: BUG_ON(new_map_idx >= segs_to_map); if (unlikely(map[new_map_idx].status != 0)) { pr_debug("invalid buffer -- could not remap it\n"); - put_free_pages(blkif, &pages[seg_idx]->page, 1); + put_free_pages(ring, &pages[seg_idx]->page, 1); pages[seg_idx]->handle = BLKBACK_INVALID_HANDLE; ret |= 1; goto next; @@ -862,7 +880,7 @@ again: continue; } if (use_persistent_gnts && - blkif->persistent_gnt_c < xen_blkif_max_pgrants) { + ring->persistent_gnt_c < xen_blkif_max_pgrants) { /* * We are using persistent grants, the grant is * not mapped but we might have room for it. 
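 * If there is room, the grant that was just mapped is promoted into
 * the ring's rb-tree cache (add_persistent_gnt() in the next hunk) so
 * later requests can reuse the mapping without another map hypercall.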
@@ -880,7 +898,7 @@ again: persistent_gnt->gnt = map[new_map_idx].ref; persistent_gnt->handle = map[new_map_idx].handle; persistent_gnt->page = pages[seg_idx]->page; - if (add_persistent_gnt(blkif, + if (add_persistent_gnt(ring, persistent_gnt)) { kfree(persistent_gnt); persistent_gnt = NULL; @@ -888,7 +906,7 @@ again: } pages[seg_idx]->persistent_gnt = persistent_gnt; pr_debug("grant %u added to the tree of persistent grants, using %u/%u\n", - persistent_gnt->gnt, blkif->persistent_gnt_c, + persistent_gnt->gnt, ring->persistent_gnt_c, xen_blkif_max_pgrants); goto next; } @@ -913,7 +931,7 @@ next: out_of_memory: pr_alert("%s: out of memory\n", __func__); - put_free_pages(blkif, pages_to_gnt, segs_to_map); + put_free_pages(ring, pages_to_gnt, segs_to_map); return -ENOMEM; } @@ -921,7 +939,7 @@ static int xen_blkbk_map_seg(struct pending_req *pending_req) { int rc; - rc = xen_blkbk_map(pending_req->blkif, pending_req->segments, + rc = xen_blkbk_map(pending_req->ring, pending_req->segments, pending_req->nr_segs, (pending_req->operation != BLKIF_OP_READ)); @@ -934,7 +952,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, struct phys_req *preq) { struct grant_page **pages = pending_req->indirect_pages; - struct xen_blkif *blkif = pending_req->blkif; + struct xen_blkif_ring *ring = pending_req->ring; int indirect_grefs, rc, n, nseg, i; struct blkif_request_segment *segments = NULL; @@ -945,7 +963,7 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, for (i = 0; i < indirect_grefs; i++) pages[i]->gref = req->u.indirect.indirect_grefs[i]; - rc = xen_blkbk_map(blkif, pages, indirect_grefs, true); + rc = xen_blkbk_map(ring, pages, indirect_grefs, true); if (rc) goto unmap; @@ -977,15 +995,16 @@ static int xen_blkbk_parse_indirect(struct blkif_request *req, unmap: if (segments) kunmap_atomic(segments); - xen_blkbk_unmap(blkif, pages, indirect_grefs); + xen_blkbk_unmap(ring, pages, indirect_grefs); return rc; } -static int dispatch_discard_io(struct xen_blkif *blkif, +static int dispatch_discard_io(struct xen_blkif_ring *ring, struct blkif_request *req) { int err = 0; int status = BLKIF_RSP_OKAY; + struct xen_blkif *blkif = ring->blkif; struct block_device *bdev = blkif->vbd.bdev; unsigned long secure; struct phys_req preq; @@ -1002,7 +1021,7 @@ static int dispatch_discard_io(struct xen_blkif *blkif, preq.sector_number + preq.nr_sects, blkif->vbd.pdevice); goto fail_response; } - blkif->st_ds_req++; + ring->st_ds_req++; secure = (blkif->vbd.discard_secure && (req->u.discard.flag & BLKIF_DISCARD_SECURE)) ? 
@@ -1018,26 +1037,28 @@ fail_response: } else if (err) status = BLKIF_RSP_ERROR; - make_response(blkif, req->u.discard.id, req->operation, status); + make_response(ring, req->u.discard.id, req->operation, status); xen_blkif_put(blkif); return err; } -static int dispatch_other_io(struct xen_blkif *blkif, +static int dispatch_other_io(struct xen_blkif_ring *ring, struct blkif_request *req, struct pending_req *pending_req) { - free_req(blkif, pending_req); - make_response(blkif, req->u.other.id, req->operation, + free_req(ring, pending_req); + make_response(ring, req->u.other.id, req->operation, BLKIF_RSP_EOPNOTSUPP); return -EIO; } -static void xen_blk_drain_io(struct xen_blkif *blkif) +static void xen_blk_drain_io(struct xen_blkif_ring *ring) { + struct xen_blkif *blkif = ring->blkif; + atomic_set(&blkif->drain, 1); do { - if (atomic_read(&blkif->inflight) == 0) + if (atomic_read(&ring->inflight) == 0) break; wait_for_completion_interruptible_timeout( &blkif->drain_complete, HZ); @@ -1058,12 +1079,12 @@ static void __end_block_io_op(struct pending_req *pending_req, int error) if ((pending_req->operation == BLKIF_OP_FLUSH_DISKCACHE) && (error == -EOPNOTSUPP)) { pr_debug("flush diskcache op failed, not supported\n"); - xen_blkbk_flush_diskcache(XBT_NIL, pending_req->blkif->be, 0); + xen_blkbk_flush_diskcache(XBT_NIL, pending_req->ring->blkif->be, 0); pending_req->status = BLKIF_RSP_EOPNOTSUPP; } else if ((pending_req->operation == BLKIF_OP_WRITE_BARRIER) && (error == -EOPNOTSUPP)) { pr_debug("write barrier op failed, not supported\n"); - xen_blkbk_barrier(XBT_NIL, pending_req->blkif->be, 0); + xen_blkbk_barrier(XBT_NIL, pending_req->ring->blkif->be, 0); pending_req->status = BLKIF_RSP_EOPNOTSUPP; } else if (error) { pr_debug("Buffer not up-to-date at end of operation," @@ -1097,9 +1118,9 @@ static void end_block_io_op(struct bio *bio) * and transmute it to the block API to hand it over to the proper block disk. */ static int -__do_block_io_op(struct xen_blkif *blkif) +__do_block_io_op(struct xen_blkif_ring *ring) { - union blkif_back_rings *blk_rings = &blkif->blk_rings; + union blkif_back_rings *blk_rings = &ring->blk_rings; struct blkif_request req; struct pending_req *pending_req; RING_IDX rc, rp; @@ -1112,7 +1133,7 @@ __do_block_io_op(struct xen_blkif *blkif) if (RING_REQUEST_PROD_OVERFLOW(&blk_rings->common, rp)) { rc = blk_rings->common.rsp_prod_pvt; pr_warn("Frontend provided bogus ring requests (%d - %d = %d). 
Halting ring processing on dev=%04x\n", - rp, rc, rp - rc, blkif->vbd.pdevice); + rp, rc, rp - rc, ring->blkif->vbd.pdevice); return -EACCES; } while (rc != rp) { @@ -1125,14 +1146,14 @@ __do_block_io_op(struct xen_blkif *blkif) break; } - pending_req = alloc_req(blkif); + pending_req = alloc_req(ring); if (NULL == pending_req) { - blkif->st_oo_req++; + ring->st_oo_req++; more_to_do = 1; break; } - switch (blkif->blk_protocol) { + switch (ring->blkif->blk_protocol) { case BLKIF_PROTOCOL_NATIVE: memcpy(&req, RING_GET_REQUEST(&blk_rings->native, rc), sizeof(req)); break; @@ -1156,16 +1177,16 @@ __do_block_io_op(struct xen_blkif *blkif) case BLKIF_OP_WRITE_BARRIER: case BLKIF_OP_FLUSH_DISKCACHE: case BLKIF_OP_INDIRECT: - if (dispatch_rw_block_io(blkif, &req, pending_req)) + if (dispatch_rw_block_io(ring, &req, pending_req)) goto done; break; case BLKIF_OP_DISCARD: - free_req(blkif, pending_req); - if (dispatch_discard_io(blkif, &req)) + free_req(ring, pending_req); + if (dispatch_discard_io(ring, &req)) goto done; break; default: - if (dispatch_other_io(blkif, &req, pending_req)) + if (dispatch_other_io(ring, &req, pending_req)) goto done; break; } @@ -1178,13 +1199,13 @@ done: } static int -do_block_io_op(struct xen_blkif *blkif) +do_block_io_op(struct xen_blkif_ring *ring) { - union blkif_back_rings *blk_rings = &blkif->blk_rings; + union blkif_back_rings *blk_rings = &ring->blk_rings; int more_to_do; do { - more_to_do = __do_block_io_op(blkif); + more_to_do = __do_block_io_op(ring); if (more_to_do) break; @@ -1197,7 +1218,7 @@ do_block_io_op(struct xen_blkif *blkif) * Transmutation of the 'struct blkif_request' to a proper 'struct bio' * and call the 'submit_bio' to pass it to the underlying storage. */ -static int dispatch_rw_block_io(struct xen_blkif *blkif, +static int dispatch_rw_block_io(struct xen_blkif_ring *ring, struct blkif_request *req, struct pending_req *pending_req) { @@ -1225,17 +1246,17 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, switch (req_operation) { case BLKIF_OP_READ: - blkif->st_rd_req++; + ring->st_rd_req++; operation = READ; break; case BLKIF_OP_WRITE: - blkif->st_wr_req++; + ring->st_wr_req++; operation = WRITE_ODIRECT; break; case BLKIF_OP_WRITE_BARRIER: drain = true; case BLKIF_OP_FLUSH_DISKCACHE: - blkif->st_f_req++; + ring->st_f_req++; operation = WRITE_FLUSH; break; default: @@ -1260,7 +1281,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, preq.nr_sects = 0; - pending_req->blkif = blkif; + pending_req->ring = ring; pending_req->id = req->u.rw.id; pending_req->operation = req_operation; pending_req->status = BLKIF_RSP_OKAY; @@ -1287,12 +1308,12 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, goto fail_response; } - if (xen_vbd_translate(&preq, blkif, operation) != 0) { + if (xen_vbd_translate(&preq, ring->blkif, operation) != 0) { pr_debug("access denied: %s of [%llu,%llu] on dev=%04x\n", operation == READ ? "read" : "write", preq.sector_number, preq.sector_number + preq.nr_sects, - blkif->vbd.pdevice); + ring->blkif->vbd.pdevice); goto fail_response; } @@ -1304,7 +1325,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, if (((int)preq.sector_number|(int)seg[i].nsec) & ((bdev_logical_block_size(preq.bdev) >> 9) - 1)) { pr_debug("Misaligned I/O request from domain %d\n", - blkif->domid); + ring->blkif->domid); goto fail_response; } } @@ -1313,7 +1334,7 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, * issue the WRITE_FLUSH. 
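 * Draining waits for every in-flight request on this ring to complete
 * before the flush is submitted, preserving barrier ordering.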
*/ if (drain) - xen_blk_drain_io(pending_req->blkif); + xen_blk_drain_io(pending_req->ring); /* * If we have failed at this point, we need to undo the M2P override, @@ -1328,8 +1349,8 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, * This corresponding xen_blkif_put is done in __end_block_io_op, or * below (in "!bio") if we are handling a BLKIF_OP_DISCARD. */ - xen_blkif_get(blkif); - atomic_inc(&blkif->inflight); + xen_blkif_get(ring->blkif); + atomic_inc(&ring->inflight); for (i = 0; i < nseg; i++) { while ((bio == NULL) || @@ -1377,19 +1398,19 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, blk_finish_plug(&plug); if (operation == READ) - blkif->st_rd_sect += preq.nr_sects; + ring->st_rd_sect += preq.nr_sects; else if (operation & WRITE) - blkif->st_wr_sect += preq.nr_sects; + ring->st_wr_sect += preq.nr_sects; return 0; fail_flush: - xen_blkbk_unmap(blkif, pending_req->segments, + xen_blkbk_unmap(ring, pending_req->segments, pending_req->nr_segs); fail_response: /* Haven't submitted any bio's yet. */ - make_response(blkif, req->u.rw.id, req_operation, BLKIF_RSP_ERROR); - free_req(blkif, pending_req); + make_response(ring, req->u.rw.id, req_operation, BLKIF_RSP_ERROR); + free_req(ring, pending_req); msleep(1); /* back off a bit */ return -EIO; @@ -1407,21 +1428,22 @@ static int dispatch_rw_block_io(struct xen_blkif *blkif, /* * Put a response on the ring on how the operation fared. */ -static void make_response(struct xen_blkif *blkif, u64 id, +static void make_response(struct xen_blkif_ring *ring, u64 id, unsigned short op, int st) { struct blkif_response resp; unsigned long flags; - union blkif_back_rings *blk_rings = &blkif->blk_rings; + union blkif_back_rings *blk_rings; int notify; resp.id = id; resp.operation = op; resp.status = st; - spin_lock_irqsave(&blkif->blk_ring_lock, flags); + spin_lock_irqsave(&ring->blk_ring_lock, flags); + blk_rings = &ring->blk_rings; /* Place on the response ring for the relevant domain. */ - switch (blkif->blk_protocol) { + switch (ring->blkif->blk_protocol) { case BLKIF_PROTOCOL_NATIVE: memcpy(RING_GET_RESPONSE(&blk_rings->native, blk_rings->native.rsp_prod_pvt), &resp, sizeof(resp)); @@ -1439,9 +1461,9 @@ static void make_response(struct xen_blkif *blkif, u64 id, } blk_rings->common.rsp_prod_pvt++; RING_PUSH_RESPONSES_AND_CHECK_NOTIFY(&blk_rings->common, notify); - spin_unlock_irqrestore(&blkif->blk_ring_lock, flags); + spin_unlock_irqrestore(&ring->blk_ring_lock, flags); if (notify) - notify_remote_via_irq(blkif->irq); + notify_remote_via_irq(ring->irq); } static int __init xen_blkif_init(void) @@ -1457,6 +1479,9 @@ static int __init xen_blkif_init(void) xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER; } + if (xenblk_max_queues == 0) + xenblk_max_queues = num_online_cpus(); + rc = xen_blkif_interface_init(); if (rc) goto failed_init; diff --git a/drivers/block/xen-blkback/common.h b/drivers/block/xen-blkback/common.h index c929ae22764c..dea61f6ab8cb 100644 --- a/drivers/block/xen-blkback/common.h +++ b/drivers/block/xen-blkback/common.h @@ -46,6 +46,7 @@ #include <xen/interface/io/protocols.h> extern unsigned int xen_blkif_max_ring_order; +extern unsigned int xenblk_max_queues; /* * This is the maximum number of segments that would be allowed in indirect * requests. This value will also be passed to the frontend. @@ -269,68 +270,79 @@ struct persistent_gnt { struct list_head remove_node; }; -struct xen_blkif { - /* Unique identifier for this interface. 
*/ - domid_t domid; - unsigned int handle; +/* Per-ring information. */ +struct xen_blkif_ring { /* Physical parameters of the comms window. */ unsigned int irq; - /* Comms information. */ - enum blkif_protocol blk_protocol; union blkif_back_rings blk_rings; void *blk_ring; - /* The VBD attached to this interface. */ - struct xen_vbd vbd; - /* Back pointer to the backend_info. */ - struct backend_info *be; /* Private fields. */ spinlock_t blk_ring_lock; - atomic_t refcnt; wait_queue_head_t wq; - /* for barrier (drain) requests */ - struct completion drain_complete; - atomic_t drain; atomic_t inflight; - /* One thread per one blkif. */ + /* One thread per blkif ring. */ struct task_struct *xenblkd; unsigned int waiting_reqs; - /* tree to store persistent grants */ + /* List of all 'pending_req' available */ + struct list_head pending_free; + /* And its spinlock. */ + spinlock_t pending_free_lock; + wait_queue_head_t pending_free_wq; + + /* Tree to store persistent grants. */ + spinlock_t pers_gnts_lock; struct rb_root persistent_gnts; unsigned int persistent_gnt_c; atomic_t persistent_gnt_in_use; unsigned long next_lru; - /* used by the kworker that offload work from the persistent purge */ + /* Statistics. */ + unsigned long st_print; + unsigned long long st_rd_req; + unsigned long long st_wr_req; + unsigned long long st_oo_req; + unsigned long long st_f_req; + unsigned long long st_ds_req; + unsigned long long st_rd_sect; + unsigned long long st_wr_sect; + + /* Used by the kworker that offload work from the persistent purge. */ struct list_head persistent_purge_list; struct work_struct persistent_purge_work; - /* buffer of free pages to map grant refs */ + /* Buffer of free pages to map grant refs. */ spinlock_t free_pages_lock; int free_pages_num; struct list_head free_pages; - /* List of all 'pending_req' available */ - struct list_head pending_free; - /* And its spinlock. */ - spinlock_t pending_free_lock; - wait_queue_head_t pending_free_wq; - - /* statistics */ - unsigned long st_print; - unsigned long long st_rd_req; - unsigned long long st_wr_req; - unsigned long long st_oo_req; - unsigned long long st_f_req; - unsigned long long st_ds_req; - unsigned long long st_rd_sect; - unsigned long long st_wr_sect; - struct work_struct free_work; /* Thread shutdown wait queue. */ wait_queue_head_t shutdown_wq; - unsigned int nr_ring_pages; + struct xen_blkif *blkif; +}; + +struct xen_blkif { + /* Unique identifier for this interface. */ + domid_t domid; + unsigned int handle; + /* Comms information. */ + enum blkif_protocol blk_protocol; + /* The VBD attached to this interface. */ + struct xen_vbd vbd; + /* Back pointer to the backend_info. */ + struct backend_info *be; + atomic_t refcnt; + /* for barrier (drain) requests */ + struct completion drain_complete; + atomic_t drain; + + struct work_struct free_work; + unsigned int nr_ring_pages; + /* All rings for this device. */ + struct xen_blkif_ring *rings; + unsigned int nr_rings; }; struct seg_buf { @@ -352,7 +364,7 @@ struct grant_page { * response queued for it, with the saved 'id' passed back. 
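 * With the multi-queue split, each pending_req records the ring it was
 * allocated from rather than the device as a whole.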
*/ struct pending_req { - struct xen_blkif *blkif; + struct xen_blkif_ring *ring; u64 id; int nr_segs; atomic_t pendcnt; @@ -394,7 +406,7 @@ int xen_blkif_xenbus_init(void); irqreturn_t xen_blkif_be_int(int irq, void *dev_id); int xen_blkif_schedule(void *arg); int xen_blkif_purge_persistent(void *arg); -void xen_blkbk_free_caches(struct xen_blkif *blkif); +void xen_blkbk_free_caches(struct xen_blkif_ring *ring); int xen_blkbk_flush_diskcache(struct xenbus_transaction xbt, struct backend_info *be, int state); diff --git a/drivers/block/xen-blkback/xenbus.c b/drivers/block/xen-blkback/xenbus.c index f53cff42f8da..876763f7f13e 100644 --- a/drivers/block/xen-blkback/xenbus.c +++ b/drivers/block/xen-blkback/xenbus.c @@ -86,9 +86,11 @@ static void xen_update_blkif_status(struct xen_blkif *blkif) { int err; char name[BLKBACK_NAME_LEN]; + struct xen_blkif_ring *ring; + int i; /* Not ready to connect? */ - if (!blkif->irq || !blkif->vbd.bdev) + if (!blkif->rings || !blkif->rings[0].irq || !blkif->vbd.bdev) return; /* Already connected? */ @@ -113,13 +115,55 @@ static void xen_update_blkif_status(struct xen_blkif *blkif) } invalidate_inode_pages2(blkif->vbd.bdev->bd_inode->i_mapping); - blkif->xenblkd = kthread_run(xen_blkif_schedule, blkif, "%s", name); - if (IS_ERR(blkif->xenblkd)) { - err = PTR_ERR(blkif->xenblkd); - blkif->xenblkd = NULL; - xenbus_dev_error(blkif->be->dev, err, "start xenblkd"); - return; + for (i = 0; i < blkif->nr_rings; i++) { + ring = &blkif->rings[i]; + ring->xenblkd = kthread_run(xen_blkif_schedule, ring, "%s-%d", name, i); + if (IS_ERR(ring->xenblkd)) { + err = PTR_ERR(ring->xenblkd); + ring->xenblkd = NULL; + xenbus_dev_fatal(blkif->be->dev, err, + "start %s-%d xenblkd", name, i); + goto out; + } + } + return; + +out: + while (--i >= 0) { + ring = &blkif->rings[i]; + kthread_stop(ring->xenblkd); + } + return; +} + +static int xen_blkif_alloc_rings(struct xen_blkif *blkif) +{ + unsigned int r; + + blkif->rings = kzalloc(blkif->nr_rings * sizeof(struct xen_blkif_ring), GFP_KERNEL); + if (!blkif->rings) + return -ENOMEM; + + for (r = 0; r < blkif->nr_rings; r++) { + struct xen_blkif_ring *ring = &blkif->rings[r]; + + spin_lock_init(&ring->blk_ring_lock); + init_waitqueue_head(&ring->wq); + INIT_LIST_HEAD(&ring->pending_free); + INIT_LIST_HEAD(&ring->persistent_purge_list); + INIT_WORK(&ring->persistent_purge_work, xen_blkbk_unmap_purged_grants); + spin_lock_init(&ring->free_pages_lock); + INIT_LIST_HEAD(&ring->free_pages); + + spin_lock_init(&ring->pending_free_lock); + init_waitqueue_head(&ring->pending_free_wq); + init_waitqueue_head(&ring->shutdown_wq); + ring->blkif = blkif; + ring->st_print = jiffies; + xen_blkif_get(blkif); } + + return 0; } static struct xen_blkif *xen_blkif_alloc(domid_t domid) @@ -133,41 +177,25 @@ static struct xen_blkif *xen_blkif_alloc(domid_t domid) return ERR_PTR(-ENOMEM); blkif->domid = domid; - spin_lock_init(&blkif->blk_ring_lock); atomic_set(&blkif->refcnt, 1); - init_waitqueue_head(&blkif->wq); init_completion(&blkif->drain_complete); - atomic_set(&blkif->drain, 0); - blkif->st_print = jiffies; - blkif->persistent_gnts.rb_node = NULL; - spin_lock_init(&blkif->free_pages_lock); - INIT_LIST_HEAD(&blkif->free_pages); - INIT_LIST_HEAD(&blkif->persistent_purge_list); - blkif->free_pages_num = 0; - atomic_set(&blkif->persistent_gnt_in_use, 0); - atomic_set(&blkif->inflight, 0); - INIT_WORK(&blkif->persistent_purge_work, xen_blkbk_unmap_purged_grants); - - INIT_LIST_HEAD(&blkif->pending_free); INIT_WORK(&blkif->free_work, 
xen_blkif_deferred_free); - spin_lock_init(&blkif->pending_free_lock); - init_waitqueue_head(&blkif->pending_free_wq); - init_waitqueue_head(&blkif->shutdown_wq); return blkif; } -static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, +static int xen_blkif_map(struct xen_blkif_ring *ring, grant_ref_t *gref, unsigned int nr_grefs, unsigned int evtchn) { int err; + struct xen_blkif *blkif = ring->blkif; /* Already connected through? */ - if (blkif->irq) + if (ring->irq) return 0; err = xenbus_map_ring_valloc(blkif->be->dev, gref, nr_grefs, - &blkif->blk_ring); + &ring->blk_ring); if (err < 0) return err; @@ -175,24 +203,24 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, case BLKIF_PROTOCOL_NATIVE: { struct blkif_sring *sring; - sring = (struct blkif_sring *)blkif->blk_ring; - BACK_RING_INIT(&blkif->blk_rings.native, sring, + sring = (struct blkif_sring *)ring->blk_ring; + BACK_RING_INIT(&ring->blk_rings.native, sring, XEN_PAGE_SIZE * nr_grefs); break; } case BLKIF_PROTOCOL_X86_32: { struct blkif_x86_32_sring *sring_x86_32; - sring_x86_32 = (struct blkif_x86_32_sring *)blkif->blk_ring; - BACK_RING_INIT(&blkif->blk_rings.x86_32, sring_x86_32, + sring_x86_32 = (struct blkif_x86_32_sring *)ring->blk_ring; + BACK_RING_INIT(&ring->blk_rings.x86_32, sring_x86_32, XEN_PAGE_SIZE * nr_grefs); break; } case BLKIF_PROTOCOL_X86_64: { struct blkif_x86_64_sring *sring_x86_64; - sring_x86_64 = (struct blkif_x86_64_sring *)blkif->blk_ring; - BACK_RING_INIT(&blkif->blk_rings.x86_64, sring_x86_64, + sring_x86_64 = (struct blkif_x86_64_sring *)ring->blk_ring; + BACK_RING_INIT(&ring->blk_rings.x86_64, sring_x86_64, XEN_PAGE_SIZE * nr_grefs); break; } @@ -202,13 +230,13 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, err = bind_interdomain_evtchn_to_irqhandler(blkif->domid, evtchn, xen_blkif_be_int, 0, - "blkif-backend", blkif); + "blkif-backend", ring); if (err < 0) { - xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring); - blkif->blk_rings.common.sring = NULL; + xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring); + ring->blk_rings.common.sring = NULL; return err; } - blkif->irq = err; + ring->irq = err; return 0; } @@ -216,50 +244,69 @@ static int xen_blkif_map(struct xen_blkif *blkif, grant_ref_t *gref, static int xen_blkif_disconnect(struct xen_blkif *blkif) { struct pending_req *req, *n; - int i = 0, j; + unsigned int j, r; - if (blkif->xenblkd) { - kthread_stop(blkif->xenblkd); - wake_up(&blkif->shutdown_wq); - blkif->xenblkd = NULL; - } + for (r = 0; r < blkif->nr_rings; r++) { + struct xen_blkif_ring *ring = &blkif->rings[r]; + unsigned int i = 0; - /* The above kthread_stop() guarantees that at this point we - * don't have any discard_io or other_io requests. So, checking - * for inflight IO is enough. - */ - if (atomic_read(&blkif->inflight) > 0) - return -EBUSY; + if (ring->xenblkd) { + kthread_stop(ring->xenblkd); + wake_up(&ring->shutdown_wq); + ring->xenblkd = NULL; + } - if (blkif->irq) { - unbind_from_irqhandler(blkif->irq, blkif); - blkif->irq = 0; - } + /* The above kthread_stop() guarantees that at this point we + * don't have any discard_io or other_io requests. So, checking + * for inflight IO is enough. 
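+	 * With multiple rings the check is now per ring: each ring's
+	 * kthread is stopped first, then its inflight counter is verified.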
+ */ + if (atomic_read(&ring->inflight) > 0) + return -EBUSY; - if (blkif->blk_rings.common.sring) { - xenbus_unmap_ring_vfree(blkif->be->dev, blkif->blk_ring); - blkif->blk_rings.common.sring = NULL; - } + if (ring->irq) { + unbind_from_irqhandler(ring->irq, ring); + ring->irq = 0; + } - /* Remove all persistent grants and the cache of ballooned pages. */ - xen_blkbk_free_caches(blkif); + if (ring->blk_rings.common.sring) { + xenbus_unmap_ring_vfree(blkif->be->dev, ring->blk_ring); + ring->blk_rings.common.sring = NULL; + } - /* Check that there is no request in use */ - list_for_each_entry_safe(req, n, &blkif->pending_free, free_list) { - list_del(&req->free_list); + /* Remove all persistent grants and the cache of ballooned pages. */ + xen_blkbk_free_caches(ring); - for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) - kfree(req->segments[j]); + /* Check that there is no request in use */ + list_for_each_entry_safe(req, n, &ring->pending_free, free_list) { + list_del(&req->free_list); - for (j = 0; j < MAX_INDIRECT_PAGES; j++) - kfree(req->indirect_pages[j]); + for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) + kfree(req->segments[j]); - kfree(req); - i++; - } + for (j = 0; j < MAX_INDIRECT_PAGES; j++) + kfree(req->indirect_pages[j]); + + kfree(req); + i++; + } - WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages)); + BUG_ON(atomic_read(&ring->persistent_gnt_in_use) != 0); + BUG_ON(!list_empty(&ring->persistent_purge_list)); + BUG_ON(!RB_EMPTY_ROOT(&ring->persistent_gnts)); + BUG_ON(!list_empty(&ring->free_pages)); + BUG_ON(ring->free_pages_num != 0); + BUG_ON(ring->persistent_gnt_c != 0); + WARN_ON(i != (XEN_BLKIF_REQS_PER_PAGE * blkif->nr_ring_pages)); + xen_blkif_put(blkif); + } blkif->nr_ring_pages = 0; + /* + * blkif->rings was allocated in connect_ring, so we should free it in + * here. + */ + kfree(blkif->rings); + blkif->rings = NULL; + blkif->nr_rings = 0; return 0; } @@ -271,13 +318,6 @@ static void xen_blkif_free(struct xen_blkif *blkif) xen_vbd_free(&blkif->vbd); /* Make sure everything is drained before shutting down */ - BUG_ON(blkif->persistent_gnt_c != 0); - BUG_ON(atomic_read(&blkif->persistent_gnt_in_use) != 0); - BUG_ON(blkif->free_pages_num != 0); - BUG_ON(!list_empty(&blkif->persistent_purge_list)); - BUG_ON(!list_empty(&blkif->free_pages)); - BUG_ON(!RB_EMPTY_ROOT(&blkif->persistent_gnts)); - kmem_cache_free(xen_blkif_cachep, blkif); } @@ -296,25 +336,38 @@ int __init xen_blkif_interface_init(void) * sysfs interface for VBD I/O requests */ -#define VBD_SHOW(name, format, args...) 
\ +#define VBD_SHOW_ALLRING(name, format) \ static ssize_t show_##name(struct device *_dev, \ struct device_attribute *attr, \ char *buf) \ { \ struct xenbus_device *dev = to_xenbus_device(_dev); \ struct backend_info *be = dev_get_drvdata(&dev->dev); \ + struct xen_blkif *blkif = be->blkif; \ + unsigned int i; \ + unsigned long long result = 0; \ \ - return sprintf(buf, format, ##args); \ + if (!blkif->rings) \ + goto out; \ + \ + for (i = 0; i < blkif->nr_rings; i++) { \ + struct xen_blkif_ring *ring = &blkif->rings[i]; \ + \ + result += ring->st_##name; \ + } \ + \ +out: \ + return sprintf(buf, format, result); \ } \ static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) -VBD_SHOW(oo_req, "%llu\n", be->blkif->st_oo_req); -VBD_SHOW(rd_req, "%llu\n", be->blkif->st_rd_req); -VBD_SHOW(wr_req, "%llu\n", be->blkif->st_wr_req); -VBD_SHOW(f_req, "%llu\n", be->blkif->st_f_req); -VBD_SHOW(ds_req, "%llu\n", be->blkif->st_ds_req); -VBD_SHOW(rd_sect, "%llu\n", be->blkif->st_rd_sect); -VBD_SHOW(wr_sect, "%llu\n", be->blkif->st_wr_sect); +VBD_SHOW_ALLRING(oo_req, "%llu\n"); +VBD_SHOW_ALLRING(rd_req, "%llu\n"); +VBD_SHOW_ALLRING(wr_req, "%llu\n"); +VBD_SHOW_ALLRING(f_req, "%llu\n"); +VBD_SHOW_ALLRING(ds_req, "%llu\n"); +VBD_SHOW_ALLRING(rd_sect, "%llu\n"); +VBD_SHOW_ALLRING(wr_sect, "%llu\n"); static struct attribute *xen_vbdstat_attrs[] = { &dev_attr_oo_req.attr, @@ -332,6 +385,18 @@ static struct attribute_group xen_vbdstat_group = { .attrs = xen_vbdstat_attrs, }; +#define VBD_SHOW(name, format, args...) \ + static ssize_t show_##name(struct device *_dev, \ + struct device_attribute *attr, \ + char *buf) \ + { \ + struct xenbus_device *dev = to_xenbus_device(_dev); \ + struct backend_info *be = dev_get_drvdata(&dev->dev); \ + \ + return sprintf(buf, format, ##args); \ + } \ + static DEVICE_ATTR(name, S_IRUGO, show_##name, NULL) + VBD_SHOW(physical_device, "%x:%x\n", be->major, be->minor); VBD_SHOW(mode, "%s\n", be->mode); @@ -440,11 +505,11 @@ static int xen_blkbk_remove(struct xenbus_device *dev) dev_set_drvdata(&dev->dev, NULL); - if (be->blkif) { + if (be->blkif) xen_blkif_disconnect(be->blkif); - xen_blkif_put(be->blkif); - } + /* Put the reference we set in xen_blkif_alloc(). */ + xen_blkif_put(be->blkif); kfree(be->mode); kfree(be); return 0; @@ -553,6 +618,12 @@ static int xen_blkbk_probe(struct xenbus_device *dev, goto fail; } + /* Multi-queue: advertise how many queues are supported by us. */ + err = xenbus_printf(XBT_NIL, dev->nodename, + "multi-queue-max-queues", "%u", xenblk_max_queues); + if (err) + pr_warn("Error writing multi-queue-max-queues\n"); + /* setup back pointer */ be->blkif->be = be; @@ -708,8 +779,14 @@ static void frontend_changed(struct xenbus_device *dev, } err = connect_ring(be); - if (err) + if (err) { + /* + * Clean up so that memory resources can be used by + * other devices. connect_ring has already reported the error. + */ + xen_blkif_disconnect(be->blkif); break; + } xen_update_blkif_status(be->blkif); break; @@ -825,50 +902,43 @@ again: xenbus_transaction_end(xbt, 1); } - -static int connect_ring(struct backend_info *be) +/* + * Each ring may use multiple pages, depending on "ring-page-order". 
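+ * Ring references and the event channel are read from @dir, which is
+ * either the frontend's root xenstore node (single queue) or one of
+ * its per-queue "queue-%u" subdirectories.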
+ */ +static int read_per_ring_refs(struct xen_blkif_ring *ring, const char *dir) { - struct xenbus_device *dev = be->dev; unsigned int ring_ref[XENBUS_MAX_RING_GRANTS]; - unsigned int evtchn, nr_grefs, ring_page_order; - unsigned int pers_grants; - char protocol[64] = ""; struct pending_req *req, *n; int err, i, j; + struct xen_blkif *blkif = ring->blkif; + struct xenbus_device *dev = blkif->be->dev; + unsigned int ring_page_order, nr_grefs, evtchn; - pr_debug("%s %s\n", __func__, dev->otherend); - - err = xenbus_scanf(XBT_NIL, dev->otherend, "event-channel", "%u", + err = xenbus_scanf(XBT_NIL, dir, "event-channel", "%u", &evtchn); if (err != 1) { err = -EINVAL; - xenbus_dev_fatal(dev, err, "reading %s/event-channel", - dev->otherend); + xenbus_dev_fatal(dev, err, "reading %s/event-channel", dir); return err; } - pr_info("event-channel %u\n", evtchn); err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-page-order", "%u", &ring_page_order); if (err != 1) { - err = xenbus_scanf(XBT_NIL, dev->otherend, "ring-ref", - "%u", &ring_ref[0]); + err = xenbus_scanf(XBT_NIL, dir, "ring-ref", "%u", &ring_ref[0]); if (err != 1) { err = -EINVAL; - xenbus_dev_fatal(dev, err, "reading %s/ring-ref", - dev->otherend); + xenbus_dev_fatal(dev, err, "reading %s/ring-ref", dir); return err; } nr_grefs = 1; - pr_info("%s:using single page: ring-ref %d\n", dev->otherend, - ring_ref[0]); } else { unsigned int i; if (ring_page_order > xen_blkif_max_ring_order) { err = -EINVAL; xenbus_dev_fatal(dev, err, "%s/request %d ring page order exceed max:%d", - dev->otherend, ring_page_order, + dir, ring_page_order, xen_blkif_max_ring_order); return err; } @@ -878,52 +948,23 @@ static int connect_ring(struct backend_info *be) char ring_ref_name[RINGREF_NAME_LEN]; snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); - err = xenbus_scanf(XBT_NIL, dev->otherend, ring_ref_name, + err = xenbus_scanf(XBT_NIL, dir, ring_ref_name, "%u", &ring_ref[i]); if (err != 1) { err = -EINVAL; xenbus_dev_fatal(dev, err, "reading %s/%s", - dev->otherend, ring_ref_name); + dir, ring_ref_name); return err; } - pr_info("ring-ref%u: %u\n", i, ring_ref[i]); } } - - be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT; - err = xenbus_gather(XBT_NIL, dev->otherend, "protocol", - "%63s", protocol, NULL); - if (err) - strcpy(protocol, "unspecified, assuming default"); - else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE)) - be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; - else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32)) - be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32; - else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64)) - be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64; - else { - xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol); - return -1; - } - err = xenbus_gather(XBT_NIL, dev->otherend, - "feature-persistent", "%u", - &pers_grants, NULL); - if (err) - pers_grants = 0; - - be->blkif->vbd.feature_gnt_persistent = pers_grants; - be->blkif->vbd.overflow_max_grants = 0; - be->blkif->nr_ring_pages = nr_grefs; - - pr_info("ring-pages:%d, event-channel %d, protocol %d (%s) %s\n", - nr_grefs, evtchn, be->blkif->blk_protocol, protocol, - pers_grants ? 
"persistent grants" : ""); + blkif->nr_ring_pages = nr_grefs; for (i = 0; i < nr_grefs * XEN_BLKIF_REQS_PER_PAGE; i++) { req = kzalloc(sizeof(*req), GFP_KERNEL); if (!req) goto fail; - list_add_tail(&req->free_list, &be->blkif->pending_free); + list_add_tail(&req->free_list, &ring->pending_free); for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { req->segments[j] = kzalloc(sizeof(*req->segments[0]), GFP_KERNEL); if (!req->segments[j]) @@ -938,7 +979,7 @@ static int connect_ring(struct backend_info *be) } /* Map the shared frame, irq etc. */ - err = xen_blkif_map(be->blkif, ring_ref, nr_grefs, evtchn); + err = xen_blkif_map(ring, ring_ref, nr_grefs, evtchn); if (err) { xenbus_dev_fatal(dev, err, "mapping ring-ref port %u", evtchn); return err; @@ -947,7 +988,7 @@ static int connect_ring(struct backend_info *be) return 0; fail: - list_for_each_entry_safe(req, n, &be->blkif->pending_free, free_list) { + list_for_each_entry_safe(req, n, &ring->pending_free, free_list) { list_del(&req->free_list); for (j = 0; j < MAX_INDIRECT_SEGMENTS; j++) { if (!req->segments[j]) @@ -962,6 +1003,93 @@ fail: kfree(req); } return -ENOMEM; + +} + +static int connect_ring(struct backend_info *be) +{ + struct xenbus_device *dev = be->dev; + unsigned int pers_grants; + char protocol[64] = ""; + int err, i; + char *xspath; + size_t xspathsize; + const size_t xenstore_path_ext_size = 11; /* sufficient for "/queue-NNN" */ + unsigned int requested_num_queues = 0; + + pr_debug("%s %s\n", __func__, dev->otherend); + + be->blkif->blk_protocol = BLKIF_PROTOCOL_DEFAULT; + err = xenbus_gather(XBT_NIL, dev->otherend, "protocol", + "%63s", protocol, NULL); + if (err) + strcpy(protocol, "unspecified, assuming default"); + else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_NATIVE)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_NATIVE; + else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_32)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_32; + else if (0 == strcmp(protocol, XEN_IO_PROTO_ABI_X86_64)) + be->blkif->blk_protocol = BLKIF_PROTOCOL_X86_64; + else { + xenbus_dev_fatal(dev, err, "unknown fe protocol %s", protocol); + return -ENOSYS; + } + err = xenbus_gather(XBT_NIL, dev->otherend, + "feature-persistent", "%u", + &pers_grants, NULL); + if (err) + pers_grants = 0; + + be->blkif->vbd.feature_gnt_persistent = pers_grants; + be->blkif->vbd.overflow_max_grants = 0; + + /* + * Read the number of hardware queues from frontend. + */ + err = xenbus_scanf(XBT_NIL, dev->otherend, "multi-queue-num-queues", + "%u", &requested_num_queues); + if (err < 0) { + requested_num_queues = 1; + } else { + if (requested_num_queues > xenblk_max_queues + || requested_num_queues == 0) { + /* Buggy or malicious guest. */ + xenbus_dev_fatal(dev, err, + "guest requested %u queues, exceeding the maximum of %u.", + requested_num_queues, xenblk_max_queues); + return -ENOSYS; + } + } + be->blkif->nr_rings = requested_num_queues; + if (xen_blkif_alloc_rings(be->blkif)) + return -ENOMEM; + + pr_info("%s: using %d queues, protocol %d (%s) %s\n", dev->nodename, + be->blkif->nr_rings, be->blkif->blk_protocol, protocol, + pers_grants ? 
"persistent grants" : ""); + + if (be->blkif->nr_rings == 1) + return read_per_ring_refs(&be->blkif->rings[0], dev->otherend); + else { + xspathsize = strlen(dev->otherend) + xenstore_path_ext_size; + xspath = kmalloc(xspathsize, GFP_KERNEL); + if (!xspath) { + xenbus_dev_fatal(dev, -ENOMEM, "reading ring references"); + return -ENOMEM; + } + + for (i = 0; i < be->blkif->nr_rings; i++) { + memset(xspath, 0, xspathsize); + snprintf(xspath, xspathsize, "%s/queue-%u", dev->otherend, i); + err = read_per_ring_refs(&be->blkif->rings[i], xspath); + if (err) { + kfree(xspath); + return err; + } + } + kfree(xspath); + } + return 0; } static const struct xenbus_device_id xen_blkbk_ids[] = { diff --git a/drivers/block/xen-blkfront.c b/drivers/block/xen-blkfront.c index 2fee2eef988d..8a8dc91c39f7 100644 --- a/drivers/block/xen-blkfront.c +++ b/drivers/block/xen-blkfront.c @@ -60,6 +60,20 @@ #include <asm/xen/hypervisor.h> +/* + * The minimal size of segment supported by the block framework is PAGE_SIZE. + * When Linux is using a different page size than Xen, it may not be possible + * to put all the data in a single segment. + * This can happen when the backend doesn't support indirect descriptor and + * therefore the maximum amount of data that a request can carry is + * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE = 44KB + * + * Note that we only support one extra request. So the Linux page size + * should be <= ( 2 * BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) = + * 88KB. + */ +#define HAS_EXTRA_REQ (BLKIF_MAX_SEGMENTS_PER_REQUEST < XEN_PFN_PER_PAGE) + enum blkif_state { BLKIF_STATE_DISCONNECTED, BLKIF_STATE_CONNECTED, @@ -72,6 +86,13 @@ struct grant { struct list_head node; }; +enum blk_req_status { + REQ_WAITING, + REQ_DONE, + REQ_ERROR, + REQ_EOPNOTSUPP, +}; + struct blk_shadow { struct blkif_request req; struct request *request; @@ -79,6 +100,14 @@ struct blk_shadow { struct grant **indirect_grants; struct scatterlist *sg; unsigned int num_sg; + enum blk_req_status status; + + #define NO_ASSOCIATED_ID ~0UL + /* + * Id of the sibling if we ever need 2 requests when handling a + * block I/O request + */ + unsigned long associated_id; }; struct split_bio { @@ -99,6 +128,10 @@ static unsigned int xen_blkif_max_segments = 32; module_param_named(max, xen_blkif_max_segments, int, S_IRUGO); MODULE_PARM_DESC(max, "Maximum amount of segments in indirect requests (default is 32)"); +static unsigned int xen_blkif_max_queues = 4; +module_param_named(max_queues, xen_blkif_max_queues, uint, S_IRUGO); +MODULE_PARM_DESC(max_queues, "Maximum number of hardware queues/rings used per virtual disk"); + /* * Maximum order of pages to be used for the shared ring between front and * backend, 4KB page granularity is used. @@ -114,10 +147,35 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the __CONST_RING_SIZE(blkif, XEN_PAGE_SIZE * XENBUS_MAX_RING_GRANTS) /* - * ring-ref%i i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19 - * characters are enough. Define to 20 to keep consist with backend. + * ring-ref%u i=(-1UL) would take 11 characters + 'ring-ref' is 8, so 19 + * characters are enough. Define to 20 to keep consistent with backend. */ #define RINGREF_NAME_LEN (20) +/* + * queue-%u would take 7 + 10(UINT_MAX) = 17 characters. + */ +#define QUEUE_NAME_LEN (17) + +/* + * Per-ring info. + * Every blkfront device can associate with one or more blkfront_ring_info, + * depending on how many hardware queues/rings to be used. 
+ */ +struct blkfront_ring_info { + /* Lock to protect data in every ring buffer. */ + spinlock_t ring_lock; + struct blkif_front_ring ring; + unsigned int ring_ref[XENBUS_MAX_RING_GRANTS]; + unsigned int evtchn, irq; + struct work_struct work; + struct gnttab_free_callback callback; + struct blk_shadow shadow[BLK_MAX_RING_SIZE]; + struct list_head indirect_pages; + struct list_head grants; + unsigned int persistent_gnts_c; + unsigned long shadow_free; + struct blkfront_info *dev_info; +}; /* * We have one of these per vbd, whether ide, scsi or 'other'. They @@ -126,25 +184,15 @@ MODULE_PARM_DESC(max_ring_page_order, "Maximum order of pages to be used for the */ struct blkfront_info { - spinlock_t io_lock; struct mutex mutex; struct xenbus_device *xbdev; struct gendisk *gd; int vdevice; blkif_vdev_t handle; enum blkif_state connected; - int ring_ref[XENBUS_MAX_RING_GRANTS]; + /* Number of pages per ring buffer. */ unsigned int nr_ring_pages; - struct blkif_front_ring ring; - unsigned int evtchn, irq; struct request_queue *rq; - struct work_struct work; - struct gnttab_free_callback callback; - struct blk_shadow shadow[BLK_MAX_RING_SIZE]; - struct list_head grants; - struct list_head indirect_pages; - unsigned int persistent_gnts_c; - unsigned long shadow_free; unsigned int feature_flush; unsigned int feature_discard:1; unsigned int feature_secdiscard:1; @@ -155,6 +203,8 @@ struct blkfront_info unsigned int max_indirect_segments; int is_ready; struct blk_mq_tag_set tag_set; + struct blkfront_ring_info *rinfo; + unsigned int nr_rings; }; static unsigned int nr_minors; @@ -198,38 +248,40 @@ static DEFINE_SPINLOCK(minor_lock); #define GREFS(_psegs) ((_psegs) * GRANTS_PER_PSEG) -static int blkfront_setup_indirect(struct blkfront_info *info); -static int blkfront_gather_backend_features(struct blkfront_info *info); +static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo); +static void blkfront_gather_backend_features(struct blkfront_info *info); -static int get_id_from_freelist(struct blkfront_info *info) +static int get_id_from_freelist(struct blkfront_ring_info *rinfo) { - unsigned long free = info->shadow_free; - BUG_ON(free >= BLK_RING_SIZE(info)); - info->shadow_free = info->shadow[free].req.u.rw.id; - info->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */ + unsigned long free = rinfo->shadow_free; + + BUG_ON(free >= BLK_RING_SIZE(rinfo->dev_info)); + rinfo->shadow_free = rinfo->shadow[free].req.u.rw.id; + rinfo->shadow[free].req.u.rw.id = 0x0fffffee; /* debug */ return free; } -static int add_id_to_freelist(struct blkfront_info *info, - unsigned long id) +static int add_id_to_freelist(struct blkfront_ring_info *rinfo, + unsigned long id) { - if (info->shadow[id].req.u.rw.id != id) + if (rinfo->shadow[id].req.u.rw.id != id) return -EINVAL; - if (info->shadow[id].request == NULL) + if (rinfo->shadow[id].request == NULL) return -EINVAL; - info->shadow[id].req.u.rw.id = info->shadow_free; - info->shadow[id].request = NULL; - info->shadow_free = id; + rinfo->shadow[id].req.u.rw.id = rinfo->shadow_free; + rinfo->shadow[id].request = NULL; + rinfo->shadow_free = id; return 0; } -static int fill_grant_buffer(struct blkfront_info *info, int num) +static int fill_grant_buffer(struct blkfront_ring_info *rinfo, int num) { + struct blkfront_info *info = rinfo->dev_info; struct page *granted_page; struct grant *gnt_list_entry, *n; int i = 0; - while(i < num) { + while (i < num) { gnt_list_entry = kzalloc(sizeof(struct grant), GFP_NOIO); if (!gnt_list_entry) goto out_of_memory; @@ -244,7 
+296,7 @@ static int fill_grant_buffer(struct blkfront_info *info, int num) } gnt_list_entry->gref = GRANT_INVALID_REF; - list_add(&gnt_list_entry->node, &info->grants); + list_add(&gnt_list_entry->node, &rinfo->grants); i++; } @@ -252,7 +304,7 @@ static int fill_grant_buffer(struct blkfront_info *info, int num) out_of_memory: list_for_each_entry_safe(gnt_list_entry, n, - &info->grants, node) { + &rinfo->grants, node) { list_del(&gnt_list_entry->node); if (info->feature_persistent) __free_page(gnt_list_entry->page); @@ -263,17 +315,17 @@ out_of_memory: return -ENOMEM; } -static struct grant *get_free_grant(struct blkfront_info *info) +static struct grant *get_free_grant(struct blkfront_ring_info *rinfo) { struct grant *gnt_list_entry; - BUG_ON(list_empty(&info->grants)); - gnt_list_entry = list_first_entry(&info->grants, struct grant, + BUG_ON(list_empty(&rinfo->grants)); + gnt_list_entry = list_first_entry(&rinfo->grants, struct grant, node); list_del(&gnt_list_entry->node); if (gnt_list_entry->gref != GRANT_INVALID_REF) - info->persistent_gnts_c--; + rinfo->persistent_gnts_c--; return gnt_list_entry; } @@ -289,9 +341,10 @@ static inline void grant_foreign_access(const struct grant *gnt_list_entry, static struct grant *get_grant(grant_ref_t *gref_head, unsigned long gfn, - struct blkfront_info *info) + struct blkfront_ring_info *rinfo) { - struct grant *gnt_list_entry = get_free_grant(info); + struct grant *gnt_list_entry = get_free_grant(rinfo); + struct blkfront_info *info = rinfo->dev_info; if (gnt_list_entry->gref != GRANT_INVALID_REF) return gnt_list_entry; @@ -312,9 +365,10 @@ static struct grant *get_grant(grant_ref_t *gref_head, } static struct grant *get_indirect_grant(grant_ref_t *gref_head, - struct blkfront_info *info) + struct blkfront_ring_info *rinfo) { - struct grant *gnt_list_entry = get_free_grant(info); + struct grant *gnt_list_entry = get_free_grant(rinfo); + struct blkfront_info *info = rinfo->dev_info; if (gnt_list_entry->gref != GRANT_INVALID_REF) return gnt_list_entry; @@ -326,8 +380,8 @@ static struct grant *get_indirect_grant(grant_ref_t *gref_head, struct page *indirect_page; /* Fetch a pre-allocated page to use for indirect grefs */ - BUG_ON(list_empty(&info->indirect_pages)); - indirect_page = list_first_entry(&info->indirect_pages, + BUG_ON(list_empty(&rinfo->indirect_pages)); + indirect_page = list_first_entry(&rinfo->indirect_pages, struct page, lru); list_del(&indirect_page->lru); gnt_list_entry->page = indirect_page; @@ -403,8 +457,8 @@ static void xlbd_release_minors(unsigned int minor, unsigned int nr) static void blkif_restart_queue_callback(void *arg) { - struct blkfront_info *info = (struct blkfront_info *)arg; - schedule_work(&info->work); + struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)arg; + schedule_work(&rinfo->work); } static int blkif_getgeo(struct block_device *bd, struct hd_geometry *hg) @@ -456,16 +510,33 @@ static int blkif_ioctl(struct block_device *bdev, fmode_t mode, return 0; } -static int blkif_queue_discard_req(struct request *req) +static unsigned long blkif_ring_get_request(struct blkfront_ring_info *rinfo, + struct request *req, + struct blkif_request **ring_req) { - struct blkfront_info *info = req->rq_disk->private_data; + unsigned long id; + + *ring_req = RING_GET_REQUEST(&rinfo->ring, rinfo->ring.req_prod_pvt); + rinfo->ring.req_prod_pvt++; + + id = get_id_from_freelist(rinfo); + rinfo->shadow[id].request = req; + rinfo->shadow[id].status = REQ_WAITING; + rinfo->shadow[id].associated_id = NO_ASSOCIATED_ID; + 
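A note on the free-list idiom used by get_id_from_freelist()/add_id_to_freelist() above: the driver threads a free list through the otherwise-unused request-id field of each shadow slot, so taking or returning a slot needs no allocation on the I/O path. A self-contained sketch of the same technique, with simplified types assumed purely for illustration:

#include <assert.h>
#include <stdio.h>

#define RING_SIZE 8

struct slot {
	unsigned long next_free; /* overlays the request id while the slot is free */
	void *request;           /* NULL while the slot is free */
};

static struct slot shadow[RING_SIZE];
static unsigned long shadow_free;

static void freelist_init(void)
{
	unsigned long i;

	for (i = 0; i < RING_SIZE; i++)
		shadow[i].next_free = i + 1; /* the last entry points past the end */
	shadow_free = 0;
}

static unsigned long get_id(void *req)
{
	unsigned long free = shadow_free;

	assert(free < RING_SIZE); /* mirrors the BUG_ON(): never exhaust the ring */
	shadow_free = shadow[free].next_free;
	shadow[free].request = req;
	return free;
}

static void put_id(unsigned long id)
{
	shadow[id].request = NULL;
	shadow[id].next_free = shadow_free; /* push the slot back on the list head */
	shadow_free = id;
}

int main(void)
{
	int dummy;
	unsigned long a, b;

	freelist_init();
	a = get_id(&dummy);
	b = get_id(&dummy);
	put_id(a);
	printf("got %lu and %lu, next free is %lu\n", a, b, shadow_free);
	return 0;
}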
+ (*ring_req)->u.rw.id = id; + + return id; +} + +static int blkif_queue_discard_req(struct request *req, struct blkfront_ring_info *rinfo) +{ + struct blkfront_info *info = rinfo->dev_info; struct blkif_request *ring_req; unsigned long id; /* Fill out a communications ring structure. */ - ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); - id = get_id_from_freelist(info); - info->shadow[id].request = req; + id = blkif_ring_get_request(rinfo, req, &ring_req); ring_req->operation = BLKIF_OP_DISCARD; ring_req->u.discard.nr_sectors = blk_rq_sectors(req); @@ -476,10 +547,8 @@ static int blkif_queue_discard_req(struct request *req) else ring_req->u.discard.flag = 0; - info->ring.req_prod_pvt++; - /* Keep a private copy so we can reissue requests when recovering. */ - info->shadow[id].req = *ring_req; + rinfo->shadow[id].req = *ring_req; return 0; } @@ -487,7 +556,7 @@ static int blkif_queue_discard_req(struct request *req) struct setup_rw_req { unsigned int grant_idx; struct blkif_request_segment *segments; - struct blkfront_info *info; + struct blkfront_ring_info *rinfo; struct blkif_request *ring_req; grant_ref_t gref_head; unsigned int id; @@ -495,6 +564,9 @@ struct setup_rw_req { bool need_copy; unsigned int bvec_off; char *bvec_data; + + bool require_extra_req; + struct blkif_request *extra_ring_req; }; static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, @@ -507,8 +579,24 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, /* Convenient aliases */ unsigned int grant_idx = setup->grant_idx; struct blkif_request *ring_req = setup->ring_req; - struct blkfront_info *info = setup->info; - struct blk_shadow *shadow = &info->shadow[setup->id]; + struct blkfront_ring_info *rinfo = setup->rinfo; + /* + * We always use the shadow of the first request to store the list + * of grants associated with the block I/O request. This makes the + * completion easier to handle even if the block I/O request is + * split. + */ + struct blk_shadow *shadow = &rinfo->shadow[setup->id]; + + if (unlikely(setup->require_extra_req && + grant_idx >= BLKIF_MAX_SEGMENTS_PER_REQUEST)) { + /* + * We are using the second request; set grant_idx + * to be the index into its segment array. + */ + grant_idx -= BLKIF_MAX_SEGMENTS_PER_REQUEST; + ring_req = setup->extra_ring_req; + } if ((ring_req->operation == BLKIF_OP_INDIRECT) && (grant_idx % GRANTS_PER_INDIRECT_FRAME == 0)) { @@ -516,15 +604,19 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, kunmap_atomic(setup->segments); n = grant_idx / GRANTS_PER_INDIRECT_FRAME; - gnt_list_entry = get_indirect_grant(&setup->gref_head, info); + gnt_list_entry = get_indirect_grant(&setup->gref_head, rinfo); shadow->indirect_grants[n] = gnt_list_entry; setup->segments = kmap_atomic(gnt_list_entry->page); ring_req->u.indirect.indirect_grefs[n] = gnt_list_entry->gref; } - gnt_list_entry = get_grant(&setup->gref_head, gfn, info); + gnt_list_entry = get_grant(&setup->gref_head, gfn, rinfo); ref = gnt_list_entry->gref; - shadow->grants_used[grant_idx] = gnt_list_entry; + /* + * All the grants are stored in the shadow of the first + * request. Therefore we have to use the global index.
+ */ + shadow->grants_used[setup->grant_idx] = gnt_list_entry; if (setup->need_copy) { void *shared_data; @@ -566,16 +658,36 @@ static void blkif_setup_rw_req_grant(unsigned long gfn, unsigned int offset, (setup->grant_idx)++; } -static int blkif_queue_rw_req(struct request *req) +static void blkif_setup_extra_req(struct blkif_request *first, + struct blkif_request *second) { - struct blkfront_info *info = req->rq_disk->private_data; - struct blkif_request *ring_req; - unsigned long id; + uint16_t nr_segments = first->u.rw.nr_segments; + + /* + * The second request is only present when the first request uses + * all its segments. It is always a continuation of the first one. + */ + first->u.rw.nr_segments = BLKIF_MAX_SEGMENTS_PER_REQUEST; + + second->u.rw.nr_segments = nr_segments - BLKIF_MAX_SEGMENTS_PER_REQUEST; + second->u.rw.sector_number = first->u.rw.sector_number + + (BLKIF_MAX_SEGMENTS_PER_REQUEST * XEN_PAGE_SIZE) / 512; + + second->u.rw.handle = first->u.rw.handle; + second->operation = first->operation; +} + +static int blkif_queue_rw_req(struct request *req, struct blkfront_ring_info *rinfo) +{ + struct blkfront_info *info = rinfo->dev_info; + struct blkif_request *ring_req, *extra_ring_req = NULL; + unsigned long id, extra_id = NO_ASSOCIATED_ID; + bool require_extra_req = false; int i; struct setup_rw_req setup = { .grant_idx = 0, .segments = NULL, - .info = info, + .rinfo = rinfo, .need_copy = rq_data_dir(req) && info->feature_persistent, }; @@ -584,7 +696,6 @@ static int blkif_queue_rw_req(struct request *req) * existing persistent grants, or if we have to get new grants, * as there are not sufficiently many free. */ - bool new_persistent_gnts; struct scatterlist *sg; int num_sg, max_grefs, num_grant; @@ -596,41 +707,36 @@ */ max_grefs += INDIRECT_GREFS(max_grefs); - /* Check if we have enough grants to allocate a requests */ - if (info->persistent_gnts_c < max_grefs) { - new_persistent_gnts = 1; - if (gnttab_alloc_grant_references( - max_grefs - info->persistent_gnts_c, - &setup.gref_head) < 0) { + /* + * We have to reserve 'max_grefs' grants because persistent + * grants are shared by all rings. + */ + if (max_grefs > 0) + if (gnttab_alloc_grant_references(max_grefs, &setup.gref_head) < 0) { gnttab_request_free_callback( - &info->callback, + &rinfo->callback, blkif_restart_queue_callback, - info, + rinfo, max_grefs); return 1; } - } else - new_persistent_gnts = 0; /* Fill out a communications ring structure.
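blkif_setup_extra_req() above only fixes up segment counts and the starting sector of the second request. A worked example of that arithmetic, with hypothetical numbers (4KB Xen pages and 11 segments per request assumed as stand-ins for XEN_PAGE_SIZE and BLKIF_MAX_SEGMENTS_PER_REQUEST):

#include <stdio.h>

#define XEN_PAGE_SIZE 4096UL
#define MAX_SEGS      11U /* stand-in for BLKIF_MAX_SEGMENTS_PER_REQUEST */

int main(void)
{
	/* A 64KB I/O needs 16 Xen-page segments: 11 fit in the first
	 * request, the remaining 5 go into the second one. */
	unsigned int nr_segments = 16;
	unsigned long long first_sector = 2048;

	unsigned int first_segs = MAX_SEGS;
	unsigned int second_segs = nr_segments - MAX_SEGS;

	/* The second request continues right after the first one's data:
	 * 11 segments * 4096 bytes / 512 bytes per sector = 88 sectors. */
	unsigned long long second_sector =
		first_sector + (MAX_SEGS * XEN_PAGE_SIZE) / 512;

	printf("first:  %u segments starting at sector %llu\n",
	       first_segs, first_sector);
	printf("second: %u segments starting at sector %llu\n",
	       second_segs, second_sector);
	return 0;
}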
*/ - ring_req = RING_GET_REQUEST(&info->ring, info->ring.req_prod_pvt); - id = get_id_from_freelist(info); - info->shadow[id].request = req; - - BUG_ON(info->max_indirect_segments == 0 && - GREFS(req->nr_phys_segments) > BLKIF_MAX_SEGMENTS_PER_REQUEST); - BUG_ON(info->max_indirect_segments && - GREFS(req->nr_phys_segments) > info->max_indirect_segments); + id = blkif_ring_get_request(rinfo, req, &ring_req); - num_sg = blk_rq_map_sg(req->q, req, info->shadow[id].sg); + num_sg = blk_rq_map_sg(req->q, req, rinfo->shadow[id].sg); num_grant = 0; /* Calculate the number of grant used */ - for_each_sg(info->shadow[id].sg, sg, num_sg, i) + for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) num_grant += gnttab_count_grant(sg->offset, sg->length); - ring_req->u.rw.id = id; - info->shadow[id].num_sg = num_sg; - if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST) { + require_extra_req = info->max_indirect_segments == 0 && + num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST; + BUG_ON(!HAS_EXTRA_REQ && require_extra_req); + + rinfo->shadow[id].num_sg = num_sg; + if (num_grant > BLKIF_MAX_SEGMENTS_PER_REQUEST && + likely(!require_extra_req)) { /* * The indirect operation can only be a BLKIF_OP_READ or * BLKIF_OP_WRITE @@ -670,11 +776,31 @@ static int blkif_queue_rw_req(struct request *req) } } ring_req->u.rw.nr_segments = num_grant; + if (unlikely(require_extra_req)) { + extra_id = blkif_ring_get_request(rinfo, req, + &extra_ring_req); + /* + * Only the first request contains the scatter-gather + * list. + */ + rinfo->shadow[extra_id].num_sg = 0; + + blkif_setup_extra_req(ring_req, extra_ring_req); + + /* Link the 2 requests together */ + rinfo->shadow[extra_id].associated_id = id; + rinfo->shadow[id].associated_id = extra_id; + } } setup.ring_req = ring_req; setup.id = id; - for_each_sg(info->shadow[id].sg, sg, num_sg, i) { + + setup.require_extra_req = require_extra_req; + if (unlikely(require_extra_req)) + setup.extra_ring_req = extra_ring_req; + + for_each_sg(rinfo->shadow[id].sg, sg, num_sg, i) { BUG_ON(sg->offset + sg->length > PAGE_SIZE); if (setup.need_copy) { @@ -694,12 +820,12 @@ static int blkif_queue_rw_req(struct request *req) if (setup.segments) kunmap_atomic(setup.segments); - info->ring.req_prod_pvt++; - /* Keep a private copy so we can reissue requests when recovering. 
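The require_extra_req computation above can only come out true when HAS_EXTRA_REQ holds, i.e. when a single ring request cannot map a whole Linux page. A quick check of the 44KB/88KB arithmetic from the header comment, using hypothetical values for a 64KB-page kernel (not part of the patch):

#include <stdio.h>

#define XEN_PAGE_SIZE    4096UL  /* Xen always uses 4KB pages */
#define MAX_SEGS         11UL    /* stand-in for BLKIF_MAX_SEGMENTS_PER_REQUEST */
#define LINUX_PAGE_SIZE  65536UL /* e.g. a 64KB-page arm64 kernel */
#define XEN_PFN_PER_PAGE (LINUX_PAGE_SIZE / XEN_PAGE_SIZE)

/* Mirrors HAS_EXTRA_REQ: a single request cannot map a whole Linux page. */
#define HAS_EXTRA_REQ (MAX_SEGS < XEN_PFN_PER_PAGE)

int main(void)
{
	printf("one request carries %lu bytes (44KB)\n", MAX_SEGS * XEN_PAGE_SIZE);
	printf("two requests carry %lu bytes (88KB)\n", 2 * MAX_SEGS * XEN_PAGE_SIZE);
	printf("extra request needed for %luKB pages: %s\n",
	       LINUX_PAGE_SIZE / 1024, HAS_EXTRA_REQ ? "yes" : "no");
	return 0;
}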
*/ - info->shadow[id].req = *ring_req; + rinfo->shadow[id].req = *ring_req; + if (unlikely(require_extra_req)) + rinfo->shadow[extra_id].req = *extra_ring_req; - if (new_persistent_gnts) + if (max_grefs > 0) gnttab_free_grant_references(setup.gref_head); return 0; @@ -711,27 +837,25 @@ static int blkif_queue_rw_req(struct request *req) * * @req: a request struct */ -static int blkif_queue_request(struct request *req) +static int blkif_queue_request(struct request *req, struct blkfront_ring_info *rinfo) { - struct blkfront_info *info = req->rq_disk->private_data; - - if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) + if (unlikely(rinfo->dev_info->connected != BLKIF_STATE_CONNECTED)) return 1; if (unlikely(req->cmd_flags & (REQ_DISCARD | REQ_SECURE))) - return blkif_queue_discard_req(req); + return blkif_queue_discard_req(req, rinfo); else - return blkif_queue_rw_req(req); + return blkif_queue_rw_req(req, rinfo); } -static inline void flush_requests(struct blkfront_info *info) +static inline void flush_requests(struct blkfront_ring_info *rinfo) { int notify; - RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&info->ring, notify); + RING_PUSH_REQUESTS_AND_CHECK_NOTIFY(&rinfo->ring, notify); if (notify) - notify_remote_via_irq(info->irq); + notify_remote_via_irq(rinfo->irq); } static inline bool blkif_request_flush_invalid(struct request *req, @@ -745,38 +869,50 @@ } static int blkif_queue_rq(struct blk_mq_hw_ctx *hctx, - const struct blk_mq_queue_data *qd) + const struct blk_mq_queue_data *qd) { - struct blkfront_info *info = qd->rq->rq_disk->private_data; + unsigned long flags; + struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)hctx->driver_data; blk_mq_start_request(qd->rq); - spin_lock_irq(&info->io_lock); - if (RING_FULL(&info->ring)) + spin_lock_irqsave(&rinfo->ring_lock, flags); + if (RING_FULL(&rinfo->ring)) goto out_busy; - if (blkif_request_flush_invalid(qd->rq, info)) + if (blkif_request_flush_invalid(qd->rq, rinfo->dev_info)) goto out_err; - if (blkif_queue_request(qd->rq)) + if (blkif_queue_request(qd->rq, rinfo)) goto out_busy; - flush_requests(info); - spin_unlock_irq(&info->io_lock); + flush_requests(rinfo); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); return BLK_MQ_RQ_QUEUE_OK; out_err: - spin_unlock_irq(&info->io_lock); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); return BLK_MQ_RQ_QUEUE_ERROR; out_busy: - spin_unlock_irq(&info->io_lock); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); blk_mq_stop_hw_queue(hctx); return BLK_MQ_RQ_QUEUE_BUSY; } +static int blk_mq_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, + unsigned int index) +{ + struct blkfront_info *info = (struct blkfront_info *)data; + + BUG_ON(info->nr_rings <= index); + hctx->driver_data = &info->rinfo[index]; + return 0; +} + static struct blk_mq_ops blkfront_mq_ops = { .queue_rq = blkif_queue_rq, .map_queue = blk_mq_map_queue, + .init_hctx = blk_mq_init_hctx, }; static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, @@ -788,19 +924,28 @@ static int xlvbd_init_blk_queue(struct gendisk *gd, u16 sector_size, memset(&info->tag_set, 0, sizeof(info->tag_set)); info->tag_set.ops = &blkfront_mq_ops; - info->tag_set.nr_hw_queues = 1; - info->tag_set.queue_depth = BLK_RING_SIZE(info); + info->tag_set.nr_hw_queues = info->nr_rings; + if (HAS_EXTRA_REQ && info->max_indirect_segments == 0) { + /* + * When indirect descriptors are not supported, the I/O request + * will be split between multiple requests in the ring.
+ * To avoid problems when sending the request, halve the + * depth of the queue. + */ + info->tag_set.queue_depth = BLK_RING_SIZE(info) / 2; + } else + info->tag_set.queue_depth = BLK_RING_SIZE(info); info->tag_set.numa_node = NUMA_NO_NODE; info->tag_set.flags = BLK_MQ_F_SHOULD_MERGE | BLK_MQ_F_SG_MERGE; info->tag_set.cmd_size = 0; info->tag_set.driver_data = info; if (blk_mq_alloc_tag_set(&info->tag_set)) - return -1; + return -EINVAL; rq = blk_mq_init_queue(&info->tag_set); if (IS_ERR(rq)) { blk_mq_free_tag_set(&info->tag_set); - return -1; + return PTR_ERR(rq); } queue_flag_set_unlocked(QUEUE_FLAG_VIRT, rq); @@ -1028,7 +1173,7 @@ static int xlvbd_alloc_gendisk(blkif_sector_t capacity, static void xlvbd_release_gendisk(struct blkfront_info *info) { - unsigned int minor, nr_minors; + unsigned int minor, nr_minors, i; if (info->rq == NULL) return; @@ -1036,11 +1181,15 @@ static void xlvbd_release_gendisk(struct blkfront_info *info) /* No more blkif_request(). */ blk_mq_stop_hw_queues(info->rq); - /* No more gnttab callback work. */ - gnttab_cancel_free_callback(&info->callback); + for (i = 0; i < info->nr_rings; i++) { + struct blkfront_ring_info *rinfo = &info->rinfo[i]; - /* Flush gnttab callback work. Must be done with no locks held. */ - flush_work(&info->work); + /* No more gnttab callback work. */ + gnttab_cancel_free_callback(&rinfo->callback); + + /* Flush gnttab callback work. Must be done with no locks held. */ + flush_work(&rinfo->work); + } del_gendisk(info->gd); @@ -1056,88 +1205,87 @@ static void xlvbd_release_gendisk(struct blkfront_info *info) info->gd = NULL; } -/* Must be called with io_lock holded */ -static void kick_pending_request_queues(struct blkfront_info *info) +/* The caller must already hold rinfo->ring_lock. */ +static inline void kick_pending_request_queues_locked(struct blkfront_ring_info *rinfo) { - if (!RING_FULL(&info->ring)) - blk_mq_start_stopped_hw_queues(info->rq, true); + if (!RING_FULL(&rinfo->ring)) + blk_mq_start_stopped_hw_queues(rinfo->dev_info->rq, true); } -static void blkif_restart_queue(struct work_struct *work) +static void kick_pending_request_queues(struct blkfront_ring_info *rinfo) { - struct blkfront_info *info = container_of(work, struct blkfront_info, work); + unsigned long flags; - spin_lock_irq(&info->io_lock); - if (info->connected == BLKIF_STATE_CONNECTED) - kick_pending_request_queues(info); - spin_unlock_irq(&info->io_lock); + spin_lock_irqsave(&rinfo->ring_lock, flags); + kick_pending_request_queues_locked(rinfo); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); } -static void blkif_free(struct blkfront_info *info, int suspend) +static void blkif_restart_queue(struct work_struct *work) { - struct grant *persistent_gnt; - struct grant *n; - int i, j, segs; + struct blkfront_ring_info *rinfo = container_of(work, struct blkfront_ring_info, work); - /* Prevent new requests being issued until we fix things up. */ - spin_lock_irq(&info->io_lock); - info->connected = suspend ? - BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; - /* No more blkif_request().
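The halved queue depth set above keeps blk-mq from ever admitting more requests than the ring can hold when each I/O may occupy two slots. A quick sanity check of the arithmetic with a hypothetical 32-slot (single-page) ring:

#include <stdio.h>

int main(void)
{
	unsigned int ring_size = 32;              /* hypothetical one-page ring */
	unsigned int queue_depth = ring_size / 2; /* 16 in-flight requests */

	/* Worst case: every in-flight request is split in two, so 16
	 * requests consume at most 16 * 2 = 32 ring slots and the ring
	 * can never overflow. */
	printf("depth %u -> worst case %u of %u slots\n",
	       queue_depth, queue_depth * 2, ring_size);
	return 0;
}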
*/ - if (info->rq) - blk_mq_stop_hw_queues(info->rq); + if (rinfo->dev_info->connected == BLKIF_STATE_CONNECTED) + kick_pending_request_queues(rinfo); +} - /* Remove all persistent grants */ - if (!list_empty(&info->grants)) { - list_for_each_entry_safe(persistent_gnt, n, - &info->grants, node) { - list_del(&persistent_gnt->node); - if (persistent_gnt->gref != GRANT_INVALID_REF) { - gnttab_end_foreign_access(persistent_gnt->gref, - 0, 0UL); - info->persistent_gnts_c--; - } - if (info->feature_persistent) - __free_page(persistent_gnt->page); - kfree(persistent_gnt); - } - } - BUG_ON(info->persistent_gnts_c != 0); +static void blkif_free_ring(struct blkfront_ring_info *rinfo) +{ + struct grant *persistent_gnt, *n; + struct blkfront_info *info = rinfo->dev_info; + int i, j, segs; /* * Remove indirect pages, this only happens when using indirect * descriptors but not persistent grants */ - if (!list_empty(&info->indirect_pages)) { + if (!list_empty(&rinfo->indirect_pages)) { struct page *indirect_page, *n; BUG_ON(info->feature_persistent); - list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) { + list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) { list_del(&indirect_page->lru); __free_page(indirect_page); } } + /* Remove all persistent grants. */ + if (!list_empty(&rinfo->grants)) { + list_for_each_entry_safe(persistent_gnt, n, + &rinfo->grants, node) { + list_del(&persistent_gnt->node); + if (persistent_gnt->gref != GRANT_INVALID_REF) { + gnttab_end_foreign_access(persistent_gnt->gref, + 0, 0UL); + rinfo->persistent_gnts_c--; + } + if (info->feature_persistent) + __free_page(persistent_gnt->page); + kfree(persistent_gnt); + } + } + BUG_ON(rinfo->persistent_gnts_c != 0); + for (i = 0; i < BLK_RING_SIZE(info); i++) { /* * Clear persistent grants present in requests already * on the shared ring */ - if (!info->shadow[i].request) + if (!rinfo->shadow[i].request) goto free_shadow; - segs = info->shadow[i].req.operation == BLKIF_OP_INDIRECT ? - info->shadow[i].req.u.indirect.nr_segments : - info->shadow[i].req.u.rw.nr_segments; + segs = rinfo->shadow[i].req.operation == BLKIF_OP_INDIRECT ? 
+ rinfo->shadow[i].req.u.indirect.nr_segments : + rinfo->shadow[i].req.u.rw.nr_segments; for (j = 0; j < segs; j++) { - persistent_gnt = info->shadow[i].grants_used[j]; + persistent_gnt = rinfo->shadow[i].grants_used[j]; gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); if (info->feature_persistent) __free_page(persistent_gnt->page); kfree(persistent_gnt); } - if (info->shadow[i].req.operation != BLKIF_OP_INDIRECT) + if (rinfo->shadow[i].req.operation != BLKIF_OP_INDIRECT) /* * If this is not an indirect operation don't try to * free indirect segments @@ -1145,42 +1293,59 @@ static void blkif_free(struct blkfront_info *info, int suspend) goto free_shadow; for (j = 0; j < INDIRECT_GREFS(segs); j++) { - persistent_gnt = info->shadow[i].indirect_grants[j]; + persistent_gnt = rinfo->shadow[i].indirect_grants[j]; gnttab_end_foreign_access(persistent_gnt->gref, 0, 0UL); __free_page(persistent_gnt->page); kfree(persistent_gnt); } free_shadow: - kfree(info->shadow[i].grants_used); - info->shadow[i].grants_used = NULL; - kfree(info->shadow[i].indirect_grants); - info->shadow[i].indirect_grants = NULL; - kfree(info->shadow[i].sg); - info->shadow[i].sg = NULL; + kfree(rinfo->shadow[i].grants_used); + rinfo->shadow[i].grants_used = NULL; + kfree(rinfo->shadow[i].indirect_grants); + rinfo->shadow[i].indirect_grants = NULL; + kfree(rinfo->shadow[i].sg); + rinfo->shadow[i].sg = NULL; } /* No more gnttab callback work. */ - gnttab_cancel_free_callback(&info->callback); - spin_unlock_irq(&info->io_lock); + gnttab_cancel_free_callback(&rinfo->callback); /* Flush gnttab callback work. Must be done with no locks held. */ - flush_work(&info->work); + flush_work(&rinfo->work); /* Free resources associated with old device channel. */ for (i = 0; i < info->nr_ring_pages; i++) { - if (info->ring_ref[i] != GRANT_INVALID_REF) { - gnttab_end_foreign_access(info->ring_ref[i], 0, 0); - info->ring_ref[i] = GRANT_INVALID_REF; + if (rinfo->ring_ref[i] != GRANT_INVALID_REF) { + gnttab_end_foreign_access(rinfo->ring_ref[i], 0, 0); + rinfo->ring_ref[i] = GRANT_INVALID_REF; } } - free_pages((unsigned long)info->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE)); - info->ring.sring = NULL; + free_pages((unsigned long)rinfo->ring.sring, get_order(info->nr_ring_pages * PAGE_SIZE)); + rinfo->ring.sring = NULL; - if (info->irq) - unbind_from_irqhandler(info->irq, info); - info->evtchn = info->irq = 0; + if (rinfo->irq) + unbind_from_irqhandler(rinfo->irq, rinfo); + rinfo->evtchn = rinfo->irq = 0; +} +static void blkif_free(struct blkfront_info *info, int suspend) +{ + unsigned int i; + + /* Prevent new requests being issued until we fix things up. */ + info->connected = suspend ? + BLKIF_STATE_SUSPENDED : BLKIF_STATE_DISCONNECTED; + /* No more blkif_request(). */ + if (info->rq) + blk_mq_stop_hw_queues(info->rq); + + for (i = 0; i < info->nr_rings; i++) + blkif_free_ring(&info->rinfo[i]); + + kfree(info->rinfo); + info->rinfo = NULL; + info->nr_rings = 0; } struct copy_from_grant { @@ -1209,19 +1374,93 @@ static void blkif_copy_from_grant(unsigned long gfn, unsigned int offset, kunmap_atomic(shared_data); } -static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, +static enum blk_req_status blkif_rsp_to_req_status(int rsp) +{ + switch (rsp) + { + case BLKIF_RSP_OKAY: + return REQ_DONE; + case BLKIF_RSP_EOPNOTSUPP: + return REQ_EOPNOTSUPP; + case BLKIF_RSP_ERROR: + /* Fallthrough. 
*/ + default: + return REQ_ERROR; + } +} + +/* + * Get the final status of the block request based on the two ring responses + */ +static int blkif_get_final_status(enum blk_req_status s1, + enum blk_req_status s2) +{ + BUG_ON(s1 == REQ_WAITING); + BUG_ON(s2 == REQ_WAITING); + + if (s1 == REQ_ERROR || s2 == REQ_ERROR) + return BLKIF_RSP_ERROR; + else if (s1 == REQ_EOPNOTSUPP || s2 == REQ_EOPNOTSUPP) + return BLKIF_RSP_EOPNOTSUPP; + return BLKIF_RSP_OKAY; +} + +static bool blkif_completion(unsigned long *id, + struct blkfront_ring_info *rinfo, struct blkif_response *bret) { int i = 0; struct scatterlist *sg; int num_sg, num_grant; + struct blkfront_info *info = rinfo->dev_info; + struct blk_shadow *s = &rinfo->shadow[*id]; struct copy_from_grant data = { - .s = s, .grant_idx = 0, }; num_grant = s->req.operation == BLKIF_OP_INDIRECT ? s->req.u.indirect.nr_segments : s->req.u.rw.nr_segments; + + /* The I/O request may be split in two. */ + if (unlikely(s->associated_id != NO_ASSOCIATED_ID)) { + struct blk_shadow *s2 = &rinfo->shadow[s->associated_id]; + + /* Keep the status of the current response in shadow. */ + s->status = blkif_rsp_to_req_status(bret->status); + + /* Wait for the second response if it is not here yet. */ + if (s2->status == REQ_WAITING) + return 0; + + bret->status = blkif_get_final_status(s->status, + s2->status); + + /* + * All the grants are stored in the first shadow in order + * to make the completion code simpler. + */ + num_grant += s2->req.u.rw.nr_segments; + + /* + * The two responses may not come in order. Only the + * first request will store the scatter-gather list. + */ + if (s2->num_sg != 0) { + /* Update "id" with the ID of the first response. */ + *id = s->associated_id; + s = s2; + } + + /* + * We don't need the second request anymore, so recycle + * it now.
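blkif_get_final_status() above reduces the two per-half statuses with a fixed precedence: error beats not-supported, which beats success. A standalone sketch of the same reduction, with illustrative names only:

#include <stdio.h>

enum req_status { REQ_DONE, REQ_ERROR, REQ_EOPNOTSUPP };

/* Error dominates, then "operation not supported", then success. */
static const char *final_status(enum req_status s1, enum req_status s2)
{
	if (s1 == REQ_ERROR || s2 == REQ_ERROR)
		return "error";
	if (s1 == REQ_EOPNOTSUPP || s2 == REQ_EOPNOTSUPP)
		return "not supported";
	return "okay";
}

int main(void)
{
	/* Both halves of a split request must succeed for the whole I/O. */
	printf("%s\n", final_status(REQ_DONE, REQ_DONE));       /* okay */
	printf("%s\n", final_status(REQ_DONE, REQ_ERROR));      /* error */
	printf("%s\n", final_status(REQ_EOPNOTSUPP, REQ_DONE)); /* not supported */
	return 0;
}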
+ */ + if (add_id_to_freelist(rinfo, s->associated_id)) + WARN(1, "%s: can't recycle the second part (id = %ld) of the request\n", + info->gd->disk_name, s->associated_id); + } + + data.s = s; num_sg = s->num_sg; if (bret->operation == BLKIF_OP_READ && info->feature_persistent) { @@ -1252,8 +1491,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, if (!info->feature_persistent) pr_alert_ratelimited("backed has not unmapped grant: %u\n", s->grants_used[i]->gref); - list_add(&s->grants_used[i]->node, &info->grants); - info->persistent_gnts_c++; + list_add(&s->grants_used[i]->node, &rinfo->grants); + rinfo->persistent_gnts_c++; } else { /* * If the grant is not mapped by the backend we end the @@ -1263,7 +1502,7 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, */ gnttab_end_foreign_access(s->grants_used[i]->gref, 0, 0UL); s->grants_used[i]->gref = GRANT_INVALID_REF; - list_add_tail(&s->grants_used[i]->node, &info->grants); + list_add_tail(&s->grants_used[i]->node, &rinfo->grants); } } if (s->req.operation == BLKIF_OP_INDIRECT) { @@ -1272,8 +1511,8 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, if (!info->feature_persistent) pr_alert_ratelimited("backed has not unmapped grant: %u\n", s->indirect_grants[i]->gref); - list_add(&s->indirect_grants[i]->node, &info->grants); - info->persistent_gnts_c++; + list_add(&s->indirect_grants[i]->node, &rinfo->grants); + rinfo->persistent_gnts_c++; } else { struct page *indirect_page; @@ -1284,13 +1523,15 @@ static void blkif_completion(struct blk_shadow *s, struct blkfront_info *info, */ if (!info->feature_persistent) { indirect_page = s->indirect_grants[i]->page; - list_add(&indirect_page->lru, &info->indirect_pages); + list_add(&indirect_page->lru, &rinfo->indirect_pages); } s->indirect_grants[i]->gref = GRANT_INVALID_REF; - list_add_tail(&s->indirect_grants[i]->node, &info->grants); + list_add_tail(&s->indirect_grants[i]->node, &rinfo->grants); } } } + + return 1; } static irqreturn_t blkif_interrupt(int irq, void *dev_id) @@ -1299,24 +1540,22 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) struct blkif_response *bret; RING_IDX i, rp; unsigned long flags; - struct blkfront_info *info = (struct blkfront_info *)dev_id; + struct blkfront_ring_info *rinfo = (struct blkfront_ring_info *)dev_id; + struct blkfront_info *info = rinfo->dev_info; int error; - spin_lock_irqsave(&info->io_lock, flags); - - if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) { - spin_unlock_irqrestore(&info->io_lock, flags); + if (unlikely(info->connected != BLKIF_STATE_CONNECTED)) return IRQ_HANDLED; - } + spin_lock_irqsave(&rinfo->ring_lock, flags); again: - rp = info->ring.sring->rsp_prod; + rp = rinfo->ring.sring->rsp_prod; rmb(); /* Ensure we see queued responses up to 'rp'. */ - for (i = info->ring.rsp_cons; i != rp; i++) { + for (i = rinfo->ring.rsp_cons; i != rp; i++) { unsigned long id; - bret = RING_GET_RESPONSE(&info->ring, i); + bret = RING_GET_RESPONSE(&rinfo->ring, i); id = bret->id; /* * The backend has messed up and given us an id that we would @@ -1330,12 +1569,18 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) * the id is busted. 
*/ continue; } - req = info->shadow[id].request; + req = rinfo->shadow[id].request; - if (bret->operation != BLKIF_OP_DISCARD) - blkif_completion(&info->shadow[id], info, bret); + if (bret->operation != BLKIF_OP_DISCARD) { + /* + * We may need to wait for an extra response if the + * I/O request is split in two. + */ + if (!blkif_completion(&id, rinfo, bret)) + continue; + } - if (add_id_to_freelist(info, id)) { + if (add_id_to_freelist(rinfo, id)) { WARN(1, "%s: response to %s (id %ld) couldn't be recycled!\n", info->gd->disk_name, op_name(bret->operation), id); continue; @@ -1364,7 +1609,7 @@ static irqreturn_t blkif_interrupt(int irq, void *dev_id) error = -EOPNOTSUPP; } if (unlikely(bret->status == BLKIF_RSP_ERROR && - info->shadow[id].req.u.rw.nr_segments == 0)) { + rinfo->shadow[id].req.u.rw.nr_segments == 0)) { printk(KERN_WARNING "blkfront: %s: empty %s op failed\n", info->gd->disk_name, op_name(bret->operation)); error = -EOPNOTSUPP; @@ -1389,34 +1634,35 @@ } } - info->ring.rsp_cons = i; + rinfo->ring.rsp_cons = i; - if (i != info->ring.req_prod_pvt) { + if (i != rinfo->ring.req_prod_pvt) { int more_to_do; - RING_FINAL_CHECK_FOR_RESPONSES(&info->ring, more_to_do); + RING_FINAL_CHECK_FOR_RESPONSES(&rinfo->ring, more_to_do); if (more_to_do) goto again; } else - info->ring.sring->rsp_event = i + 1; + rinfo->ring.sring->rsp_event = i + 1; - kick_pending_request_queues(info); + kick_pending_request_queues_locked(rinfo); - spin_unlock_irqrestore(&info->io_lock, flags); + spin_unlock_irqrestore(&rinfo->ring_lock, flags); return IRQ_HANDLED; } static int setup_blkring(struct xenbus_device *dev, - struct blkfront_info *info) + struct blkfront_ring_info *rinfo) { struct blkif_sring *sring; int err, i; + struct blkfront_info *info = rinfo->dev_info; unsigned long ring_size = info->nr_ring_pages * XEN_PAGE_SIZE; grant_ref_t gref[XENBUS_MAX_RING_GRANTS]; for (i = 0; i < info->nr_ring_pages; i++) - info->ring_ref[i] = GRANT_INVALID_REF; + rinfo->ring_ref[i] = GRANT_INVALID_REF; sring = (struct blkif_sring *)__get_free_pages(GFP_NOIO | __GFP_HIGH, get_order(ring_size)); @@ -1425,29 +1671,29 @@ static int setup_blkring(struct xenbus_device *dev, return -ENOMEM; } SHARED_RING_INIT(sring); - FRONT_RING_INIT(&info->ring, sring, ring_size); + FRONT_RING_INIT(&rinfo->ring, sring, ring_size); - err = xenbus_grant_ring(dev, info->ring.sring, info->nr_ring_pages, gref); + err = xenbus_grant_ring(dev, rinfo->ring.sring, info->nr_ring_pages, gref); if (err < 0) { free_pages((unsigned long)sring, get_order(ring_size)); - info->ring.sring = NULL; + rinfo->ring.sring = NULL; goto fail; } for (i = 0; i < info->nr_ring_pages; i++) - info->ring_ref[i] = gref[i]; + rinfo->ring_ref[i] = gref[i]; - err = xenbus_alloc_evtchn(dev, &info->evtchn); + err = xenbus_alloc_evtchn(dev, &rinfo->evtchn); if (err) goto fail; - err = bind_evtchn_to_irqhandler(info->evtchn, blkif_interrupt, 0, - "blkif", info); + err = bind_evtchn_to_irqhandler(rinfo->evtchn, blkif_interrupt, 0, + "blkif", rinfo); if (err <= 0) { xenbus_dev_fatal(dev, err, "bind_evtchn_to_irqhandler failed"); goto fail; } - info->irq = err; + rinfo->irq = err; return 0; fail: @@ -1455,6 +1701,53 @@ fail: return err; } +/* + * Write out per-ring/queue nodes including ring-ref and event-channel; each + * ring buffer may have multiple pages depending on ->nr_ring_pages.
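For a frontend negotiating, say, two queues with two ring pages each, write_per_ring_nodes() below publishes one xenstore subtree per queue. A small sketch that prints the resulting key layout (the nodename is hypothetical; the real paths come from xenbus):

#include <stdio.h>

int main(void)
{
	const char *nodename = "device/vbd/51712"; /* hypothetical */
	unsigned int nr_rings = 2, nr_ring_pages = 2, q, p;
	char path[64], key[20];

	for (q = 0; q < nr_rings; q++) {
		snprintf(path, sizeof(path), "%s/queue-%u", nodename, q);
		for (p = 0; p < nr_ring_pages; p++) {
			snprintf(key, sizeof(key), "ring-ref%u", p);
			printf("%s/%s\n", path, key);
		}
		printf("%s/event-channel\n", path);
	}
	return 0;
}

With a single queue the same keys are written directly under the device node instead, which is what keeps single-queue backends working unchanged.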
+ */ +static int write_per_ring_nodes(struct xenbus_transaction xbt, + struct blkfront_ring_info *rinfo, const char *dir) +{ + int err; + unsigned int i; + const char *message = NULL; + struct blkfront_info *info = rinfo->dev_info; + + if (info->nr_ring_pages == 1) { + err = xenbus_printf(xbt, dir, "ring-ref", "%u", rinfo->ring_ref[0]); + if (err) { + message = "writing ring-ref"; + goto abort_transaction; + } + } else { + for (i = 0; i < info->nr_ring_pages; i++) { + char ring_ref_name[RINGREF_NAME_LEN]; + + snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); + err = xenbus_printf(xbt, dir, ring_ref_name, + "%u", rinfo->ring_ref[i]); + if (err) { + message = "writing ring-ref"; + goto abort_transaction; + } + } + } + + err = xenbus_printf(xbt, dir, "event-channel", "%u", rinfo->evtchn); + if (err) { + message = "writing event-channel"; + goto abort_transaction; + } + + return 0; + +abort_transaction: + xenbus_transaction_end(xbt, 1); + if (message) + xenbus_dev_fatal(info->xbdev, err, "%s", message); + + return err; +} /* Common code used when first setting up, and when resuming. */ static int talk_to_blkback(struct xenbus_device *dev, @@ -1462,8 +1755,8 @@ static int talk_to_blkback(struct xenbus_device *dev, { const char *message = NULL; struct xenbus_transaction xbt; - int err, i; - unsigned int max_page_order = 0; + int err; + unsigned int i, max_page_order = 0; unsigned int ring_page_order = 0; err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, @@ -1475,10 +1768,14 @@ static int talk_to_blkback(struct xenbus_device *dev, info->nr_ring_pages = 1 << ring_page_order; } - /* Create shared ring, alloc event channel. */ - err = setup_blkring(dev, info); - if (err) - goto out; + for (i = 0; i < info->nr_rings; i++) { + struct blkfront_ring_info *rinfo = &info->rinfo[i]; + + /* Create shared ring, alloc event channel. 
*/ + err = setup_blkring(dev, rinfo); + if (err) + goto destroy_blkring; + } again: err = xenbus_transaction_start(&xbt); @@ -1487,38 +1784,49 @@ again: goto destroy_blkring; } - if (info->nr_ring_pages == 1) { - err = xenbus_printf(xbt, dev->nodename, - "ring-ref", "%u", info->ring_ref[0]); + if (info->nr_ring_pages > 1) { + err = xenbus_printf(xbt, dev->nodename, "ring-page-order", "%u", + ring_page_order); if (err) { - message = "writing ring-ref"; + message = "writing ring-page-order"; goto abort_transaction; } + } + + /* We already got the number of queues/rings in _probe */ + if (info->nr_rings == 1) { + err = write_per_ring_nodes(xbt, &info->rinfo[0], dev->nodename); + if (err) + goto destroy_blkring; } else { - err = xenbus_printf(xbt, dev->nodename, - "ring-page-order", "%u", ring_page_order); + char *path; + size_t pathsize; + + err = xenbus_printf(xbt, dev->nodename, "multi-queue-num-queues", "%u", + info->nr_rings); if (err) { - message = "writing ring-page-order"; + message = "writing multi-queue-num-queues"; goto abort_transaction; } - for (i = 0; i < info->nr_ring_pages; i++) { - char ring_ref_name[RINGREF_NAME_LEN]; + pathsize = strlen(dev->nodename) + QUEUE_NAME_LEN; + path = kmalloc(pathsize, GFP_KERNEL); + if (!path) { + err = -ENOMEM; + message = "ENOMEM while writing ring references"; + goto abort_transaction; + } - snprintf(ring_ref_name, RINGREF_NAME_LEN, "ring-ref%u", i); - err = xenbus_printf(xbt, dev->nodename, ring_ref_name, - "%u", info->ring_ref[i]); + for (i = 0; i < info->nr_rings; i++) { + memset(path, 0, pathsize); + snprintf(path, pathsize, "%s/queue-%u", dev->nodename, i); + err = write_per_ring_nodes(xbt, &info->rinfo[i], path); if (err) { - message = "writing ring-ref"; - goto abort_transaction; + kfree(path); + goto destroy_blkring; } } - } - err = xenbus_printf(xbt, dev->nodename, - "event-channel", "%u", info->evtchn); - if (err) { - message = "writing event-channel"; - goto abort_transaction; + kfree(path); } err = xenbus_printf(xbt, dev->nodename, "protocol", "%s", XEN_IO_PROTO_ABI_NATIVE); @@ -1540,9 +1848,14 @@ again: goto destroy_blkring; } - for (i = 0; i < BLK_RING_SIZE(info); i++) - info->shadow[i].req.u.rw.id = i+1; - info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; + for (i = 0; i < info->nr_rings; i++) { + unsigned int j; + struct blkfront_ring_info *rinfo = &info->rinfo[i]; + + for (j = 0; j < BLK_RING_SIZE(info); j++) + rinfo->shadow[j].req.u.rw.id = j + 1; + rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; + } xenbus_switch_state(dev, XenbusStateInitialised); return 0; @@ -1553,7 +1866,10 @@ again: xenbus_dev_fatal(dev, err, "%s", message); destroy_blkring: blkif_free(info, 0); - out: + + kfree(info); + dev_set_drvdata(&dev->dev, NULL); + return err; } @@ -1567,7 +1883,9 @@ static int blkfront_probe(struct xenbus_device *dev, const struct xenbus_device_id *id) { int err, vdevice; + unsigned int r_index; struct blkfront_info *info; + unsigned int backend_max_queues = 0; /* FIXME: Use dynamic device id if this is not set. */ err = xenbus_scanf(XBT_NIL, dev->nodename, @@ -1617,15 +1935,39 @@ static int blkfront_probe(struct xenbus_device *dev, return -ENOMEM; } - mutex_init(&info->mutex); - spin_lock_init(&info->io_lock); info->xbdev = dev; + /* Check if backend supports multiple queues. 
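The probe code below negotiates the ring count as the minimum of what the backend advertises and the max_queues module parameter, clamped to at least one. Sketched on its own with hypothetical values:

#include <stdio.h>

static unsigned int min_u(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

int main(void)
{
	/* Hypothetical negotiation: the backend advertises 8 queues,
	 * the frontend's max_queues parameter allows only 4. */
	unsigned int backend_max_queues = 8;
	unsigned int xen_blkif_max_queues = 4;
	unsigned int nr_rings = min_u(backend_max_queues, xen_blkif_max_queues);

	if (!nr_rings)          /* the backend published nothing usable */
		nr_rings = 1;   /* we always need at least one ring */

	printf("using %u ring(s)\n", nr_rings);
	return 0;
}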
*/ + err = xenbus_scanf(XBT_NIL, info->xbdev->otherend, + "multi-queue-max-queues", "%u", &backend_max_queues); + if (err < 0) + backend_max_queues = 1; + + info->nr_rings = min(backend_max_queues, xen_blkif_max_queues); + /* We need at least one ring. */ + if (!info->nr_rings) + info->nr_rings = 1; + + info->rinfo = kzalloc(sizeof(struct blkfront_ring_info) * info->nr_rings, GFP_KERNEL); + if (!info->rinfo) { + xenbus_dev_fatal(dev, -ENOMEM, "allocating ring_info structure"); + kfree(info); + return -ENOMEM; + } + + for (r_index = 0; r_index < info->nr_rings; r_index++) { + struct blkfront_ring_info *rinfo; + + rinfo = &info->rinfo[r_index]; + INIT_LIST_HEAD(&rinfo->indirect_pages); + INIT_LIST_HEAD(&rinfo->grants); + rinfo->dev_info = info; + INIT_WORK(&rinfo->work, blkif_restart_queue); + spin_lock_init(&rinfo->ring_lock); + } + + mutex_init(&info->mutex); info->vdevice = vdevice; - INIT_LIST_HEAD(&info->grants); - INIT_LIST_HEAD(&info->indirect_pages); - info->persistent_gnts_c = 0; info->connected = BLKIF_STATE_DISCONNECTED; - INIT_WORK(&info->work, blkif_restart_queue); /* Front end dir is a number, which is used as the id. */ info->handle = simple_strtoul(strrchr(dev->nodename, '/')+1, NULL, 0); @@ -1649,7 +1991,7 @@ static void split_bio_end(struct bio *bio) static int blkif_recover(struct blkfront_info *info) { - int i; + unsigned int i, r_index; struct request *req, *n; struct blk_shadow *copy; int rc; @@ -1660,64 +2002,73 @@ static int blkif_recover(struct blkfront_info *info) struct split_bio *split_bio; struct list_head requests; - /* Stage 1: Make a safe copy of the shadow state. */ - copy = kmemdup(info->shadow, sizeof(info->shadow), - GFP_NOIO | __GFP_REPEAT | __GFP_HIGH); - if (!copy) - return -ENOMEM; - - /* Stage 2: Set up free list. */ - memset(&info->shadow, 0, sizeof(info->shadow)); - for (i = 0; i < BLK_RING_SIZE(info); i++) - info->shadow[i].req.u.rw.id = i+1; - info->shadow_free = info->ring.req_prod_pvt; - info->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; - - rc = blkfront_gather_backend_features(info); - if (rc) { - kfree(copy); - return rc; - } - + blkfront_gather_backend_features(info); segs = info->max_indirect_segments ? : BLKIF_MAX_SEGMENTS_PER_REQUEST; blk_queue_max_segments(info->rq, segs); bio_list_init(&bio_list); INIT_LIST_HEAD(&requests); - for (i = 0; i < BLK_RING_SIZE(info); i++) { - /* Not in use? */ - if (!copy[i].request) - continue; - /* - * Get the bios in the request so we can re-queue them. - */ - if (copy[i].request->cmd_flags & - (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { + for (r_index = 0; r_index < info->nr_rings; r_index++) { + struct blkfront_ring_info *rinfo; + + rinfo = &info->rinfo[r_index]; + /* Stage 1: Make a safe copy of the shadow state. */ + copy = kmemdup(rinfo->shadow, sizeof(rinfo->shadow), + GFP_NOIO | __GFP_REPEAT | __GFP_HIGH); + if (!copy) + return -ENOMEM; + + /* Stage 2: Set up free list. */ + memset(&rinfo->shadow, 0, sizeof(rinfo->shadow)); + for (i = 0; i < BLK_RING_SIZE(info); i++) + rinfo->shadow[i].req.u.rw.id = i+1; + rinfo->shadow_free = rinfo->ring.req_prod_pvt; + rinfo->shadow[BLK_RING_SIZE(info)-1].req.u.rw.id = 0x0fffffff; + + rc = blkfront_setup_indirect(rinfo); + if (rc) { + kfree(copy); + return rc; + } + + for (i = 0; i < BLK_RING_SIZE(info); i++) { + /* Not in use? */ + if (!copy[i].request) + continue; + /* - * Flush operations don't contain bios, so - * we need to requeue the whole request + * Get the bios in the request so we can re-queue them. 
*/ - list_add(&copy[i].request->queuelist, &requests); - continue; + if (copy[i].request->cmd_flags & + (REQ_FLUSH | REQ_FUA | REQ_DISCARD | REQ_SECURE)) { + /* + * Flush operations don't contain bios, so + * we need to requeue the whole request + */ + list_add(&copy[i].request->queuelist, &requests); + continue; + } + merge_bio.head = copy[i].request->bio; + merge_bio.tail = copy[i].request->biotail; + bio_list_merge(&bio_list, &merge_bio); + copy[i].request->bio = NULL; + blk_end_request_all(copy[i].request, 0); } - merge_bio.head = copy[i].request->bio; - merge_bio.tail = copy[i].request->biotail; - bio_list_merge(&bio_list, &merge_bio); - copy[i].request->bio = NULL; - blk_end_request_all(copy[i].request, 0); - } - - kfree(copy); + kfree(copy); + } xenbus_switch_state(info->xbdev, XenbusStateConnected); - spin_lock_irq(&info->io_lock); - /* Now safe for us to use the shared ring */ info->connected = BLKIF_STATE_CONNECTED; - /* Kick any other new requests queued since we resumed */ - kick_pending_request_queues(info); + for (r_index = 0; r_index < info->nr_rings; r_index++) { + struct blkfront_ring_info *rinfo; + + rinfo = &info->rinfo[r_index]; + /* Kick any other new requests queued since we resumed */ + kick_pending_request_queues(rinfo); + } list_for_each_entry_safe(req, n, &requests, queuelist) { /* Requeue pending requests (flush or discard) */ @@ -1725,7 +2076,6 @@ BUG_ON(req->nr_phys_segments > segs); blk_mq_requeue_request(req); } - spin_unlock_irq(&info->io_lock); blk_mq_kick_requeue_list(info->rq); while ((bio = bio_list_pop(&bio_list)) != NULL) { @@ -1790,8 +2140,7 @@ static int blkfront_resume(struct xenbus_device *dev) return err; } -static void -blkfront_closing(struct blkfront_info *info) +static void blkfront_closing(struct blkfront_info *info) { struct xenbus_device *xbdev = info->xbdev; struct block_device *bdev = NULL; @@ -1851,18 +2200,29 @@ static void blkfront_setup_discard(struct blkfront_info *info) info->feature_secdiscard = !!discard_secure; } -static int blkfront_setup_indirect(struct blkfront_info *info) +static int blkfront_setup_indirect(struct blkfront_ring_info *rinfo) { unsigned int psegs, grants; int err, i; + struct blkfront_info *info = rinfo->dev_info; - if (info->max_indirect_segments == 0) - grants = BLKIF_MAX_SEGMENTS_PER_REQUEST; + if (info->max_indirect_segments == 0) { + if (!HAS_EXTRA_REQ) + grants = BLKIF_MAX_SEGMENTS_PER_REQUEST; + else { + /* + * When an extra req is required, the maximum + * number of grants supported is related to the + * size of the Linux block segment.
+ */ + grants = GRANTS_PER_PSEG; + } + } else grants = info->max_indirect_segments; psegs = grants / GRANTS_PER_PSEG; - err = fill_grant_buffer(info, + err = fill_grant_buffer(rinfo, (grants + INDIRECT_GREFS(grants)) * BLK_RING_SIZE(info)); if (err) goto out_of_memory; @@ -1875,31 +2235,31 @@ static int blkfront_setup_indirect(struct blkfront_info *info) */ int num = INDIRECT_GREFS(grants) * BLK_RING_SIZE(info); - BUG_ON(!list_empty(&info->indirect_pages)); + BUG_ON(!list_empty(&rinfo->indirect_pages)); for (i = 0; i < num; i++) { struct page *indirect_page = alloc_page(GFP_NOIO); if (!indirect_page) goto out_of_memory; - list_add(&indirect_page->lru, &info->indirect_pages); + list_add(&indirect_page->lru, &rinfo->indirect_pages); } } for (i = 0; i < BLK_RING_SIZE(info); i++) { - info->shadow[i].grants_used = kzalloc( - sizeof(info->shadow[i].grants_used[0]) * grants, + rinfo->shadow[i].grants_used = kzalloc( + sizeof(rinfo->shadow[i].grants_used[0]) * grants, GFP_NOIO); - info->shadow[i].sg = kzalloc(sizeof(info->shadow[i].sg[0]) * psegs, GFP_NOIO); + rinfo->shadow[i].sg = kzalloc(sizeof(rinfo->shadow[i].sg[0]) * psegs, GFP_NOIO); if (info->max_indirect_segments) - info->shadow[i].indirect_grants = kzalloc( - sizeof(info->shadow[i].indirect_grants[0]) * + rinfo->shadow[i].indirect_grants = kzalloc( + sizeof(rinfo->shadow[i].indirect_grants[0]) * INDIRECT_GREFS(grants), GFP_NOIO); - if ((info->shadow[i].grants_used == NULL) || - (info->shadow[i].sg == NULL) || + if ((rinfo->shadow[i].grants_used == NULL) || + (rinfo->shadow[i].sg == NULL) || (info->max_indirect_segments && - (info->shadow[i].indirect_grants == NULL))) + (rinfo->shadow[i].indirect_grants == NULL))) goto out_of_memory; - sg_init_table(info->shadow[i].sg, psegs); + sg_init_table(rinfo->shadow[i].sg, psegs); } @@ -1907,16 +2267,16 @@ static int blkfront_setup_indirect(struct blkfront_info *info) out_of_memory: for (i = 0; i < BLK_RING_SIZE(info); i++) { - kfree(info->shadow[i].grants_used); - info->shadow[i].grants_used = NULL; - kfree(info->shadow[i].sg); - info->shadow[i].sg = NULL; - kfree(info->shadow[i].indirect_grants); - info->shadow[i].indirect_grants = NULL; - } - if (!list_empty(&info->indirect_pages)) { + kfree(rinfo->shadow[i].grants_used); + rinfo->shadow[i].grants_used = NULL; + kfree(rinfo->shadow[i].sg); + rinfo->shadow[i].sg = NULL; + kfree(rinfo->shadow[i].indirect_grants); + rinfo->shadow[i].indirect_grants = NULL; + } + if (!list_empty(&rinfo->indirect_pages)) { struct page *indirect_page, *n; - list_for_each_entry_safe(indirect_page, n, &info->indirect_pages, lru) { + list_for_each_entry_safe(indirect_page, n, &rinfo->indirect_pages, lru) { list_del(&indirect_page->lru); __free_page(indirect_page); } @@ -1927,7 +2287,7 @@ out_of_memory: /* * Gather all backend feature-* */ -static int blkfront_gather_backend_features(struct blkfront_info *info) +static void blkfront_gather_backend_features(struct blkfront_info *info) { int err; int barrier, flush, discard, persistent; @@ -1982,8 +2342,6 @@ static int blkfront_gather_backend_features(struct blkfront_info *info) else info->max_indirect_segments = min(indirect_segments, xen_blkif_max_segments); - - return blkfront_setup_indirect(info); } /* @@ -1996,7 +2354,7 @@ static void blkfront_connect(struct blkfront_info *info) unsigned long sector_size; unsigned int physical_sector_size; unsigned int binfo; - int err; + int err, i; switch (info->connected) { case BLKIF_STATE_CONNECTED: @@ -2053,11 +2411,15 @@ static void blkfront_connect(struct blkfront_info *info) if 
(err != 1) physical_sector_size = sector_size; - err = blkfront_gather_backend_features(info); - if (err) { - xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s", - info->xbdev->otherend); - return; + blkfront_gather_backend_features(info); + for (i = 0; i < info->nr_rings; i++) { + err = blkfront_setup_indirect(&info->rinfo[i]); + if (err) { + xenbus_dev_fatal(info->xbdev, err, "setup_indirect at %s", + info->xbdev->otherend); + blkif_free(info, 0); + break; + } } err = xlvbd_alloc_gendisk(sectors, info, binfo, sector_size, @@ -2071,10 +2433,9 @@ static void blkfront_connect(struct blkfront_info *info) xenbus_switch_state(info->xbdev, XenbusStateConnected); /* Kick pending requests. */ - spin_lock_irq(&info->io_lock); info->connected = BLKIF_STATE_CONNECTED; - kick_pending_request_queues(info); - spin_unlock_irq(&info->io_lock); + for (i = 0; i < info->nr_rings; i++) + kick_pending_request_queues(&info->rinfo[i]); add_disk(info->gd); @@ -2095,11 +2456,8 @@ static void blkback_changed(struct xenbus_device *dev, case XenbusStateInitWait: if (dev->state != XenbusStateInitialising) break; - if (talk_to_blkback(dev, info)) { - kfree(info); - dev_set_drvdata(&dev->dev, NULL); + if (talk_to_blkback(dev, info)) break; - } case XenbusStateInitialising: case XenbusStateInitialised: case XenbusStateReconfiguring: @@ -2108,6 +2466,10 @@ static void blkback_changed(struct xenbus_device *dev, break; case XenbusStateConnected: + if (dev->state != XenbusStateInitialised) { + if (talk_to_blkback(dev, info)) + break; + } blkfront_connect(info); break; @@ -2281,6 +2643,7 @@ static struct xenbus_driver blkfront_driver = { static int __init xlblk_init(void) { int ret; + int nr_cpus = num_online_cpus(); if (!xen_domain()) return -ENODEV; @@ -2288,7 +2651,13 @@ static int __init xlblk_init(void) if (xen_blkif_max_ring_order > XENBUS_MAX_RING_GRANT_ORDER) { pr_info("Invalid max_ring_order (%d), will use default max: %d.\n", xen_blkif_max_ring_order, XENBUS_MAX_RING_GRANT_ORDER); - xen_blkif_max_ring_order = 0; + xen_blkif_max_ring_order = XENBUS_MAX_RING_GRANT_ORDER; + } + + if (xen_blkif_max_queues > nr_cpus) { + pr_info("Invalid max_queues (%d), will use default max: %d.\n", + xen_blkif_max_queues, nr_cpus); + xen_blkif_max_queues = nr_cpus; } if (!xen_has_pv_disk_devices()) diff --git a/drivers/char/mem.c b/drivers/char/mem.c index 6b1721f978c2..4f6f94c43412 100644 --- a/drivers/char/mem.c +++ b/drivers/char/mem.c @@ -689,7 +689,7 @@ static loff_t memory_lseek(struct file *file, loff_t offset, int orig) { loff_t ret; - mutex_lock(&file_inode(file)->i_mutex); + inode_lock(file_inode(file)); switch (orig) { case SEEK_CUR: offset += file->f_pos; @@ -706,7 +706,7 @@ static loff_t memory_lseek(struct file *file, loff_t offset, int orig) default: ret = -EINVAL; } - mutex_unlock(&file_inode(file)->i_mutex); + inode_unlock(file_inode(file)); return ret; } diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c index f1d7fa45c275..f3f92d5fcda0 100644 --- a/drivers/char/mspec.c +++ b/drivers/char/mspec.c @@ -93,14 +93,11 @@ struct vma_data { spinlock_t lock; /* Serialize access to this structure. */ int count; /* Number of pages allocated. */ enum mspec_page_type type; /* Type of pages allocated. */ - int flags; /* See VMD_xxx below. */ unsigned long vm_start; /* Original (unsplit) base. */ unsigned long vm_end; /* Original (unsplit) end. */ unsigned long maddr[0]; /* Array of MSPEC addresses. 
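The mspec and drm_hashtab hunks below drop their hand-rolled "was it vmalloc'd?" bookkeeping because kvfree() frees either kind of allocation. A kernel-style sketch of the idiom they adopt (illustrative only, not a standalone program):

#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/vmalloc.h>

/* Small tables come from the slab, big ones from vmalloc space. */
static void *alloc_table(size_t size)
{
	if (size <= PAGE_SIZE)
		return kzalloc(size, GFP_KERNEL);
	return vzalloc(size);
}

/* kvfree() inspects the address and picks kfree() or vfree(), so the
 * caller no longer needs a flag recording which allocator was used. */
static void free_table(void *table)
{
	kvfree(table);
}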
*/ }; -#define VMD_VMALLOCED 0x1 /* vmalloc'd rather than kmalloc'd */ - /* used on shub2 to clear FOP cache in the HUB */ static unsigned long scratch_page[MAX_NUMNODES]; #define SH2_AMO_CACHE_ENTRIES 4 @@ -185,10 +182,7 @@ mspec_close(struct vm_area_struct *vma) "failed to zero page %ld\n", my_page); } - if (vdata->flags & VMD_VMALLOCED) - vfree(vdata); - else - kfree(vdata); + kvfree(vdata); } /* @@ -256,7 +250,7 @@ mspec_mmap(struct file *file, struct vm_area_struct *vma, enum mspec_page_type type) { struct vma_data *vdata; - int pages, vdata_size, flags = 0; + int pages, vdata_size; if (vma->vm_pgoff != 0) return -EINVAL; @@ -271,16 +265,13 @@ mspec_mmap(struct file *file, struct vm_area_struct *vma, vdata_size = sizeof(struct vma_data) + pages * sizeof(long); if (vdata_size <= PAGE_SIZE) vdata = kzalloc(vdata_size, GFP_KERNEL); - else { + else vdata = vzalloc(vdata_size); - flags = VMD_VMALLOCED; - } if (!vdata) return -ENOMEM; vdata->vm_start = vma->vm_start; vdata->vm_end = vma->vm_end; - vdata->flags = flags; vdata->type = type; spin_lock_init(&vdata->lock); atomic_set(&vdata->refcnt, 1); diff --git a/drivers/char/ps3flash.c b/drivers/char/ps3flash.c index 0b311fa277ef..b526dc15c271 100644 --- a/drivers/char/ps3flash.c +++ b/drivers/char/ps3flash.c @@ -290,9 +290,9 @@ static int ps3flash_fsync(struct file *file, loff_t start, loff_t end, int datas { struct inode *inode = file_inode(file); int err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = ps3flash_writeback(ps3flash_dev); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } diff --git a/drivers/crypto/Kconfig b/drivers/crypto/Kconfig index 3dd69df9c970..07d494276aad 100644 --- a/drivers/crypto/Kconfig +++ b/drivers/crypto/Kconfig @@ -381,6 +381,7 @@ config CRYPTO_DEV_BFIN_CRC config CRYPTO_DEV_ATMEL_AES tristate "Support for Atmel AES hw accelerator" + depends on HAS_DMA depends on AT_XDMAC || AT_HDMAC || COMPILE_TEST select CRYPTO_AES select CRYPTO_AEAD diff --git a/drivers/crypto/atmel-aes.c b/drivers/crypto/atmel-aes.c index 5621612ee921..6dd3317ca365 100644 --- a/drivers/crypto/atmel-aes.c +++ b/drivers/crypto/atmel-aes.c @@ -280,6 +280,7 @@ static const char *atmel_aes_reg_name(u32 offset, char *tmp, size_t sz) case AES_GCMHR(2): case AES_GCMHR(3): snprintf(tmp, sz, "GCMHR[%u]", (offset - AES_GCMHR(0)) >> 2); + break; default: snprintf(tmp, sz, "0x%02x", offset); diff --git a/drivers/crypto/qat/qat_common/qat_hal.c b/drivers/crypto/qat/qat_common/qat_hal.c index 0ac0ba867611..1e480f140663 100644 --- a/drivers/crypto/qat/qat_common/qat_hal.c +++ b/drivers/crypto/qat/qat_common/qat_hal.c @@ -389,7 +389,7 @@ static int qat_hal_check_ae_alive(struct icp_qat_fw_loader_handle *handle) { unsigned int base_cnt, cur_cnt; unsigned char ae; - unsigned int times = MAX_RETRY_TIMES; + int times = MAX_RETRY_TIMES; for (ae = 0; ae < handle->hal_handle->ae_max_num; ae++) { qat_hal_rd_ae_csr(handle, ae, PROFILE_COUNT, @@ -402,7 +402,7 @@ static int qat_hal_check_ae_alive(struct icp_qat_fw_loader_handle *handle) cur_cnt &= 0xffff; } while (times-- && (cur_cnt == base_cnt)); - if (!times) { + if (times < 0) { pr_err("QAT: AE%d is inactive!!\n", ae); return -EFAULT; } @@ -453,7 +453,11 @@ static int qat_hal_init_esram(struct icp_qat_fw_loader_handle *handle) void __iomem *csr_addr = (void __iomem *)((uintptr_t)handle->hal_ep_csr_addr_v + ESRAM_AUTO_INIT_CSR_OFFSET); - unsigned int csr_val, times = 30; + unsigned int csr_val; + int times = 30; + + if (handle->pci_dev->device == ADF_C3XXX_PCI_DEVICE_ID) + return 0; 
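The qat_hal hunks around here change the retry counters from unsigned to signed and test `times < 0`. The reason is that the counter is post-decremented inside the loop condition, so it ends at -1 on exhaustion and at 0 on a success during the final try; the old `!times` test got both cases wrong, and an unsigned counter wraps to UINT_MAX instead of going negative. A small demonstration with illustrative values:

#include <stdio.h>

int main(void)
{
	int times, tries, done;

	/* Case 1: the condition never comes true. The post-decrement in
	 * the loop test leaves the signed counter at -1 on exhaustion. */
	times = 3;
	done = 0;
	do {
		/* stand-in for polling some hardware register */
	} while (times-- && !done);
	printf("exhausted: %s (times = %d)\n", times < 0 ? "yes" : "no", times);

	/* Case 2: success exactly on the last allowed try leaves times at
	 * 0, so the old "!times" test reported a bogus timeout here. */
	times = 3;
	for (tries = 0; ; tries++) {
		done = (tries == 2); /* succeeds on the third try */
		if (!(times-- && !done))
			break;
	}
	printf("old !times says timeout: %d, new times < 0 says: %d\n",
	       !times, times < 0);
	return 0;
}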
csr_val = ADF_CSR_RD(csr_addr, 0); if ((csr_val & ESRAM_AUTO_TINIT) && (csr_val & ESRAM_AUTO_TINIT_DONE)) @@ -467,7 +471,7 @@ static int qat_hal_init_esram(struct icp_qat_fw_loader_handle *handle) qat_hal_wait_cycles(handle, 0, ESRAM_AUTO_INIT_USED_CYCLES, 0); csr_val = ADF_CSR_RD(csr_addr, 0); } while (!(csr_val & ESRAM_AUTO_TINIT_DONE) && times--); - if ((!times)) { + if ((times < 0)) { pr_err("QAT: Fail to init eSram!\n"); return -EFAULT; } @@ -658,7 +662,7 @@ static int qat_hal_clear_gpr(struct icp_qat_fw_loader_handle *handle) ret = qat_hal_wait_cycles(handle, ae, 20, 1); } while (ret && times--); - if (!times) { + if (times < 0) { pr_err("QAT: clear GPR of AE %d failed", ae); return -EINVAL; } @@ -693,14 +697,12 @@ int qat_hal_init(struct adf_accel_dev *accel_dev) struct adf_hw_device_data *hw_data = accel_dev->hw_device; struct adf_bar *misc_bar = &pci_info->pci_bars[hw_data->get_misc_bar_id(hw_data)]; - struct adf_bar *sram_bar = - &pci_info->pci_bars[hw_data->get_sram_bar_id(hw_data)]; + struct adf_bar *sram_bar; handle = kzalloc(sizeof(*handle), GFP_KERNEL); if (!handle) return -ENOMEM; - handle->hal_sram_addr_v = sram_bar->virt_addr; handle->hal_cap_g_ctl_csr_addr_v = (void __iomem *)((uintptr_t)misc_bar->virt_addr + ICP_QAT_CAP_OFFSET); @@ -714,6 +716,11 @@ int qat_hal_init(struct adf_accel_dev *accel_dev) (void __iomem *)((uintptr_t)handle->hal_cap_ae_xfer_csr_addr_v + LOCAL_TO_XFER_REG_OFFSET); handle->pci_dev = pci_info->pci_dev; + if (handle->pci_dev->device != ADF_C3XXX_PCI_DEVICE_ID) { + sram_bar = + &pci_info->pci_bars[hw_data->get_sram_bar_id(hw_data)]; + handle->hal_sram_addr_v = sram_bar->virt_addr; + } handle->fw_auth = (handle->pci_dev->device == ADF_DH895XCC_PCI_DEVICE_ID) ? false : true; handle->hal_handle = kzalloc(sizeof(*handle->hal_handle), GFP_KERNEL); diff --git a/drivers/gpu/drm/drm_hashtab.c b/drivers/gpu/drm/drm_hashtab.c index c3b80fd65d62..7b30b307674b 100644 --- a/drivers/gpu/drm/drm_hashtab.c +++ b/drivers/gpu/drm/drm_hashtab.c @@ -198,10 +198,7 @@ EXPORT_SYMBOL(drm_ht_remove_item); void drm_ht_remove(struct drm_open_hash *ht) { if (ht->table) { - if ((PAGE_SIZE / sizeof(*ht->table)) >> ht->order) - kfree(ht->table); - else - vfree(ht->table); + kvfree(ht->table); ht->table = NULL; } } diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig index aa26f3c3416b..8a8440c0eed1 100644 --- a/drivers/infiniband/Kconfig +++ b/drivers/infiniband/Kconfig @@ -5,6 +5,7 @@ menuconfig INFINIBAND depends on NET depends on INET depends on m || IPV6 != m + select IRQ_POLL ---help--- Core support for InfiniBand (IB). Make sure to also select any protocols you wish to use as well as drivers for your @@ -54,6 +55,15 @@ config INFINIBAND_ADDR_TRANS depends on INFINIBAND default y +config INFINIBAND_ADDR_TRANS_CONFIGFS + bool + depends on INFINIBAND_ADDR_TRANS && CONFIGFS_FS && !(INFINIBAND=y && CONFIGFS_FS=m) + default y + ---help--- + ConfigFS support for RDMA communication manager (CM). + This allows the user to configure the default GID type that the CM + uses for each device when initiating new connections. +
+ source "drivers/infiniband/hw/mthca/Kconfig" source "drivers/infiniband/hw/qib/Kconfig" source "drivers/infiniband/hw/cxgb3/Kconfig" diff --git a/drivers/infiniband/core/Makefile b/drivers/infiniband/core/Makefile index d43a8994ac5c..f818538a7f4e 100644 --- a/drivers/infiniband/core/Makefile +++ b/drivers/infiniband/core/Makefile @@ -8,7 +8,7 @@ obj-$(CONFIG_INFINIBAND_USER_MAD) += ib_umad.o obj-$(CONFIG_INFINIBAND_USER_ACCESS) += ib_uverbs.o ib_ucm.o \ $(user_access-y) -ib_core-y := packer.o ud_header.o verbs.o sysfs.o \ +ib_core-y := packer.o ud_header.o verbs.o cq.o sysfs.o \ device.o fmr_pool.o cache.o netlink.o \ roce_gid_mgmt.o ib_core-$(CONFIG_INFINIBAND_USER_MEM) += umem.o @@ -24,6 +24,8 @@ iw_cm-y := iwcm.o iwpm_util.o iwpm_msg.o rdma_cm-y := cma.o +rdma_cm-$(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS) += cma_configfs.o + rdma_ucm-y := ucma.o ib_addr-y := addr.o diff --git a/drivers/infiniband/core/addr.c b/drivers/infiniband/core/addr.c index 34b1adad07aa..337353d86cfa 100644 --- a/drivers/infiniband/core/addr.c +++ b/drivers/infiniband/core/addr.c @@ -121,7 +121,8 @@ int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev, } EXPORT_SYMBOL(rdma_copy_addr); -int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr, +int rdma_translate_ip(const struct sockaddr *addr, + struct rdma_dev_addr *dev_addr, u16 *vlan_id) { struct net_device *dev; @@ -139,7 +140,7 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr, switch (addr->sa_family) { case AF_INET: dev = ip_dev_find(dev_addr->net, - ((struct sockaddr_in *) addr)->sin_addr.s_addr); + ((const struct sockaddr_in *)addr)->sin_addr.s_addr); if (!dev) return ret; @@ -154,7 +155,7 @@ int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr, rcu_read_lock(); for_each_netdev_rcu(dev_addr->net, dev) { if (ipv6_chk_addr(dev_addr->net, - &((struct sockaddr_in6 *) addr)->sin6_addr, + &((const struct sockaddr_in6 *)addr)->sin6_addr, dev, 1)) { ret = rdma_copy_addr(dev_addr, dev, NULL); if (vlan_id) @@ -198,7 +199,8 @@ static void queue_req(struct addr_req *req) mutex_unlock(&lock); } -static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, void *daddr) +static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, + const void *daddr) { struct neighbour *n; int ret; @@ -222,8 +224,9 @@ static int dst_fetch_ha(struct dst_entry *dst, struct rdma_dev_addr *dev_addr, v } static int addr4_resolve(struct sockaddr_in *src_in, - struct sockaddr_in *dst_in, - struct rdma_dev_addr *addr) + const struct sockaddr_in *dst_in, + struct rdma_dev_addr *addr, + struct rtable **prt) { __be32 src_ip = src_in->sin_addr.s_addr; __be32 dst_ip = dst_in->sin_addr.s_addr; @@ -243,33 +246,29 @@ static int addr4_resolve(struct sockaddr_in *src_in, src_in->sin_family = AF_INET; src_in->sin_addr.s_addr = fl4.saddr; - if (rt->dst.dev->flags & IFF_LOOPBACK) { - ret = rdma_translate_ip((struct sockaddr *)dst_in, addr, NULL); - if (!ret) - memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); - goto put; - } + /* If there's a gateway, we're definitely in RoCE v2 (as RoCE v1 isn't + * routable) and we could set the network type accordingly. 
+ */ + if (rt->rt_uses_gateway) + addr->network = RDMA_NETWORK_IPV4; - /* If the device does ARP internally, return 'done' */ - if (rt->dst.dev->flags & IFF_NOARP) { - ret = rdma_copy_addr(addr, rt->dst.dev, NULL); - goto put; - } + addr->hoplimit = ip4_dst_hoplimit(&rt->dst); - ret = dst_fetch_ha(&rt->dst, addr, &fl4.daddr); -put: - ip_rt_put(rt); + *prt = rt; + return 0; out: return ret; } #if IS_ENABLED(CONFIG_IPV6) static int addr6_resolve(struct sockaddr_in6 *src_in, - struct sockaddr_in6 *dst_in, - struct rdma_dev_addr *addr) + const struct sockaddr_in6 *dst_in, + struct rdma_dev_addr *addr, + struct dst_entry **pdst) { struct flowi6 fl6; struct dst_entry *dst; + struct rt6_info *rt; int ret; memset(&fl6, 0, sizeof fl6); @@ -281,6 +280,7 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, if ((ret = dst->error)) goto put; + rt = (struct rt6_info *)dst; if (ipv6_addr_any(&fl6.saddr)) { ret = ipv6_dev_get_saddr(addr->net, ip6_dst_idev(dst)->dev, &fl6.daddr, 0, &fl6.saddr); @@ -291,43 +291,111 @@ static int addr6_resolve(struct sockaddr_in6 *src_in, src_in->sin6_addr = fl6.saddr; } - if (dst->dev->flags & IFF_LOOPBACK) { - ret = rdma_translate_ip((struct sockaddr *)dst_in, addr, NULL); - if (!ret) - memcpy(addr->dst_dev_addr, addr->src_dev_addr, MAX_ADDR_LEN); - goto put; - } + /* If there's a gateway, we're definitely in RoCE v2 (as RoCE v1 isn't + * routable) and we could set the network type accordingly. + */ + if (rt->rt6i_flags & RTF_GATEWAY) + addr->network = RDMA_NETWORK_IPV6; - /* If the device does ARP internally, return 'done' */ - if (dst->dev->flags & IFF_NOARP) { - ret = rdma_copy_addr(addr, dst->dev, NULL); - goto put; - } + addr->hoplimit = ip6_dst_hoplimit(dst); - ret = dst_fetch_ha(dst, addr, &fl6.daddr); + *pdst = dst; + return 0; put: dst_release(dst); return ret; } #else static int addr6_resolve(struct sockaddr_in6 *src_in, - struct sockaddr_in6 *dst_in, - struct rdma_dev_addr *addr) + const struct sockaddr_in6 *dst_in, + struct rdma_dev_addr *addr, + struct dst_entry **pdst) { return -EADDRNOTAVAIL; } #endif +static int addr_resolve_neigh(struct dst_entry *dst, + const struct sockaddr *dst_in, + struct rdma_dev_addr *addr) +{ + if (dst->dev->flags & IFF_LOOPBACK) { + int ret; + + ret = rdma_translate_ip(dst_in, addr, NULL); + if (!ret) + memcpy(addr->dst_dev_addr, addr->src_dev_addr, + MAX_ADDR_LEN); + + return ret; + } + + /* If the device doesn't do ARP internally */ + if (!(dst->dev->flags & IFF_NOARP)) { + const struct sockaddr_in *dst_in4 = + (const struct sockaddr_in *)dst_in; + const struct sockaddr_in6 *dst_in6 = + (const struct sockaddr_in6 *)dst_in; + + return dst_fetch_ha(dst, addr, + dst_in->sa_family == AF_INET ? 
+ (const void *)&dst_in4->sin_addr.s_addr : + (const void *)&dst_in6->sin6_addr); + } + + return rdma_copy_addr(addr, dst->dev, NULL); +} + static int addr_resolve(struct sockaddr *src_in, - struct sockaddr *dst_in, - struct rdma_dev_addr *addr) + const struct sockaddr *dst_in, + struct rdma_dev_addr *addr, + bool resolve_neigh) { + struct net_device *ndev; + struct dst_entry *dst; + int ret; + if (src_in->sa_family == AF_INET) { - return addr4_resolve((struct sockaddr_in *) src_in, - (struct sockaddr_in *) dst_in, addr); - } else - return addr6_resolve((struct sockaddr_in6 *) src_in, - (struct sockaddr_in6 *) dst_in, addr); + struct rtable *rt = NULL; + const struct sockaddr_in *dst_in4 = + (const struct sockaddr_in *)dst_in; + + ret = addr4_resolve((struct sockaddr_in *)src_in, + dst_in4, addr, &rt); + if (ret) + return ret; + + if (resolve_neigh) + ret = addr_resolve_neigh(&rt->dst, dst_in, addr); + + ndev = rt->dst.dev; + dev_hold(ndev); + + ip_rt_put(rt); + } else { + const struct sockaddr_in6 *dst_in6 = + (const struct sockaddr_in6 *)dst_in; + + ret = addr6_resolve((struct sockaddr_in6 *)src_in, + dst_in6, addr, + &dst); + if (ret) + return ret; + + if (resolve_neigh) + ret = addr_resolve_neigh(dst, dst_in, addr); + + ndev = dst->dev; + dev_hold(ndev); + + dst_release(dst); + } + + addr->bound_dev_if = ndev->ifindex; + addr->net = dev_net(ndev); + dev_put(ndev); + + return ret; } static void process_req(struct work_struct *work) @@ -343,7 +411,8 @@ static void process_req(struct work_struct *work) if (req->status == -ENODATA) { src_in = (struct sockaddr *) &req->src_addr; dst_in = (struct sockaddr *) &req->dst_addr; - req->status = addr_resolve(src_in, dst_in, req->addr); + req->status = addr_resolve(src_in, dst_in, req->addr, + true); if (req->status && time_after_eq(jiffies, req->timeout)) req->status = -ETIMEDOUT; else if (req->status == -ENODATA) @@ -403,7 +472,7 @@ int rdma_resolve_ip(struct rdma_addr_client *client, req->client = client; atomic_inc(&client->refcount); - req->status = addr_resolve(src_in, dst_in, addr); + req->status = addr_resolve(src_in, dst_in, addr, true); switch (req->status) { case 0: req->timeout = jiffies; @@ -425,6 +494,26 @@ err: } EXPORT_SYMBOL(rdma_resolve_ip); +int rdma_resolve_ip_route(struct sockaddr *src_addr, + const struct sockaddr *dst_addr, + struct rdma_dev_addr *addr) +{ + struct sockaddr_storage ssrc_addr = {}; + struct sockaddr *src_in = (struct sockaddr *)&ssrc_addr; + + if (src_addr) { + if (src_addr->sa_family != dst_addr->sa_family) + return -EINVAL; + + memcpy(src_in, src_addr, rdma_addr_size(src_addr)); + } else { + src_in->sa_family = dst_addr->sa_family; + } + + return addr_resolve(src_in, dst_addr, addr, false); +} +EXPORT_SYMBOL(rdma_resolve_ip_route); + void rdma_addr_cancel(struct rdma_dev_addr *addr) { struct addr_req *req, *temp_req; @@ -456,8 +545,10 @@ static void resolve_cb(int status, struct sockaddr *src_addr, complete(&((struct resolve_cb_context *)context)->comp); } -int rdma_addr_find_dmac_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, - u8 *dmac, u16 *vlan_id, int if_index) +int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, + const union ib_gid *dgid, + u8 *dmac, u16 *vlan_id, int *if_index, + int *hoplimit) { int ret = 0; struct rdma_dev_addr dev_addr; @@ -475,7 +566,8 @@ int rdma_addr_find_dmac_by_grh(const union ib_gid *sgid, const union ib_gid *dgi rdma_gid2ip(&dgid_addr._sockaddr, dgid); memset(&dev_addr, 0, sizeof(dev_addr)); - dev_addr.bound_dev_if = if_index; + if (if_index) + 
dev_addr.bound_dev_if = *if_index; dev_addr.net = &init_net; ctx.addr = &dev_addr; @@ -491,12 +583,16 @@ int rdma_addr_find_dmac_by_grh(const union ib_gid *sgid, const union ib_gid *dgi dev = dev_get_by_index(&init_net, dev_addr.bound_dev_if); if (!dev) return -ENODEV; + if (if_index) + *if_index = dev_addr.bound_dev_if; if (vlan_id) *vlan_id = rdma_vlan_dev_vlan_id(dev); + if (hoplimit) + *hoplimit = dev_addr.hoplimit; dev_put(dev); return ret; } -EXPORT_SYMBOL(rdma_addr_find_dmac_by_grh); +EXPORT_SYMBOL(rdma_addr_find_l2_eth_by_grh); int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id) { diff --git a/drivers/infiniband/core/cache.c b/drivers/infiniband/core/cache.c index 89bebeada38b..53343ffbff7a 100644 --- a/drivers/infiniband/core/cache.c +++ b/drivers/infiniband/core/cache.c @@ -64,6 +64,7 @@ enum gid_attr_find_mask { GID_ATTR_FIND_MASK_GID = 1UL << 0, GID_ATTR_FIND_MASK_NETDEV = 1UL << 1, GID_ATTR_FIND_MASK_DEFAULT = 1UL << 2, + GID_ATTR_FIND_MASK_GID_TYPE = 1UL << 3, }; enum gid_table_entry_props { @@ -81,10 +82,6 @@ enum gid_table_write_action { }; struct ib_gid_table_entry { - /* This lock protects an entry from being - * read and written simultaneously. - */ - rwlock_t lock; unsigned long props; union ib_gid gid; struct ib_gid_attr attr; @@ -109,28 +106,86 @@ struct ib_gid_table { * are locked by this lock. **/ struct mutex lock; + /* This lock protects the table entries from being + * read and written simultaneously. + */ + rwlock_t rwlock; struct ib_gid_table_entry *data_vec; }; +static void dispatch_gid_change_event(struct ib_device *ib_dev, u8 port) +{ + if (rdma_cap_roce_gid_table(ib_dev, port)) { + struct ib_event event; + + event.device = ib_dev; + event.element.port_num = port; + event.event = IB_EVENT_GID_CHANGE; + + ib_dispatch_event(&event); + } +} + +static const char * const gid_type_str[] = { + [IB_GID_TYPE_IB] = "IB/RoCE v1", + [IB_GID_TYPE_ROCE_UDP_ENCAP] = "RoCE v2", +}; + +const char *ib_cache_gid_type_str(enum ib_gid_type gid_type) +{ + if (gid_type < ARRAY_SIZE(gid_type_str) && gid_type_str[gid_type]) + return gid_type_str[gid_type]; + + return "Invalid GID type"; +} +EXPORT_SYMBOL(ib_cache_gid_type_str); + +int ib_cache_gid_parse_type_str(const char *buf) +{ + unsigned int i; + size_t len; + int err = -EINVAL; + + len = strlen(buf); + if (len == 0) + return -EINVAL; + + if (buf[len - 1] == '\n') + len--; + + for (i = 0; i < ARRAY_SIZE(gid_type_str); ++i) + if (gid_type_str[i] && !strncmp(buf, gid_type_str[i], len) && + len == strlen(gid_type_str[i])) { + err = i; + break; + } + + return err; +} +EXPORT_SYMBOL(ib_cache_gid_parse_type_str); + +/* This function expects that rwlock will be write locked in all + * scenarios and that lock will be locked in sleep-able (RoCE) + * scenarios. + */ static int write_gid(struct ib_device *ib_dev, u8 port, struct ib_gid_table *table, int ix, const union ib_gid *gid, const struct ib_gid_attr *attr, enum gid_table_write_action action, bool default_gid) + __releases(&table->rwlock) __acquires(&table->rwlock) { int ret = 0; struct net_device *old_net_dev; - unsigned long flags; /* in rdma_cap_roce_gid_table, this function should be protected by a * sleep-able lock.
*/ - write_lock_irqsave(&table->data_vec[ix].lock, flags); if (rdma_cap_roce_gid_table(ib_dev, port)) { table->data_vec[ix].props |= GID_TABLE_ENTRY_INVALID; - write_unlock_irqrestore(&table->data_vec[ix].lock, flags); + write_unlock_irq(&table->rwlock); /* GID_TABLE_WRITE_ACTION_MODIFY currently isn't supported by * RoCE providers and thus only updates the cache. */ @@ -140,7 +195,7 @@ static int write_gid(struct ib_device *ib_dev, u8 port, else if (action == GID_TABLE_WRITE_ACTION_DEL) ret = ib_dev->del_gid(ib_dev, port, ix, &table->data_vec[ix].context); - write_lock_irqsave(&table->data_vec[ix].lock, flags); + write_lock_irq(&table->rwlock); } old_net_dev = table->data_vec[ix].attr.ndev; @@ -162,17 +217,6 @@ static int write_gid(struct ib_device *ib_dev, u8 port, table->data_vec[ix].props &= ~GID_TABLE_ENTRY_INVALID; - write_unlock_irqrestore(&table->data_vec[ix].lock, flags); - - if (!ret && rdma_cap_roce_gid_table(ib_dev, port)) { - struct ib_event event; - - event.device = ib_dev; - event.element.port_num = port; - event.event = IB_EVENT_GID_CHANGE; - - ib_dispatch_event(&event); - } return ret; } @@ -201,41 +245,58 @@ static int del_gid(struct ib_device *ib_dev, u8 port, GID_TABLE_WRITE_ACTION_DEL, default_gid); } +/* rwlock should be read locked */ static int find_gid(struct ib_gid_table *table, const union ib_gid *gid, const struct ib_gid_attr *val, bool default_gid, - unsigned long mask) + unsigned long mask, int *pempty) { - int i; + int i = 0; + int found = -1; + int empty = pempty ? -1 : 0; - for (i = 0; i < table->sz; i++) { - unsigned long flags; - struct ib_gid_attr *attr = &table->data_vec[i].attr; + while (i < table->sz && (found < 0 || empty < 0)) { + struct ib_gid_table_entry *data = &table->data_vec[i]; + struct ib_gid_attr *attr = &data->attr; + int curr_index = i; - read_lock_irqsave(&table->data_vec[i].lock, flags); + i++; - if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID) - goto next; + if (data->props & GID_TABLE_ENTRY_INVALID) + continue; + + if (empty < 0) + if (!memcmp(&data->gid, &zgid, sizeof(*gid)) && + !memcmp(attr, &zattr, sizeof(*attr)) && + !data->props) + empty = curr_index; + + if (found >= 0) + continue; + + if (mask & GID_ATTR_FIND_MASK_GID_TYPE && + attr->gid_type != val->gid_type) + continue; if (mask & GID_ATTR_FIND_MASK_GID && - memcmp(gid, &table->data_vec[i].gid, sizeof(*gid))) - goto next; + memcmp(gid, &data->gid, sizeof(*gid))) + continue; if (mask & GID_ATTR_FIND_MASK_NETDEV && attr->ndev != val->ndev) - goto next; + continue; if (mask & GID_ATTR_FIND_MASK_DEFAULT && - !!(table->data_vec[i].props & GID_TABLE_ENTRY_DEFAULT) != + !!(data->props & GID_TABLE_ENTRY_DEFAULT) != default_gid) - goto next; + continue; - read_unlock_irqrestore(&table->data_vec[i].lock, flags); - return i; -next: - read_unlock_irqrestore(&table->data_vec[i].lock, flags); + found = curr_index; } - return -1; + if (pempty) + *pempty = empty; + + return found; } static void make_default_gid(struct net_device *dev, union ib_gid *gid) @@ -252,6 +313,7 @@ int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, int ix; int ret = 0; struct net_device *idev; + int empty; table = ports_table[port - rdma_start_port(ib_dev)]; @@ -275,22 +337,25 @@ int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, } mutex_lock(&table->lock); + write_lock_irq(&table->rwlock); ix = find_gid(table, gid, attr, false, GID_ATTR_FIND_MASK_GID | - GID_ATTR_FIND_MASK_NETDEV); + GID_ATTR_FIND_MASK_GID_TYPE | + GID_ATTR_FIND_MASK_NETDEV, &empty); if (ix >= 0) goto out_unlock; - ix = 
find_gid(table, &zgid, NULL, false, GID_ATTR_FIND_MASK_GID | - GID_ATTR_FIND_MASK_DEFAULT); - if (ix < 0) { + if (empty < 0) { ret = -ENOSPC; goto out_unlock; } - add_gid(ib_dev, port, table, ix, gid, attr, false); + ret = add_gid(ib_dev, port, table, empty, gid, attr, false); + if (!ret) + dispatch_gid_change_event(ib_dev, port); out_unlock: + write_unlock_irq(&table->rwlock); mutex_unlock(&table->lock); return ret; } @@ -305,17 +370,22 @@ int ib_cache_gid_del(struct ib_device *ib_dev, u8 port, table = ports_table[port - rdma_start_port(ib_dev)]; mutex_lock(&table->lock); + write_lock_irq(&table->rwlock); ix = find_gid(table, gid, attr, false, GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE | GID_ATTR_FIND_MASK_NETDEV | - GID_ATTR_FIND_MASK_DEFAULT); + GID_ATTR_FIND_MASK_DEFAULT, + NULL); if (ix < 0) goto out_unlock; - del_gid(ib_dev, port, table, ix, false); + if (!del_gid(ib_dev, port, table, ix, false)) + dispatch_gid_change_event(ib_dev, port); out_unlock: + write_unlock_irq(&table->rwlock); mutex_unlock(&table->lock); return 0; } @@ -326,16 +396,24 @@ int ib_cache_gid_del_all_netdev_gids(struct ib_device *ib_dev, u8 port, struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; struct ib_gid_table *table; int ix; + bool deleted = false; table = ports_table[port - rdma_start_port(ib_dev)]; mutex_lock(&table->lock); + write_lock_irq(&table->rwlock); for (ix = 0; ix < table->sz; ix++) if (table->data_vec[ix].attr.ndev == ndev) - del_gid(ib_dev, port, table, ix, false); + if (!del_gid(ib_dev, port, table, ix, false)) + deleted = true; + write_unlock_irq(&table->rwlock); mutex_unlock(&table->lock); + + if (deleted) + dispatch_gid_change_event(ib_dev, port); + return 0; } @@ -344,18 +422,14 @@ static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index, { struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; struct ib_gid_table *table; - unsigned long flags; table = ports_table[port - rdma_start_port(ib_dev)]; if (index < 0 || index >= table->sz) return -EINVAL; - read_lock_irqsave(&table->data_vec[index].lock, flags); - if (table->data_vec[index].props & GID_TABLE_ENTRY_INVALID) { - read_unlock_irqrestore(&table->data_vec[index].lock, flags); + if (table->data_vec[index].props & GID_TABLE_ENTRY_INVALID) return -EAGAIN; - } memcpy(gid, &table->data_vec[index].gid, sizeof(*gid)); if (attr) { @@ -364,7 +438,6 @@ static int __ib_cache_gid_get(struct ib_device *ib_dev, u8 port, int index, dev_hold(attr->ndev); } - read_unlock_irqrestore(&table->data_vec[index].lock, flags); return 0; } @@ -378,17 +451,21 @@ static int _ib_cache_gid_table_find(struct ib_device *ib_dev, struct ib_gid_table *table; u8 p; int local_index; + unsigned long flags; for (p = 0; p < ib_dev->phys_port_cnt; p++) { table = ports_table[p]; - local_index = find_gid(table, gid, val, false, mask); + read_lock_irqsave(&table->rwlock, flags); + local_index = find_gid(table, gid, val, false, mask, NULL); if (local_index >= 0) { if (index) *index = local_index; if (port) *port = p + rdma_start_port(ib_dev); + read_unlock_irqrestore(&table->rwlock, flags); return 0; } + read_unlock_irqrestore(&table->rwlock, flags); } return -ENOENT; @@ -396,11 +473,13 @@ static int _ib_cache_gid_table_find(struct ib_device *ib_dev, static int ib_cache_gid_find(struct ib_device *ib_dev, const union ib_gid *gid, + enum ib_gid_type gid_type, struct net_device *ndev, u8 *port, u16 *index) { - unsigned long mask = GID_ATTR_FIND_MASK_GID; - struct ib_gid_attr gid_attr_val = {.ndev = ndev}; + unsigned long mask = 
GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE; + struct ib_gid_attr gid_attr_val = {.ndev = ndev, .gid_type = gid_type}; if (ndev) mask |= GID_ATTR_FIND_MASK_NETDEV; @@ -411,14 +490,17 @@ static int ib_cache_gid_find(struct ib_device *ib_dev, int ib_find_cached_gid_by_port(struct ib_device *ib_dev, const union ib_gid *gid, + enum ib_gid_type gid_type, u8 port, struct net_device *ndev, u16 *index) { int local_index; struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; struct ib_gid_table *table; - unsigned long mask = GID_ATTR_FIND_MASK_GID; - struct ib_gid_attr val = {.ndev = ndev}; + unsigned long mask = GID_ATTR_FIND_MASK_GID | + GID_ATTR_FIND_MASK_GID_TYPE; + struct ib_gid_attr val = {.ndev = ndev, .gid_type = gid_type}; + unsigned long flags; if (port < rdma_start_port(ib_dev) || port > rdma_end_port(ib_dev)) @@ -429,13 +511,16 @@ int ib_find_cached_gid_by_port(struct ib_device *ib_dev, if (ndev) mask |= GID_ATTR_FIND_MASK_NETDEV; - local_index = find_gid(table, gid, &val, false, mask); + read_lock_irqsave(&table->rwlock, flags); + local_index = find_gid(table, gid, &val, false, mask, NULL); if (local_index >= 0) { if (index) *index = local_index; + read_unlock_irqrestore(&table->rwlock, flags); return 0; } + read_unlock_irqrestore(&table->rwlock, flags); return -ENOENT; } EXPORT_SYMBOL(ib_find_cached_gid_by_port); @@ -472,6 +557,7 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev, struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; struct ib_gid_table *table; unsigned int i; + unsigned long flags; bool found = false; if (!ports_table) @@ -484,11 +570,10 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev, table = ports_table[port - rdma_start_port(ib_dev)]; + read_lock_irqsave(&table->rwlock, flags); for (i = 0; i < table->sz; i++) { struct ib_gid_attr attr; - unsigned long flags; - read_lock_irqsave(&table->data_vec[i].lock, flags); if (table->data_vec[i].props & GID_TABLE_ENTRY_INVALID) goto next; @@ -501,11 +586,10 @@ static int ib_cache_gid_find_by_filter(struct ib_device *ib_dev, found = true; next: - read_unlock_irqrestore(&table->data_vec[i].lock, flags); - if (found) break; } + read_unlock_irqrestore(&table->rwlock, flags); if (!found) return -ENOENT; @@ -517,9 +601,9 @@ next: static struct ib_gid_table *alloc_gid_table(int sz) { - unsigned int i; struct ib_gid_table *table = kzalloc(sizeof(struct ib_gid_table), GFP_KERNEL); + if (!table) return NULL; @@ -530,9 +614,7 @@ static struct ib_gid_table *alloc_gid_table(int sz) mutex_init(&table->lock); table->sz = sz; - - for (i = 0; i < sz; i++) - rwlock_init(&table->data_vec[i].lock); + rwlock_init(&table->rwlock); return table; @@ -553,30 +635,37 @@ static void cleanup_gid_table_port(struct ib_device *ib_dev, u8 port, struct ib_gid_table *table) { int i; + bool deleted = false; if (!table) return; + write_lock_irq(&table->rwlock); for (i = 0; i < table->sz; ++i) { if (memcmp(&table->data_vec[i].gid, &zgid, sizeof(table->data_vec[i].gid))) - del_gid(ib_dev, port, table, i, - table->data_vec[i].props & - GID_ATTR_FIND_MASK_DEFAULT); + if (!del_gid(ib_dev, port, table, i, + table->data_vec[i].props & + GID_ATTR_FIND_MASK_DEFAULT)) + deleted = true; } + write_unlock_irq(&table->rwlock); + + if (deleted) + dispatch_gid_change_event(ib_dev, port); } void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, struct net_device *ndev, + unsigned long gid_type_mask, enum ib_cache_gid_default_mode mode) { struct ib_gid_table **ports_table = ib_dev->cache.gid_cache; union 
ib_gid gid; struct ib_gid_attr gid_attr; + struct ib_gid_attr zattr_type = zattr; struct ib_gid_table *table; - int ix; - union ib_gid current_gid; - struct ib_gid_attr current_gid_attr = {}; + unsigned int gid_type; table = ports_table[port - rdma_start_port(ib_dev)]; @@ -584,46 +673,82 @@ void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, memset(&gid_attr, 0, sizeof(gid_attr)); gid_attr.ndev = ndev; - mutex_lock(&table->lock); - ix = find_gid(table, NULL, NULL, true, GID_ATTR_FIND_MASK_DEFAULT); - - /* Couldn't find default GID location */ - WARN_ON(ix < 0); - - if (!__ib_cache_gid_get(ib_dev, port, ix, - &current_gid, &current_gid_attr) && - mode == IB_CACHE_GID_DEFAULT_MODE_SET && - !memcmp(&gid, &current_gid, sizeof(gid)) && - !memcmp(&gid_attr, &current_gid_attr, sizeof(gid_attr))) - goto unlock; - - if ((memcmp(&current_gid, &zgid, sizeof(current_gid)) || - memcmp(&current_gid_attr, &zattr, - sizeof(current_gid_attr))) && - del_gid(ib_dev, port, table, ix, true)) { - pr_warn("ib_cache_gid: can't delete index %d for default gid %pI6\n", - ix, gid.raw); - goto unlock; - } + for (gid_type = 0; gid_type < IB_GID_TYPE_SIZE; ++gid_type) { + int ix; + union ib_gid current_gid; + struct ib_gid_attr current_gid_attr = {}; + + if (1UL << gid_type & ~gid_type_mask) + continue; + + gid_attr.gid_type = gid_type; + + mutex_lock(&table->lock); + write_lock_irq(&table->rwlock); + ix = find_gid(table, NULL, &gid_attr, true, + GID_ATTR_FIND_MASK_GID_TYPE | + GID_ATTR_FIND_MASK_DEFAULT, + NULL); + + /* Couldn't find default GID location */ + WARN_ON(ix < 0); + + zattr_type.gid_type = gid_type; + + if (!__ib_cache_gid_get(ib_dev, port, ix, + &current_gid, &current_gid_attr) && + mode == IB_CACHE_GID_DEFAULT_MODE_SET && + !memcmp(&gid, &current_gid, sizeof(gid)) && + !memcmp(&gid_attr, &current_gid_attr, sizeof(gid_attr))) + goto release; + + if (memcmp(&current_gid, &zgid, sizeof(current_gid)) || + memcmp(&current_gid_attr, &zattr_type, + sizeof(current_gid_attr))) { + if (del_gid(ib_dev, port, table, ix, true)) { + pr_warn("ib_cache_gid: can't delete index %d for default gid %pI6\n", + ix, gid.raw); + goto release; + } else { + dispatch_gid_change_event(ib_dev, port); + } + } - if (mode == IB_CACHE_GID_DEFAULT_MODE_SET) - if (add_gid(ib_dev, port, table, ix, &gid, &gid_attr, true)) - pr_warn("ib_cache_gid: unable to add default gid %pI6\n", - gid.raw); + if (mode == IB_CACHE_GID_DEFAULT_MODE_SET) { + if (add_gid(ib_dev, port, table, ix, &gid, &gid_attr, true)) + pr_warn("ib_cache_gid: unable to add default gid %pI6\n", + gid.raw); + else + dispatch_gid_change_event(ib_dev, port); + } -unlock: - if (current_gid_attr.ndev) - dev_put(current_gid_attr.ndev); - mutex_unlock(&table->lock); +release: + if (current_gid_attr.ndev) + dev_put(current_gid_attr.ndev); + write_unlock_irq(&table->rwlock); + mutex_unlock(&table->lock); + } } static int gid_table_reserve_default(struct ib_device *ib_dev, u8 port, struct ib_gid_table *table) { - if (rdma_protocol_roce(ib_dev, port)) { - struct ib_gid_table_entry *entry = &table->data_vec[0]; + unsigned int i; + unsigned long roce_gid_type_mask; + unsigned int num_default_gids; + unsigned int current_gid = 0; + + roce_gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + num_default_gids = hweight_long(roce_gid_type_mask); + for (i = 0; i < num_default_gids && i < table->sz; i++) { + struct ib_gid_table_entry *entry = + &table->data_vec[i]; entry->props |= GID_TABLE_ENTRY_DEFAULT; + current_gid = find_next_bit(&roce_gid_type_mask, + BITS_PER_LONG, + current_gid); + entry->attr.gid_type = current_gid++; } return 0; @@ 
-728,20 +853,30 @@ int ib_get_cached_gid(struct ib_device *device, union ib_gid *gid, struct ib_gid_attr *gid_attr) { + int res; + unsigned long flags; + struct ib_gid_table **ports_table = device->cache.gid_cache; + struct ib_gid_table *table = ports_table[port_num - rdma_start_port(device)]; + if (port_num < rdma_start_port(device) || port_num > rdma_end_port(device)) return -EINVAL; - return __ib_cache_gid_get(device, port_num, index, gid, gid_attr); + read_lock_irqsave(&table->rwlock, flags); + res = __ib_cache_gid_get(device, port_num, index, gid, gid_attr); + read_unlock_irqrestore(&table->rwlock, flags); + + return res; } EXPORT_SYMBOL(ib_get_cached_gid); int ib_find_cached_gid(struct ib_device *device, const union ib_gid *gid, + enum ib_gid_type gid_type, struct net_device *ndev, u8 *port_num, u16 *index) { - return ib_cache_gid_find(device, gid, ndev, port_num, index); + return ib_cache_gid_find(device, gid, gid_type, ndev, port_num, index); } EXPORT_SYMBOL(ib_find_cached_gid); @@ -956,10 +1091,12 @@ static void ib_cache_update(struct ib_device *device, device->cache.pkey_cache[port - rdma_start_port(device)] = pkey_cache; if (!use_roce_gid_table) { + write_lock(&table->rwlock); for (i = 0; i < gid_cache->table_len; i++) { modify_gid(device, port, table, i, gid_cache->table + i, &zattr, false); } + write_unlock(&table->rwlock); } device->cache.lmc_cache[port - rdma_start_port(device)] = tprops->lmc; diff --git a/drivers/infiniband/core/cm.c b/drivers/infiniband/core/cm.c index 0a26dd6d9b19..1d92e091e22e 100644 --- a/drivers/infiniband/core/cm.c +++ b/drivers/infiniband/core/cm.c @@ -364,7 +364,7 @@ static int cm_init_av_by_path(struct ib_sa_path_rec *path, struct cm_av *av) read_lock_irqsave(&cm.device_lock, flags); list_for_each_entry(cm_dev, &cm.device_list, list) { if (!ib_find_cached_gid(cm_dev->ib_device, &path->sgid, - ndev, &p, NULL)) { + path->gid_type, ndev, &p, NULL)) { port = cm_dev->port[p-1]; break; } @@ -782,11 +782,11 @@ static void cm_enter_timewait(struct cm_id_private *cm_id_priv) wait_time = cm_convert_to_ms(cm_id_priv->av.timeout); /* Check if the device started its remove_one */ - spin_lock_irq(&cm.lock); + spin_lock_irqsave(&cm.lock, flags); if (!cm_dev->going_down) queue_delayed_work(cm.wq, &cm_id_priv->timewait_info->work.work, msecs_to_jiffies(wait_time)); - spin_unlock_irq(&cm.lock); + spin_unlock_irqrestore(&cm.lock, flags); cm_id_priv->timewait_info = NULL; } @@ -1600,6 +1600,8 @@ static int cm_req_handler(struct cm_work *work) struct ib_cm_id *cm_id; struct cm_id_private *cm_id_priv, *listen_cm_id_priv; struct cm_req_msg *req_msg; + union ib_gid gid; + struct ib_gid_attr gid_attr; int ret; req_msg = (struct cm_req_msg *)work->mad_recv_wc->recv_buf.mad; @@ -1639,11 +1641,31 @@ static int cm_req_handler(struct cm_work *work) cm_format_paths_from_req(req_msg, &work->path[0], &work->path[1]); memcpy(work->path[0].dmac, cm_id_priv->av.ah_attr.dmac, ETH_ALEN); - ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av); + work->path[0].hop_limit = cm_id_priv->av.ah_attr.grh.hop_limit; + ret = ib_get_cached_gid(work->port->cm_dev->ib_device, + work->port->port_num, + cm_id_priv->av.ah_attr.grh.sgid_index, + &gid, &gid_attr); + if (!ret) { + if (gid_attr.ndev) { + work->path[0].ifindex = gid_attr.ndev->ifindex; + work->path[0].net = dev_net(gid_attr.ndev); + dev_put(gid_attr.ndev); + } + work->path[0].gid_type = gid_attr.gid_type; + ret = cm_init_av_by_path(&work->path[0], &cm_id_priv->av); + } if (ret) { - ib_get_cached_gid(work->port->cm_dev->ib_device, - 
work->port->port_num, 0, &work->path[0].sgid, - NULL); + int err = ib_get_cached_gid(work->port->cm_dev->ib_device, + work->port->port_num, 0, + &work->path[0].sgid, + &gid_attr); + if (!err && gid_attr.ndev) { + work->path[0].ifindex = gid_attr.ndev->ifindex; + work->path[0].net = dev_net(gid_attr.ndev); + dev_put(gid_attr.ndev); + } + work->path[0].gid_type = gid_attr.gid_type; ib_send_cm_rej(cm_id, IB_CM_REJ_INVALID_GID, &work->path[0].sgid, sizeof work->path[0].sgid, NULL, 0); @@ -3482,6 +3504,7 @@ int ib_cm_notify(struct ib_cm_id *cm_id, enum ib_event_type event) EXPORT_SYMBOL(ib_cm_notify); static void cm_recv_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, struct ib_mad_recv_wc *mad_recv_wc) { struct cm_port *port = mad_agent->context; @@ -3731,16 +3754,6 @@ int ib_cm_init_qp_attr(struct ib_cm_id *cm_id, } EXPORT_SYMBOL(ib_cm_init_qp_attr); -static void cm_get_ack_delay(struct cm_device *cm_dev) -{ - struct ib_device_attr attr; - - if (ib_query_device(cm_dev->ib_device, &attr)) - cm_dev->ack_delay = 0; /* acks will rely on packet life time */ - else - cm_dev->ack_delay = attr.local_ca_ack_delay; -} - static ssize_t cm_show_counter(struct kobject *obj, struct attribute *attr, char *buf) { @@ -3852,7 +3865,7 @@ static void cm_add_one(struct ib_device *ib_device) return; cm_dev->ib_device = ib_device; - cm_get_ack_delay(cm_dev); + cm_dev->ack_delay = ib_device->attrs.local_ca_ack_delay; cm_dev->going_down = 0; cm_dev->device = device_create(&cm_class, &ib_device->dev, MKDEV(0, 0), NULL, diff --git a/drivers/infiniband/core/cma.c b/drivers/infiniband/core/cma.c index 2d762a2ecd81..9729639df407 100644 --- a/drivers/infiniband/core/cma.c +++ b/drivers/infiniband/core/cma.c @@ -38,6 +38,7 @@ #include <linux/in6.h> #include <linux/mutex.h> #include <linux/random.h> +#include <linux/igmp.h> #include <linux/idr.h> #include <linux/inetdevice.h> #include <linux/slab.h> @@ -60,6 +61,8 @@ #include <rdma/ib_sa.h> #include <rdma/iw_cm.h> +#include "core_priv.h" + MODULE_AUTHOR("Sean Hefty"); MODULE_DESCRIPTION("Generic RDMA CM Agent"); MODULE_LICENSE("Dual BSD/GPL"); @@ -150,6 +153,7 @@ struct cma_device { struct completion comp; atomic_t refcount; struct list_head id_list; + enum ib_gid_type *default_gid_type; }; struct rdma_bind_list { @@ -185,6 +189,67 @@ enum { CMA_OPTION_AFONLY, }; +void cma_ref_dev(struct cma_device *cma_dev) +{ + atomic_inc(&cma_dev->refcount); +} + +struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, + void *cookie) +{ + struct cma_device *cma_dev; + struct cma_device *found_cma_dev = NULL; + + mutex_lock(&lock); + + list_for_each_entry(cma_dev, &dev_list, list) + if (filter(cma_dev->device, cookie)) { + found_cma_dev = cma_dev; + break; + } + + if (found_cma_dev) + cma_ref_dev(found_cma_dev); + mutex_unlock(&lock); + return found_cma_dev; +} + +int cma_get_default_gid_type(struct cma_device *cma_dev, + unsigned int port) +{ + if (port < rdma_start_port(cma_dev->device) || + port > rdma_end_port(cma_dev->device)) + return -EINVAL; + + return cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)]; +} + +int cma_set_default_gid_type(struct cma_device *cma_dev, + unsigned int port, + enum ib_gid_type default_gid_type) +{ + unsigned long supported_gids; + + if (port < rdma_start_port(cma_dev->device) || + port > rdma_end_port(cma_dev->device)) + return -EINVAL; + + supported_gids = roce_gid_type_mask_support(cma_dev->device, port); + + if (!(supported_gids & 1 << default_gid_type)) + return -EINVAL; + + 
cma_dev->default_gid_type[port - rdma_start_port(cma_dev->device)] = + default_gid_type; + + return 0; +} + +struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev) +{ + return cma_dev->device; +} + /* * Device removal can occur at anytime, so we need extra handling to * serialize notifying the user of device removal with other callbacks. @@ -228,6 +293,7 @@ struct rdma_id_private { u8 tos; u8 reuseaddr; u8 afonly; + enum ib_gid_type gid_type; }; struct cma_multicast { @@ -239,6 +305,7 @@ struct cma_multicast { void *context; struct sockaddr_storage addr; struct kref mcref; + bool igmp_joined; }; struct cma_work { @@ -335,18 +402,48 @@ static inline void cma_set_ip_ver(struct cma_hdr *hdr, u8 ip_ver) hdr->ip_version = (ip_ver << 4) | (hdr->ip_version & 0xF); } -static void cma_attach_to_dev(struct rdma_id_private *id_priv, - struct cma_device *cma_dev) +static int cma_igmp_send(struct net_device *ndev, union ib_gid *mgid, bool join) { - atomic_inc(&cma_dev->refcount); + struct in_device *in_dev = NULL; + + if (ndev) { + rtnl_lock(); + in_dev = __in_dev_get_rtnl(ndev); + if (in_dev) { + if (join) + ip_mc_inc_group(in_dev, + *(__be32 *)(mgid->raw + 12)); + else + ip_mc_dec_group(in_dev, + *(__be32 *)(mgid->raw + 12)); + } + rtnl_unlock(); + } + return (in_dev) ? 0 : -ENODEV; +} + +static void _cma_attach_to_dev(struct rdma_id_private *id_priv, + struct cma_device *cma_dev) +{ + cma_ref_dev(cma_dev); id_priv->cma_dev = cma_dev; + id_priv->gid_type = 0; id_priv->id.device = cma_dev->device; id_priv->id.route.addr.dev_addr.transport = rdma_node_get_transport(cma_dev->device->node_type); list_add_tail(&id_priv->list, &cma_dev->id_list); } -static inline void cma_deref_dev(struct cma_device *cma_dev) +static void cma_attach_to_dev(struct rdma_id_private *id_priv, + struct cma_device *cma_dev) +{ + _cma_attach_to_dev(id_priv, cma_dev); + id_priv->gid_type = + cma_dev->default_gid_type[id_priv->id.port_num - + rdma_start_port(cma_dev->device)]; +} + +void cma_deref_dev(struct cma_device *cma_dev) { if (atomic_dec_and_test(&cma_dev->refcount)) complete(&cma_dev->comp); @@ -441,6 +538,7 @@ static int cma_translate_addr(struct sockaddr *addr, struct rdma_dev_addr *dev_a } static inline int cma_validate_port(struct ib_device *device, u8 port, + enum ib_gid_type gid_type, union ib_gid *gid, int dev_type, int bound_if_index) { @@ -453,10 +551,25 @@ static inline int cma_validate_port(struct ib_device *device, u8 port, if ((dev_type != ARPHRD_INFINIBAND) && rdma_protocol_ib(device, port)) return ret; - if (dev_type == ARPHRD_ETHER) + if (dev_type == ARPHRD_ETHER && rdma_protocol_roce(device, port)) { ndev = dev_get_by_index(&init_net, bound_if_index); + if (ndev && ndev->flags & IFF_LOOPBACK) { + pr_info("detected loopback device\n"); + dev_put(ndev); - ret = ib_find_cached_gid_by_port(device, gid, port, ndev, NULL); + if (!device->get_netdev) + return -EOPNOTSUPP; + + ndev = device->get_netdev(device, port); + if (!ndev) + return -ENODEV; + } + } else { + gid_type = IB_GID_TYPE_IB; + } + + ret = ib_find_cached_gid_by_port(device, gid, gid_type, port, + ndev, NULL); if (ndev) dev_put(ndev); @@ -490,7 +603,10 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv, gidp = rdma_protocol_roce(cma_dev->device, port) ? &iboe_gid : &gid; - ret = cma_validate_port(cma_dev->device, port, gidp, + ret = cma_validate_port(cma_dev->device, port, + rdma_protocol_ib(cma_dev->device, port) ? 
+ IB_GID_TYPE_IB : + listen_id_priv->gid_type, gidp, dev_addr->dev_type, dev_addr->bound_dev_if); if (!ret) { @@ -509,8 +625,11 @@ static int cma_acquire_dev(struct rdma_id_private *id_priv, gidp = rdma_protocol_roce(cma_dev->device, port) ? &iboe_gid : &gid; - ret = cma_validate_port(cma_dev->device, port, gidp, - dev_addr->dev_type, + ret = cma_validate_port(cma_dev->device, port, + rdma_protocol_ib(cma_dev->device, port) ? + IB_GID_TYPE_IB : + cma_dev->default_gid_type[port - 1], + gidp, dev_addr->dev_type, dev_addr->bound_dev_if); if (!ret) { id_priv->id.port_num = port; @@ -1437,8 +1556,24 @@ static void cma_leave_mc_groups(struct rdma_id_private *id_priv) id_priv->id.port_num)) { ib_sa_free_multicast(mc->multicast.ib); kfree(mc); - } else + } else { + if (mc->igmp_joined) { + struct rdma_dev_addr *dev_addr = + &id_priv->id.route.addr.dev_addr; + struct net_device *ndev = NULL; + + if (dev_addr->bound_dev_if) + ndev = dev_get_by_index(&init_net, + dev_addr->bound_dev_if); + if (ndev) { + cma_igmp_send(ndev, + &mc->multicast.ib->rec.mgid, + false); + dev_put(ndev); + } + } kref_put(&mc->mcref, release_mc); + } } } @@ -1896,7 +2031,6 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, struct rdma_id_private *listen_id, *conn_id; struct rdma_cm_event event; int ret; - struct ib_device_attr attr; struct sockaddr *laddr = (struct sockaddr *)&iw_event->local_addr; struct sockaddr *raddr = (struct sockaddr *)&iw_event->remote_addr; @@ -1938,13 +2072,6 @@ static int iw_conn_req_handler(struct iw_cm_id *cm_id, memcpy(cma_src_addr(conn_id), laddr, rdma_addr_size(laddr)); memcpy(cma_dst_addr(conn_id), raddr, rdma_addr_size(raddr)); - ret = ib_query_device(conn_id->id.device, &attr); - if (ret) { - mutex_unlock(&conn_id->handler_mutex); - rdma_destroy_id(new_cm_id); - goto out; - } - memset(&event, 0, sizeof event); event.event = RDMA_CM_EVENT_CONNECT_REQUEST; event.param.conn.private_data = iw_event->private_data; @@ -2051,7 +2178,7 @@ static void cma_listen_on_dev(struct rdma_id_private *id_priv, memcpy(cma_src_addr(dev_id_priv), cma_src_addr(id_priv), rdma_addr_size(cma_src_addr(id_priv))); - cma_attach_to_dev(dev_id_priv, cma_dev); + _cma_attach_to_dev(dev_id_priv, cma_dev); list_add_tail(&dev_id_priv->listen_list, &id_priv->listen_list); atomic_inc(&id_priv->refcount); dev_id_priv->internal_id = 1; @@ -2321,8 +2448,23 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) if (addr->dev_addr.bound_dev_if) { ndev = dev_get_by_index(&init_net, addr->dev_addr.bound_dev_if); + if (!ndev) + return -ENODEV; + + if (ndev->flags & IFF_LOOPBACK) { + dev_put(ndev); + if (!id_priv->id.device->get_netdev) + return -EOPNOTSUPP; + + ndev = id_priv->id.device->get_netdev(id_priv->id.device, + id_priv->id.port_num); + if (!ndev) + return -ENODEV; + } + route->path_rec->net = &init_net; - route->path_rec->ifindex = addr->dev_addr.bound_dev_if; + route->path_rec->ifindex = ndev->ifindex; + route->path_rec->gid_type = id_priv->gid_type; } if (!ndev) { ret = -ENODEV; @@ -2336,7 +2478,14 @@ static int cma_resolve_iboe_route(struct rdma_id_private *id_priv) rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.dst_addr, &route->path_rec->dgid); - route->path_rec->hop_limit = 1; + /* Use the hint from IP Stack to select GID Type */ + if (route->path_rec->gid_type < ib_network_to_gid_type(addr->dev_addr.network)) + route->path_rec->gid_type = ib_network_to_gid_type(addr->dev_addr.network); + if (((struct sockaddr *)&id_priv->id.route.addr.dst_addr)->sa_family != AF_IB) + /* TODO: get the 
hoplimit from the inet/inet6 device */ + route->path_rec->hop_limit = addr->dev_addr.hoplimit; + else + route->path_rec->hop_limit = 1; route->path_rec->reversible = 1; route->path_rec->pkey = cpu_to_be16(0xffff); route->path_rec->mtu_selector = IB_SA_EQ; @@ -3534,12 +3683,23 @@ static int cma_ib_mc_handler(int status, struct ib_sa_multicast *multicast) event.status = status; event.param.ud.private_data = mc->context; if (!status) { + struct rdma_dev_addr *dev_addr = + &id_priv->id.route.addr.dev_addr; + struct net_device *ndev = + dev_get_by_index(&init_net, dev_addr->bound_dev_if); + enum ib_gid_type gid_type = + id_priv->cma_dev->default_gid_type[id_priv->id.port_num - + rdma_start_port(id_priv->cma_dev->device)]; + event.event = RDMA_CM_EVENT_MULTICAST_JOIN; ib_init_ah_from_mcmember(id_priv->id.device, id_priv->id.port_num, &multicast->rec, + ndev, gid_type, &event.param.ud.ah_attr); event.param.ud.qp_num = 0xFFFFFF; event.param.ud.qkey = be32_to_cpu(multicast->rec.qkey); + if (ndev) + dev_put(ndev); } else event.event = RDMA_CM_EVENT_MULTICAST_ERROR; @@ -3672,9 +3832,10 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, { struct iboe_mcast_work *work; struct rdma_dev_addr *dev_addr = &id_priv->id.route.addr.dev_addr; - int err; + int err = 0; struct sockaddr *addr = (struct sockaddr *)&mc->addr; struct net_device *ndev = NULL; + enum ib_gid_type gid_type; if (cma_zero_addr((struct sockaddr *)&mc->addr)) return -EINVAL; @@ -3704,9 +3865,25 @@ static int cma_iboe_join_multicast(struct rdma_id_private *id_priv, mc->multicast.ib->rec.rate = iboe_get_rate(ndev); mc->multicast.ib->rec.hop_limit = 1; mc->multicast.ib->rec.mtu = iboe_get_mtu(ndev->mtu); + + gid_type = id_priv->cma_dev->default_gid_type[id_priv->id.port_num - + rdma_start_port(id_priv->cma_dev->device)]; + if (addr->sa_family == AF_INET) { + if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) + err = cma_igmp_send(ndev, &mc->multicast.ib->rec.mgid, + true); + if (!err) { + mc->igmp_joined = true; + mc->multicast.ib->rec.hop_limit = IPV6_DEFAULT_HOPLIMIT; + } + } else { + if (gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) + err = -ENOTSUPP; + } dev_put(ndev); - if (!mc->multicast.ib->rec.mtu) { - err = -EINVAL; + if (err || !mc->multicast.ib->rec.mtu) { + if (!err) + err = -EINVAL; goto out2; } rdma_ip2gid((struct sockaddr *)&id_priv->id.route.addr.src_addr, @@ -3745,7 +3922,7 @@ int rdma_join_multicast(struct rdma_cm_id *id, struct sockaddr *addr, memcpy(&mc->addr, addr, rdma_addr_size(addr)); mc->context = context; mc->id_priv = id_priv; - + mc->igmp_joined = false; spin_lock(&id_priv->lock); list_add(&mc->list, &id_priv->mc_list); spin_unlock(&id_priv->lock); @@ -3790,9 +3967,25 @@ void rdma_leave_multicast(struct rdma_cm_id *id, struct sockaddr *addr) if (rdma_cap_ib_mcast(id->device, id->port_num)) { ib_sa_free_multicast(mc->multicast.ib); kfree(mc); - } else if (rdma_protocol_roce(id->device, id->port_num)) + } else if (rdma_protocol_roce(id->device, id->port_num)) { + if (mc->igmp_joined) { + struct rdma_dev_addr *dev_addr = + &id->route.addr.dev_addr; + struct net_device *ndev = NULL; + + if (dev_addr->bound_dev_if) + ndev = dev_get_by_index(&init_net, + dev_addr->bound_dev_if); + if (ndev) { + cma_igmp_send(ndev, + &mc->multicast.ib->rec.mgid, + false); + dev_put(ndev); + } + mc->igmp_joined = false; + } kref_put(&mc->mcref, release_mc); - + } return; } } @@ -3861,12 +4054,27 @@ static void cma_add_one(struct ib_device *device) { struct cma_device *cma_dev; struct rdma_id_private *id_priv; + unsigned int i; + 
unsigned long supported_gids = 0; cma_dev = kmalloc(sizeof *cma_dev, GFP_KERNEL); if (!cma_dev) return; cma_dev->device = device; + cma_dev->default_gid_type = kcalloc(device->phys_port_cnt, + sizeof(*cma_dev->default_gid_type), + GFP_KERNEL); + if (!cma_dev->default_gid_type) { + kfree(cma_dev); + return; + } + for (i = rdma_start_port(device); i <= rdma_end_port(device); i++) { + supported_gids = roce_gid_type_mask_support(device, i); + WARN_ON(!supported_gids); + cma_dev->default_gid_type[i - rdma_start_port(device)] = + find_first_bit(&supported_gids, BITS_PER_LONG); + } init_completion(&cma_dev->comp); atomic_set(&cma_dev->refcount, 1); @@ -3946,6 +4154,7 @@ static void cma_remove_one(struct ib_device *device, void *client_data) mutex_unlock(&lock); cma_process_remove(cma_dev); + kfree(cma_dev->default_gid_type); kfree(cma_dev); } @@ -4079,6 +4288,7 @@ static int __init cma_init(void) if (ibnl_add_client(RDMA_NL_RDMA_CM, RDMA_NL_RDMA_CM_NUM_OPS, cma_cb_table)) printk(KERN_WARNING "RDMA CMA: failed to add netlink callback\n"); + cma_configfs_init(); return 0; @@ -4093,6 +4303,7 @@ err_wq: static void __exit cma_cleanup(void) { + cma_configfs_exit(); ibnl_remove_client(RDMA_NL_RDMA_CM); ib_unregister_client(&cma_client); unregister_netdevice_notifier(&cma_nb); diff --git a/drivers/infiniband/core/cma_configfs.c b/drivers/infiniband/core/cma_configfs.c new file mode 100644 index 000000000000..18b112aa577e --- /dev/null +++ b/drivers/infiniband/core/cma_configfs.c @@ -0,0 +1,321 @@ +/* + * Copyright (c) 2015, Mellanox Technologies inc. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <linux/module.h> +#include <linux/configfs.h> +#include <rdma/ib_verbs.h> +#include "core_priv.h" + +struct cma_device; + +struct cma_dev_group; + +struct cma_dev_port_group { + unsigned int port_num; + struct cma_dev_group *cma_dev_group; + struct config_group group; +}; + +struct cma_dev_group { + char name[IB_DEVICE_NAME_MAX]; + struct config_group device_group; + struct config_group ports_group; + struct config_group *default_dev_group[2]; + struct config_group **default_ports_group; + struct cma_dev_port_group *ports; +}; + +static struct cma_dev_port_group *to_dev_port_group(struct config_item *item) +{ + struct config_group *group; + + if (!item) + return NULL; + + group = container_of(item, struct config_group, cg_item); + return container_of(group, struct cma_dev_port_group, group); +} + +static bool filter_by_name(struct ib_device *ib_dev, void *cookie) +{ + return !strcmp(ib_dev->name, cookie); +} + +static int cma_configfs_params_get(struct config_item *item, + struct cma_device **pcma_dev, + struct cma_dev_port_group **pgroup) +{ + struct cma_dev_port_group *group = to_dev_port_group(item); + struct cma_device *cma_dev; + + if (!group) + return -ENODEV; + + cma_dev = cma_enum_devices_by_ibdev(filter_by_name, + group->cma_dev_group->name); + if (!cma_dev) + return -ENODEV; + + *pcma_dev = cma_dev; + *pgroup = group; + + return 0; +} + +static void cma_configfs_params_put(struct cma_device *cma_dev) +{ + cma_deref_dev(cma_dev); +} + +static ssize_t default_roce_mode_show(struct config_item *item, + char *buf) +{ + struct cma_device *cma_dev; + struct cma_dev_port_group *group; + int gid_type; + ssize_t ret; + + ret = cma_configfs_params_get(item, &cma_dev, &group); + if (ret) + return ret; + + gid_type = cma_get_default_gid_type(cma_dev, group->port_num); + cma_configfs_params_put(cma_dev); + + if (gid_type < 0) + return gid_type; + + return sprintf(buf, "%s\n", ib_cache_gid_type_str(gid_type)); +} + +static ssize_t default_roce_mode_store(struct config_item *item, + const char *buf, size_t count) +{ + struct cma_device *cma_dev; + struct cma_dev_port_group *group; + int gid_type = ib_cache_gid_parse_type_str(buf); + ssize_t ret; + + if (gid_type < 0) + return -EINVAL; + + ret = cma_configfs_params_get(item, &cma_dev, &group); + if (ret) + return ret; + + ret = cma_set_default_gid_type(cma_dev, group->port_num, gid_type); + + cma_configfs_params_put(cma_dev); + + return !ret ? 
strnlen(buf, count) : ret; +} + +CONFIGFS_ATTR(, default_roce_mode); + +static struct configfs_attribute *cma_configfs_attributes[] = { + &attr_default_roce_mode, + NULL, +}; + +static struct config_item_type cma_port_group_type = { + .ct_attrs = cma_configfs_attributes, + .ct_owner = THIS_MODULE +}; + +static int make_cma_ports(struct cma_dev_group *cma_dev_group, + struct cma_device *cma_dev) +{ + struct ib_device *ibdev; + unsigned int i; + unsigned int ports_num; + struct cma_dev_port_group *ports; + struct config_group **ports_group; + int err; + + ibdev = cma_get_ib_dev(cma_dev); + + if (!ibdev) + return -ENODEV; + + ports_num = ibdev->phys_port_cnt; + ports = kcalloc(ports_num, sizeof(*cma_dev_group->ports), + GFP_KERNEL); + ports_group = kcalloc(ports_num + 1, sizeof(*ports_group), GFP_KERNEL); + + if (!ports || !ports_group) { + err = -ENOMEM; + goto free; + } + + for (i = 0; i < ports_num; i++) { + char port_str[10]; + + ports[i].port_num = i + 1; + snprintf(port_str, sizeof(port_str), "%u", i + 1); + ports[i].cma_dev_group = cma_dev_group; + config_group_init_type_name(&ports[i].group, + port_str, + &cma_port_group_type); + ports_group[i] = &ports[i].group; + } + ports_group[i] = NULL; + cma_dev_group->default_ports_group = ports_group; + cma_dev_group->ports = ports; + + return 0; +free: + kfree(ports); + kfree(ports_group); + cma_dev_group->ports = NULL; + cma_dev_group->default_ports_group = NULL; + return err; +} + +static void release_cma_dev(struct config_item *item) +{ + struct config_group *group = container_of(item, struct config_group, + cg_item); + struct cma_dev_group *cma_dev_group = container_of(group, + struct cma_dev_group, + device_group); + + kfree(cma_dev_group); +}; + +static void release_cma_ports_group(struct config_item *item) +{ + struct config_group *group = container_of(item, struct config_group, + cg_item); + struct cma_dev_group *cma_dev_group = container_of(group, + struct cma_dev_group, + ports_group); + + kfree(cma_dev_group->ports); + kfree(cma_dev_group->default_ports_group); + cma_dev_group->ports = NULL; + cma_dev_group->default_ports_group = NULL; +}; + +static struct configfs_item_operations cma_ports_item_ops = { + .release = release_cma_ports_group +}; + +static struct config_item_type cma_ports_group_type = { + .ct_item_ops = &cma_ports_item_ops, + .ct_owner = THIS_MODULE +}; + +static struct configfs_item_operations cma_device_item_ops = { + .release = release_cma_dev +}; + +static struct config_item_type cma_device_group_type = { + .ct_item_ops = &cma_device_item_ops, + .ct_owner = THIS_MODULE +}; + +static struct config_group *make_cma_dev(struct config_group *group, + const char *name) +{ + int err = -ENODEV; + struct cma_device *cma_dev = cma_enum_devices_by_ibdev(filter_by_name, + (void *)name); + struct cma_dev_group *cma_dev_group = NULL; + + if (!cma_dev) + goto fail; + + cma_dev_group = kzalloc(sizeof(*cma_dev_group), GFP_KERNEL); + + if (!cma_dev_group) { + err = -ENOMEM; + goto fail; + } + + strncpy(cma_dev_group->name, name, sizeof(cma_dev_group->name)); + + err = make_cma_ports(cma_dev_group, cma_dev); + if (err) + goto fail; + + cma_dev_group->ports_group.default_groups = + cma_dev_group->default_ports_group; + config_group_init_type_name(&cma_dev_group->ports_group, "ports", + &cma_ports_group_type); + + cma_dev_group->device_group.default_groups + = cma_dev_group->default_dev_group; + cma_dev_group->default_dev_group[0] = &cma_dev_group->ports_group; + cma_dev_group->default_dev_group[1] = NULL; + + 
config_group_init_type_name(&cma_dev_group->device_group, name, + &cma_device_group_type); + + cma_deref_dev(cma_dev); + return &cma_dev_group->device_group; + +fail: + if (cma_dev) + cma_deref_dev(cma_dev); + kfree(cma_dev_group); + return ERR_PTR(err); +} + +static struct configfs_group_operations cma_subsys_group_ops = { + .make_group = make_cma_dev, +}; + +static struct config_item_type cma_subsys_type = { + .ct_group_ops = &cma_subsys_group_ops, + .ct_owner = THIS_MODULE, +}; + +static struct configfs_subsystem cma_subsys = { + .su_group = { + .cg_item = { + .ci_namebuf = "rdma_cm", + .ci_type = &cma_subsys_type, + }, + }, +}; + +int __init cma_configfs_init(void) +{ + config_group_init(&cma_subsys.su_group); + mutex_init(&cma_subsys.su_mutex); + return configfs_register_subsystem(&cma_subsys); +} + +void __exit cma_configfs_exit(void) +{ + configfs_unregister_subsystem(&cma_subsys); +} diff --git a/drivers/infiniband/core/core_priv.h b/drivers/infiniband/core/core_priv.h index 5cf6eb716f00..eab32215756b 100644 --- a/drivers/infiniband/core/core_priv.h +++ b/drivers/infiniband/core/core_priv.h @@ -38,6 +38,32 @@ #include <rdma/ib_verbs.h> +#if IS_ENABLED(CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS) +int cma_configfs_init(void); +void cma_configfs_exit(void); +#else +static inline int cma_configfs_init(void) +{ + return 0; +} + +static inline void cma_configfs_exit(void) +{ +} +#endif +struct cma_device; +void cma_ref_dev(struct cma_device *cma_dev); +void cma_deref_dev(struct cma_device *cma_dev); +typedef bool (*cma_device_filter)(struct ib_device *, void *); +struct cma_device *cma_enum_devices_by_ibdev(cma_device_filter filter, + void *cookie); +int cma_get_default_gid_type(struct cma_device *cma_dev, + unsigned int port); +int cma_set_default_gid_type(struct cma_device *cma_dev, + unsigned int port, + enum ib_gid_type default_gid_type); +struct ib_device *cma_get_ib_dev(struct cma_device *cma_dev); + int ib_device_register_sysfs(struct ib_device *device, int (*port_callback)(struct ib_device *, u8, struct kobject *)); @@ -70,8 +96,13 @@ enum ib_cache_gid_default_mode { IB_CACHE_GID_DEFAULT_MODE_DELETE }; +int ib_cache_gid_parse_type_str(const char *buf); + +const char *ib_cache_gid_type_str(enum ib_gid_type gid_type); + void ib_cache_gid_set_default_gid(struct ib_device *ib_dev, u8 port, struct net_device *ndev, + unsigned long gid_type_mask, enum ib_cache_gid_default_mode mode); int ib_cache_gid_add(struct ib_device *ib_dev, u8 port, @@ -87,9 +118,23 @@ int roce_gid_mgmt_init(void); void roce_gid_mgmt_cleanup(void); int roce_rescan_device(struct ib_device *ib_dev); +unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port); int ib_cache_setup_one(struct ib_device *device); void ib_cache_cleanup_one(struct ib_device *device); void ib_cache_release_one(struct ib_device *device); +static inline bool rdma_is_upper_dev_rcu(struct net_device *dev, + struct net_device *upper) +{ + struct net_device *_upper = NULL; + struct list_head *iter; + + netdev_for_each_all_upper_dev_rcu(dev, _upper, iter) + if (_upper == upper) + break; + + return _upper == upper; +} + #endif /* _CORE_PRIV_H */ diff --git a/drivers/infiniband/core/cq.c b/drivers/infiniband/core/cq.c new file mode 100644 index 000000000000..a754fc727de5 --- /dev/null +++ b/drivers/infiniband/core/cq.c @@ -0,0 +1,209 @@ +/* + * Copyright (c) 2015 HGST, a Western Digital Company. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ +#include <linux/module.h> +#include <linux/err.h> +#include <linux/slab.h> +#include <rdma/ib_verbs.h> + +/* # of WCs to poll for with a single call to ib_poll_cq */ +#define IB_POLL_BATCH 16 + +/* # of WCs to iterate over before yielding */ +#define IB_POLL_BUDGET_IRQ 256 +#define IB_POLL_BUDGET_WORKQUEUE 65536 + +#define IB_POLL_FLAGS \ + (IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS) + +static int __ib_process_cq(struct ib_cq *cq, int budget) +{ + int i, n, completed = 0; + + while ((n = ib_poll_cq(cq, IB_POLL_BATCH, cq->wc)) > 0) { + for (i = 0; i < n; i++) { + struct ib_wc *wc = &cq->wc[i]; + + if (wc->wr_cqe) + wc->wr_cqe->done(cq, wc); + else + WARN_ON_ONCE(wc->status == IB_WC_SUCCESS); + } + + completed += n; + + if (n != IB_POLL_BATCH || + (budget != -1 && completed >= budget)) + break; + } + + return completed; +} + +/** + * ib_process_cq_direct - process a CQ in caller context + * @cq: CQ to process + * @budget: number of CQEs to poll for + * + * This function is used to process all outstanding CQ entries on a + * %IB_POLL_DIRECT CQ. It does not offload CQ processing to a different + * context and does not ask for completion interrupts from the HCA. + * + * Note: for compatibility reasons -1 can be passed in %budget for unlimited + * polling. Do not use this feature in new code, it will be removed soon. + */ +int ib_process_cq_direct(struct ib_cq *cq, int budget) +{ + WARN_ON_ONCE(cq->poll_ctx != IB_POLL_DIRECT); + + return __ib_process_cq(cq, budget); +} +EXPORT_SYMBOL(ib_process_cq_direct); + +static void ib_cq_completion_direct(struct ib_cq *cq, void *private) +{ + WARN_ONCE(1, "got unsolicited completion for CQ 0x%p\n", cq); +} + +static int ib_poll_handler(struct irq_poll *iop, int budget) +{ + struct ib_cq *cq = container_of(iop, struct ib_cq, iop); + int completed; + + completed = __ib_process_cq(cq, budget); + if (completed < budget) { + irq_poll_complete(&cq->iop); + if (ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) + irq_poll_sched(&cq->iop); + } + + return completed; +} + +static void ib_cq_completion_softirq(struct ib_cq *cq, void *private) +{ + irq_poll_sched(&cq->iop); +} + +static void ib_cq_poll_work(struct work_struct *work) +{ + struct ib_cq *cq = container_of(work, struct ib_cq, work); + int completed; + + completed = __ib_process_cq(cq, IB_POLL_BUDGET_WORKQUEUE); + if (completed >= IB_POLL_BUDGET_WORKQUEUE || + ib_req_notify_cq(cq, IB_POLL_FLAGS) > 0) + queue_work(ib_comp_wq, &cq->work); +} + +static void ib_cq_completion_workqueue(struct ib_cq *cq, void *private) +{ + queue_work(ib_comp_wq, &cq->work); +} + +/** + * ib_alloc_cq - allocate a completion queue + * @dev: device to allocate the CQ for + * @private: driver private data, accessible from cq->cq_context + * @nr_cqe: number of CQEs to allocate + * @comp_vector: HCA completion vectors for this CQ + * @poll_ctx: context to poll the CQ from. + * + * This is the proper interface to allocate a CQ for in-kernel users. A + * CQ allocated with this interface will automatically be polled from the + * specified context. 
The ULP needs must use wr->wr_cqe instead of wr->wr_id + * to use this CQ abstraction. + */ +struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private, + int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx) +{ + struct ib_cq_init_attr cq_attr = { + .cqe = nr_cqe, + .comp_vector = comp_vector, + }; + struct ib_cq *cq; + int ret = -ENOMEM; + + cq = dev->create_cq(dev, &cq_attr, NULL, NULL); + if (IS_ERR(cq)) + return cq; + + cq->device = dev; + cq->uobject = NULL; + cq->event_handler = NULL; + cq->cq_context = private; + cq->poll_ctx = poll_ctx; + atomic_set(&cq->usecnt, 0); + + cq->wc = kmalloc_array(IB_POLL_BATCH, sizeof(*cq->wc), GFP_KERNEL); + if (!cq->wc) + goto out_destroy_cq; + + switch (cq->poll_ctx) { + case IB_POLL_DIRECT: + cq->comp_handler = ib_cq_completion_direct; + break; + case IB_POLL_SOFTIRQ: + cq->comp_handler = ib_cq_completion_softirq; + + irq_poll_init(&cq->iop, IB_POLL_BUDGET_IRQ, ib_poll_handler); + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + break; + case IB_POLL_WORKQUEUE: + cq->comp_handler = ib_cq_completion_workqueue; + INIT_WORK(&cq->work, ib_cq_poll_work); + ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); + break; + default: + ret = -EINVAL; + goto out_free_wc; + } + + return cq; + +out_free_wc: + kfree(cq->wc); +out_destroy_cq: + cq->device->destroy_cq(cq); + return ERR_PTR(ret); +} +EXPORT_SYMBOL(ib_alloc_cq); + +/** + * ib_free_cq - free a completion queue + * @cq: completion queue to free. + */ +void ib_free_cq(struct ib_cq *cq) +{ + int ret; + + if (WARN_ON_ONCE(atomic_read(&cq->usecnt))) + return; + + switch (cq->poll_ctx) { + case IB_POLL_DIRECT: + break; + case IB_POLL_SOFTIRQ: + irq_poll_disable(&cq->iop); + break; + case IB_POLL_WORKQUEUE: + flush_work(&cq->work); + break; + default: + WARN_ON_ONCE(1); + } + + kfree(cq->wc); + ret = cq->device->destroy_cq(cq); + WARN_ON_ONCE(ret); +} +EXPORT_SYMBOL(ib_free_cq); diff --git a/drivers/infiniband/core/device.c b/drivers/infiniband/core/device.c index 179e8134d57f..00da80e02154 100644 --- a/drivers/infiniband/core/device.c +++ b/drivers/infiniband/core/device.c @@ -58,6 +58,7 @@ struct ib_client_data { bool going_down; }; +struct workqueue_struct *ib_comp_wq; struct workqueue_struct *ib_wq; EXPORT_SYMBOL_GPL(ib_wq); @@ -325,6 +326,7 @@ int ib_register_device(struct ib_device *device, { int ret; struct ib_client *client; + struct ib_udata uhw = {.outlen = 0, .inlen = 0}; mutex_lock(&device_mutex); @@ -352,6 +354,13 @@ int ib_register_device(struct ib_device *device, goto out; } + memset(&device->attrs, 0, sizeof(device->attrs)); + ret = device->query_device(device, &device->attrs, &uhw); + if (ret) { + printk(KERN_WARNING "Couldn't query the device attributes\n"); + goto out; + } + ret = ib_device_register_sysfs(device, port_callback); if (ret) { printk(KERN_WARNING "Couldn't register device %s with driver model\n", @@ -628,25 +637,6 @@ void ib_dispatch_event(struct ib_event *event) EXPORT_SYMBOL(ib_dispatch_event); /** - * ib_query_device - Query IB device attributes - * @device:Device to query - * @device_attr:Device attributes - * - * ib_query_device() returns the attributes of a device through the - * @device_attr pointer. 
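(A minimal sketch of what replaces the removed ib_query_device() call: the attributes are now queried once at registration time and cached in struct ib_device, so a ULP capability check collapses into a plain field read. The helper name my_dev_has_odp is a hypothetical example, not part of this patch.)

	#include <rdma/ib_verbs.h>

	/* Previously: ib_query_device(dev, &attr) followed by a flags test;
	 * now the attributes cached by ib_register_device() are read directly.
	 */
	static bool my_dev_has_odp(struct ib_device *dev)
	{
		return dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING;
	}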
- */ -int ib_query_device(struct ib_device *device, - struct ib_device_attr *device_attr) -{ - struct ib_udata uhw = {.outlen = 0, .inlen = 0}; - - memset(device_attr, 0, sizeof(*device_attr)); - - return device->query_device(device, device_attr, &uhw); -} -EXPORT_SYMBOL(ib_query_device); - -/** * ib_query_port - Query IB port attributes * @device:Device to query * @port_num:Port number to query @@ -825,26 +815,31 @@ EXPORT_SYMBOL(ib_modify_port); * a specified GID value occurs. * @device: The device to query. * @gid: The GID value to search for. + * @gid_type: Type of GID. * @ndev: The ndev related to the GID to search for. * @port_num: The port number of the device where the GID value was found. * @index: The index into the GID table where the GID was found. This * parameter may be NULL. */ int ib_find_gid(struct ib_device *device, union ib_gid *gid, - struct net_device *ndev, u8 *port_num, u16 *index) + enum ib_gid_type gid_type, struct net_device *ndev, + u8 *port_num, u16 *index) { union ib_gid tmp_gid; int ret, port, i; for (port = rdma_start_port(device); port <= rdma_end_port(device); ++port) { if (rdma_cap_roce_gid_table(device, port)) { - if (!ib_find_cached_gid_by_port(device, gid, port, + if (!ib_find_cached_gid_by_port(device, gid, gid_type, port, ndev, index)) { *port_num = port; return 0; } } + if (gid_type != IB_GID_TYPE_IB) + continue; + for (i = 0; i < device->port_immutable[port].gid_tbl_len; ++i) { ret = ib_query_gid(device, port, i, &tmp_gid, NULL); if (ret) @@ -954,10 +949,18 @@ static int __init ib_core_init(void) if (!ib_wq) return -ENOMEM; + ib_comp_wq = alloc_workqueue("ib-comp-wq", + WQ_UNBOUND | WQ_HIGHPRI | WQ_MEM_RECLAIM, + WQ_UNBOUND_MAX_ACTIVE); + if (!ib_comp_wq) { + ret = -ENOMEM; + goto err; + } + ret = class_register(&ib_class); if (ret) { printk(KERN_WARNING "Couldn't create InfiniBand device class\n"); - goto err; + goto err_comp; } ret = ibnl_init(); @@ -972,7 +975,8 @@ static int __init ib_core_init(void) err_sysfs: class_unregister(&ib_class); - +err_comp: + destroy_workqueue(ib_comp_wq); err: destroy_workqueue(ib_wq); return ret; @@ -983,6 +987,7 @@ static void __exit ib_core_cleanup(void) ib_cache_cleanup(); ibnl_cleanup(); class_unregister(&ib_class); + destroy_workqueue(ib_comp_wq); /* Make sure that any pending umem accounting work is done. 
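(A minimal consumer-side sketch of the new CQ abstraction from cq.c above, whose IB_POLL_WORKQUEUE mode is served by the ib-comp-wq workqueue created here. The my_* names, the CQE count and the completion-vector choice are illustrative assumptions; ib_alloc_cq(), struct ib_cqe and wr_cqe are the interface this series adds.)

	#include <rdma/ib_verbs.h>

	/* Embed an ib_cqe in the per-request state; the core then dispatches
	 * each completion through wc->wr_cqe->done instead of the old
	 * hand-rolled wr_id decoding.
	 */
	struct my_request {
		struct ib_cqe cqe;
		void *buf;
	};

	static void my_send_done(struct ib_cq *cq, struct ib_wc *wc)
	{
		struct my_request *req =
			container_of(wc->wr_cqe, struct my_request, cqe);

		if (wc->status != IB_WC_SUCCESS)
			pr_err("send failed: %d\n", wc->status);
		/* release req->buf and complete the request here */
	}

	static struct ib_cq *my_create_cq(struct ib_device *dev)
	{
		/* 128 CQEs on completion vector 0, polled from a workqueue */
		return ib_alloc_cq(dev, NULL, 128, 0, IB_POLL_WORKQUEUE);
	}

	static int my_post_send(struct ib_qp *qp, struct my_request *req,
				struct ib_sge *sge)
	{
		struct ib_send_wr wr = { }, *bad_wr;

		req->cqe.done = my_send_done;
		wr.wr_cqe = &req->cqe;	/* instead of wr.wr_id */
		wr.sg_list = sge;
		wr.num_sge = 1;
		wr.opcode = IB_WR_SEND;
		wr.send_flags = IB_SEND_SIGNALED;
		return ib_post_send(qp, &wr, &bad_wr);
	}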
*/ destroy_workqueue(ib_wq); } diff --git a/drivers/infiniband/core/fmr_pool.c b/drivers/infiniband/core/fmr_pool.c index 9f5ad7cc33c8..6ac3683c144b 100644 --- a/drivers/infiniband/core/fmr_pool.c +++ b/drivers/infiniband/core/fmr_pool.c @@ -212,7 +212,6 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, { struct ib_device *device; struct ib_fmr_pool *pool; - struct ib_device_attr *attr; int i; int ret; int max_remaps; @@ -228,25 +227,10 @@ struct ib_fmr_pool *ib_create_fmr_pool(struct ib_pd *pd, return ERR_PTR(-ENOSYS); } - attr = kmalloc(sizeof *attr, GFP_KERNEL); - if (!attr) { - printk(KERN_WARNING PFX "couldn't allocate device attr struct\n"); - return ERR_PTR(-ENOMEM); - } - - ret = ib_query_device(device, attr); - if (ret) { - printk(KERN_WARNING PFX "couldn't query device: %d\n", ret); - kfree(attr); - return ERR_PTR(ret); - } - - if (!attr->max_map_per_fmr) + if (!device->attrs.max_map_per_fmr) max_remaps = IB_FMR_MAX_REMAPS; else - max_remaps = attr->max_map_per_fmr; - - kfree(attr); + max_remaps = device->attrs.max_map_per_fmr; pool = kmalloc(sizeof *pool, GFP_KERNEL); if (!pool) { diff --git a/drivers/infiniband/core/mad.c b/drivers/infiniband/core/mad.c index 2281de122038..9fa5bf33f5a3 100644 --- a/drivers/infiniband/core/mad.c +++ b/drivers/infiniband/core/mad.c @@ -84,6 +84,9 @@ static int add_nonoui_reg_req(struct ib_mad_reg_req *mad_reg_req, u8 mgmt_class); static int add_oui_reg_req(struct ib_mad_reg_req *mad_reg_req, struct ib_mad_agent_private *agent_priv); +static bool ib_mad_send_error(struct ib_mad_port_private *port_priv, + struct ib_wc *wc); +static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc); /* * Returns a ib_mad_port_private structure or NULL for a device/port @@ -681,7 +684,7 @@ static void snoop_recv(struct ib_mad_qp_info *qp_info, atomic_inc(&mad_snoop_priv->refcount); spin_unlock_irqrestore(&qp_info->snoop_lock, flags); - mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent, + mad_snoop_priv->agent.recv_handler(&mad_snoop_priv->agent, NULL, mad_recv_wc); deref_snoop_agent(mad_snoop_priv); spin_lock_irqsave(&qp_info->snoop_lock, flags); @@ -689,12 +692,11 @@ static void snoop_recv(struct ib_mad_qp_info *qp_info, spin_unlock_irqrestore(&qp_info->snoop_lock, flags); } -static void build_smp_wc(struct ib_qp *qp, - u64 wr_id, u16 slid, u16 pkey_index, u8 port_num, - struct ib_wc *wc) +static void build_smp_wc(struct ib_qp *qp, struct ib_cqe *cqe, u16 slid, + u16 pkey_index, u8 port_num, struct ib_wc *wc) { memset(wc, 0, sizeof *wc); - wc->wr_id = wr_id; + wc->wr_cqe = cqe; wc->status = IB_WC_SUCCESS; wc->opcode = IB_WC_RECV; wc->pkey_index = pkey_index; @@ -832,7 +834,7 @@ static int handle_outgoing_dr_smp(struct ib_mad_agent_private *mad_agent_priv, } build_smp_wc(mad_agent_priv->agent.qp, - send_wr->wr.wr_id, drslid, + send_wr->wr.wr_cqe, drslid, send_wr->pkey_index, send_wr->port_num, &mad_wc); @@ -1039,7 +1041,9 @@ struct ib_mad_send_buf * ib_create_send_mad(struct ib_mad_agent *mad_agent, mad_send_wr->sg_list[1].lkey = mad_agent->qp->pd->local_dma_lkey; - mad_send_wr->send_wr.wr.wr_id = (unsigned long) mad_send_wr; + mad_send_wr->mad_list.cqe.done = ib_mad_send_done; + + mad_send_wr->send_wr.wr.wr_cqe = &mad_send_wr->mad_list.cqe; mad_send_wr->send_wr.wr.sg_list = mad_send_wr->sg_list; mad_send_wr->send_wr.wr.num_sge = 2; mad_send_wr->send_wr.wr.opcode = IB_WR_SEND; @@ -1151,8 +1155,9 @@ int ib_send_mad(struct ib_mad_send_wr_private *mad_send_wr) /* Set WR ID to find mad_send_wr upon completion */ qp_info = 
mad_send_wr->mad_agent_priv->qp_info; - mad_send_wr->send_wr.wr.wr_id = (unsigned long)&mad_send_wr->mad_list; mad_send_wr->mad_list.mad_queue = &qp_info->send_queue; + mad_send_wr->mad_list.cqe.done = ib_mad_send_done; + mad_send_wr->send_wr.wr.wr_cqe = &mad_send_wr->mad_list.cqe; mad_agent = mad_send_wr->send_buf.mad_agent; sge = mad_send_wr->sg_list; @@ -1982,9 +1987,9 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, /* user rmpp is in effect * and this is an active RMPP MAD */ - mad_recv_wc->wc->wr_id = 0; - mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, - mad_recv_wc); + mad_agent_priv->agent.recv_handler( + &mad_agent_priv->agent, NULL, + mad_recv_wc); atomic_dec(&mad_agent_priv->refcount); } else { /* not user rmpp, revert to normal behavior and @@ -1998,9 +2003,10 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, spin_unlock_irqrestore(&mad_agent_priv->lock, flags); /* Defined behavior is to complete response before request */ - mad_recv_wc->wc->wr_id = (unsigned long) &mad_send_wr->send_buf; - mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, - mad_recv_wc); + mad_agent_priv->agent.recv_handler( + &mad_agent_priv->agent, + &mad_send_wr->send_buf, + mad_recv_wc); atomic_dec(&mad_agent_priv->refcount); mad_send_wc.status = IB_WC_SUCCESS; @@ -2009,7 +2015,7 @@ static void ib_mad_complete_recv(struct ib_mad_agent_private *mad_agent_priv, ib_mad_complete_send_wr(mad_send_wr, &mad_send_wc); } } else { - mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, + mad_agent_priv->agent.recv_handler(&mad_agent_priv->agent, NULL, mad_recv_wc); deref_mad_agent(mad_agent_priv); } @@ -2172,13 +2178,14 @@ handle_smi(struct ib_mad_port_private *port_priv, return handle_ib_smi(port_priv, qp_info, wc, port_num, recv, response); } -static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv, - struct ib_wc *wc) +static void ib_mad_recv_done(struct ib_cq *cq, struct ib_wc *wc) { + struct ib_mad_port_private *port_priv = cq->cq_context; + struct ib_mad_list_head *mad_list = + container_of(wc->wr_cqe, struct ib_mad_list_head, cqe); struct ib_mad_qp_info *qp_info; struct ib_mad_private_header *mad_priv_hdr; struct ib_mad_private *recv, *response = NULL; - struct ib_mad_list_head *mad_list; struct ib_mad_agent_private *mad_agent; int port_num; int ret = IB_MAD_RESULT_SUCCESS; @@ -2186,7 +2193,17 @@ static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv, u16 resp_mad_pkey_index = 0; bool opa; - mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id; + if (list_empty_careful(&port_priv->port_list)) + return; + + if (wc->status != IB_WC_SUCCESS) { + /* + * Receive errors indicate that the QP has entered the error + * state - error handling/shutdown code will cleanup + */ + return; + } + qp_info = mad_list->mad_queue->qp_info; dequeue_mad(mad_list); @@ -2227,7 +2244,7 @@ static void ib_mad_recv_done_handler(struct ib_mad_port_private *port_priv, response = alloc_mad_private(mad_size, GFP_KERNEL); if (!response) { dev_err(&port_priv->device->dev, - "ib_mad_recv_done_handler no memory for response buffer\n"); + "%s: no memory for response buffer\n", __func__); goto out; } @@ -2413,11 +2430,12 @@ done: spin_unlock_irqrestore(&mad_agent_priv->lock, flags); } -static void ib_mad_send_done_handler(struct ib_mad_port_private *port_priv, - struct ib_wc *wc) +static void ib_mad_send_done(struct ib_cq *cq, struct ib_wc *wc) { + struct ib_mad_port_private *port_priv = cq->cq_context; + struct 
ib_mad_list_head *mad_list = + container_of(wc->wr_cqe, struct ib_mad_list_head, cqe); struct ib_mad_send_wr_private *mad_send_wr, *queued_send_wr; - struct ib_mad_list_head *mad_list; struct ib_mad_qp_info *qp_info; struct ib_mad_queue *send_queue; struct ib_send_wr *bad_send_wr; @@ -2425,7 +2443,14 @@ static void ib_mad_send_done_handler(struct ib_mad_port_private *port_priv, unsigned long flags; int ret; - mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id; + if (list_empty_careful(&port_priv->port_list)) + return; + + if (wc->status != IB_WC_SUCCESS) { + if (!ib_mad_send_error(port_priv, wc)) + return; + } + mad_send_wr = container_of(mad_list, struct ib_mad_send_wr_private, mad_list); send_queue = mad_list->mad_queue; @@ -2490,24 +2515,15 @@ static void mark_sends_for_retry(struct ib_mad_qp_info *qp_info) spin_unlock_irqrestore(&qp_info->send_queue.lock, flags); } -static void mad_error_handler(struct ib_mad_port_private *port_priv, - struct ib_wc *wc) +static bool ib_mad_send_error(struct ib_mad_port_private *port_priv, + struct ib_wc *wc) { - struct ib_mad_list_head *mad_list; - struct ib_mad_qp_info *qp_info; + struct ib_mad_list_head *mad_list = + container_of(wc->wr_cqe, struct ib_mad_list_head, cqe); + struct ib_mad_qp_info *qp_info = mad_list->mad_queue->qp_info; struct ib_mad_send_wr_private *mad_send_wr; int ret; - /* Determine if failure was a send or receive */ - mad_list = (struct ib_mad_list_head *)(unsigned long)wc->wr_id; - qp_info = mad_list->mad_queue->qp_info; - if (mad_list->mad_queue == &qp_info->recv_queue) - /* - * Receive errors indicate that the QP has entered the error - * state - error handling/shutdown code will cleanup - */ - return; - /* * Send errors will transition the QP to SQE - move * QP to RTS and repost flushed work requests @@ -2522,10 +2538,9 @@ static void mad_error_handler(struct ib_mad_port_private *port_priv, mad_send_wr->retry = 0; ret = ib_post_send(qp_info->qp, &mad_send_wr->send_wr.wr, &bad_send_wr); - if (ret) - ib_mad_send_done_handler(port_priv, wc); - } else - ib_mad_send_done_handler(port_priv, wc); + if (!ret) + return false; + } } else { struct ib_qp_attr *attr; @@ -2539,42 +2554,14 @@ static void mad_error_handler(struct ib_mad_port_private *port_priv, kfree(attr); if (ret) dev_err(&port_priv->device->dev, - "mad_error_handler - ib_modify_qp to RTS : %d\n", - ret); + "%s - ib_modify_qp to RTS: %d\n", + __func__, ret); else mark_sends_for_retry(qp_info); } - ib_mad_send_done_handler(port_priv, wc); } -} -/* - * IB MAD completion callback - */ -static void ib_mad_completion_handler(struct work_struct *work) -{ - struct ib_mad_port_private *port_priv; - struct ib_wc wc; - - port_priv = container_of(work, struct ib_mad_port_private, work); - ib_req_notify_cq(port_priv->cq, IB_CQ_NEXT_COMP); - - while (ib_poll_cq(port_priv->cq, 1, &wc) == 1) { - if (wc.status == IB_WC_SUCCESS) { - switch (wc.opcode) { - case IB_WC_SEND: - ib_mad_send_done_handler(port_priv, &wc); - break; - case IB_WC_RECV: - ib_mad_recv_done_handler(port_priv, &wc); - break; - default: - BUG_ON(1); - break; - } - } else - mad_error_handler(port_priv, &wc); - } + return true; } static void cancel_mads(struct ib_mad_agent_private *mad_agent_priv) @@ -2716,7 +2703,7 @@ static void local_completions(struct work_struct *work) * before request */ build_smp_wc(recv_mad_agent->agent.qp, - (unsigned long) local->mad_send_wr, + local->mad_send_wr->send_wr.wr.wr_cqe, be16_to_cpu(IB_LID_PERMISSIVE), local->mad_send_wr->send_wr.pkey_index, 
recv_mad_agent->agent.port_num, &wc); @@ -2744,6 +2731,7 @@ static void local_completions(struct work_struct *work) IB_MAD_SNOOP_RECVS); recv_mad_agent->agent.recv_handler( &recv_mad_agent->agent, + &local->mad_send_wr->send_buf, &local->mad_priv->header.recv_wc); spin_lock_irqsave(&recv_mad_agent->lock, flags); atomic_dec(&recv_mad_agent->refcount); @@ -2855,17 +2843,6 @@ static void timeout_sends(struct work_struct *work) spin_unlock_irqrestore(&mad_agent_priv->lock, flags); } -static void ib_mad_thread_completion_handler(struct ib_cq *cq, void *arg) -{ - struct ib_mad_port_private *port_priv = cq->cq_context; - unsigned long flags; - - spin_lock_irqsave(&ib_mad_port_list_lock, flags); - if (!list_empty(&port_priv->port_list)) - queue_work(port_priv->wq, &port_priv->work); - spin_unlock_irqrestore(&ib_mad_port_list_lock, flags); -} - /* * Allocate receive MADs and post receive WRs for them */ @@ -2913,8 +2890,9 @@ static int ib_mad_post_receive_mads(struct ib_mad_qp_info *qp_info, break; } mad_priv->header.mapping = sg_list.addr; - recv_wr.wr_id = (unsigned long)&mad_priv->header.mad_list; mad_priv->header.mad_list.mad_queue = recv_queue; + mad_priv->header.mad_list.cqe.done = ib_mad_recv_done; + recv_wr.wr_cqe = &mad_priv->header.mad_list.cqe; /* Post receive WR */ spin_lock_irqsave(&recv_queue->lock, flags); @@ -3151,7 +3129,6 @@ static int ib_mad_port_open(struct ib_device *device, unsigned long flags; char name[sizeof "ib_mad123"]; int has_smi; - struct ib_cq_init_attr cq_attr = {}; if (WARN_ON(rdma_max_mad_size(device, port_num) < IB_MGMT_MAD_SIZE)) return -EFAULT; @@ -3179,10 +3156,8 @@ static int ib_mad_port_open(struct ib_device *device, if (has_smi) cq_size *= 2; - cq_attr.cqe = cq_size; - port_priv->cq = ib_create_cq(port_priv->device, - ib_mad_thread_completion_handler, - NULL, port_priv, &cq_attr); + port_priv->cq = ib_alloc_cq(port_priv->device, port_priv, cq_size, 0, + IB_POLL_WORKQUEUE); if (IS_ERR(port_priv->cq)) { dev_err(&device->dev, "Couldn't create ib_mad CQ\n"); ret = PTR_ERR(port_priv->cq); @@ -3211,7 +3186,6 @@ static int ib_mad_port_open(struct ib_device *device, ret = -ENOMEM; goto error8; } - INIT_WORK(&port_priv->work, ib_mad_completion_handler); spin_lock_irqsave(&ib_mad_port_list_lock, flags); list_add_tail(&port_priv->port_list, &ib_mad_port_list); @@ -3238,7 +3212,7 @@ error7: error6: ib_dealloc_pd(port_priv->pd); error4: - ib_destroy_cq(port_priv->cq); + ib_free_cq(port_priv->cq); cleanup_recv_queue(&port_priv->qp_info[1]); cleanup_recv_queue(&port_priv->qp_info[0]); error3: @@ -3271,7 +3245,7 @@ static int ib_mad_port_close(struct ib_device *device, int port_num) destroy_mad_qp(&port_priv->qp_info[1]); destroy_mad_qp(&port_priv->qp_info[0]); ib_dealloc_pd(port_priv->pd); - ib_destroy_cq(port_priv->cq); + ib_free_cq(port_priv->cq); cleanup_recv_queue(&port_priv->qp_info[1]); cleanup_recv_queue(&port_priv->qp_info[0]); /* XXX: Handle deallocation of MAD registration tables */ diff --git a/drivers/infiniband/core/mad_priv.h b/drivers/infiniband/core/mad_priv.h index 990698a6ab4b..28669f6419e1 100644 --- a/drivers/infiniband/core/mad_priv.h +++ b/drivers/infiniband/core/mad_priv.h @@ -64,6 +64,7 @@ struct ib_mad_list_head { struct list_head list; + struct ib_cqe cqe; struct ib_mad_queue *mad_queue; }; @@ -204,7 +205,6 @@ struct ib_mad_port_private { struct ib_mad_mgmt_version_table version[MAX_MGMT_VERSION]; struct list_head agent_list; struct workqueue_struct *wq; - struct work_struct work; struct ib_mad_qp_info qp_info[IB_MAD_QPS_CORE]; }; diff --git 
a/drivers/infiniband/core/multicast.c b/drivers/infiniband/core/multicast.c index bb6685fb08c6..250937cb9a1a 100644 --- a/drivers/infiniband/core/multicast.c +++ b/drivers/infiniband/core/multicast.c @@ -723,14 +723,27 @@ EXPORT_SYMBOL(ib_sa_get_mcmember_rec); int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num, struct ib_sa_mcmember_rec *rec, + struct net_device *ndev, + enum ib_gid_type gid_type, struct ib_ah_attr *ah_attr) { int ret; u16 gid_index; u8 p; - ret = ib_find_cached_gid(device, &rec->port_gid, - NULL, &p, &gid_index); + if (rdma_protocol_roce(device, port_num)) { + ret = ib_find_cached_gid_by_port(device, &rec->port_gid, + gid_type, port_num, + ndev, + &gid_index); + } else if (rdma_protocol_ib(device, port_num)) { + ret = ib_find_cached_gid(device, &rec->port_gid, + IB_GID_TYPE_IB, NULL, &p, + &gid_index); + } else { + ret = -EINVAL; + } + if (ret) return ret; diff --git a/drivers/infiniband/core/roce_gid_mgmt.c b/drivers/infiniband/core/roce_gid_mgmt.c index 178f98482e13..06556c34606d 100644 --- a/drivers/infiniband/core/roce_gid_mgmt.c +++ b/drivers/infiniband/core/roce_gid_mgmt.c @@ -67,17 +67,53 @@ struct netdev_event_work { struct netdev_event_work_cmd cmds[ROCE_NETDEV_CALLBACK_SZ]; }; +static const struct { + bool (*is_supported)(const struct ib_device *device, u8 port_num); + enum ib_gid_type gid_type; +} PORT_CAP_TO_GID_TYPE[] = { + {rdma_protocol_roce_eth_encap, IB_GID_TYPE_ROCE}, + {rdma_protocol_roce_udp_encap, IB_GID_TYPE_ROCE_UDP_ENCAP}, +}; + +#define CAP_TO_GID_TABLE_SIZE ARRAY_SIZE(PORT_CAP_TO_GID_TYPE) + +unsigned long roce_gid_type_mask_support(struct ib_device *ib_dev, u8 port) +{ + int i; + unsigned int ret_flags = 0; + + if (!rdma_protocol_roce(ib_dev, port)) + return 1UL << IB_GID_TYPE_IB; + + for (i = 0; i < CAP_TO_GID_TABLE_SIZE; i++) + if (PORT_CAP_TO_GID_TYPE[i].is_supported(ib_dev, port)) + ret_flags |= 1UL << PORT_CAP_TO_GID_TYPE[i].gid_type; + + return ret_flags; +} +EXPORT_SYMBOL(roce_gid_type_mask_support); + static void update_gid(enum gid_op_type gid_op, struct ib_device *ib_dev, u8 port, union ib_gid *gid, struct ib_gid_attr *gid_attr) { - switch (gid_op) { - case GID_ADD: - ib_cache_gid_add(ib_dev, port, gid, gid_attr); - break; - case GID_DEL: - ib_cache_gid_del(ib_dev, port, gid, gid_attr); - break; + int i; + unsigned long gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + + for (i = 0; i < IB_GID_TYPE_SIZE; i++) { + if ((1UL << i) & gid_type_mask) { + gid_attr->gid_type = i; + switch (gid_op) { + case GID_ADD: + ib_cache_gid_add(ib_dev, port, + gid, gid_attr); + break; + case GID_DEL: + ib_cache_gid_del(ib_dev, port, + gid, gid_attr); + break; + } + } } } @@ -103,18 +139,6 @@ static enum bonding_slave_state is_eth_active_slave_of_bonding_rcu(struct net_de return BONDING_SLAVE_STATE_NA; } -static bool is_upper_dev_rcu(struct net_device *dev, struct net_device *upper) -{ - struct net_device *_upper = NULL; - struct list_head *iter; - - netdev_for_each_all_upper_dev_rcu(dev, _upper, iter) - if (_upper == upper) - break; - - return _upper == upper; -} - #define REQUIRED_BOND_STATES (BONDING_SLAVE_STATE_ACTIVE | \ BONDING_SLAVE_STATE_NA) static int is_eth_port_of_netdev(struct ib_device *ib_dev, u8 port, @@ -132,7 +156,7 @@ static int is_eth_port_of_netdev(struct ib_device *ib_dev, u8 port, if (!real_dev) real_dev = event_ndev; - res = ((is_upper_dev_rcu(rdma_ndev, event_ndev) && + res = ((rdma_is_upper_dev_rcu(rdma_ndev, event_ndev) && (is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) & REQUIRED_BOND_STATES)) 
|| real_dev == rdma_ndev); @@ -178,7 +202,7 @@ static int upper_device_filter(struct ib_device *ib_dev, u8 port, return 1; rcu_read_lock(); - res = is_upper_dev_rcu(rdma_ndev, event_ndev); + res = rdma_is_upper_dev_rcu(rdma_ndev, event_ndev); rcu_read_unlock(); return res; @@ -203,10 +227,12 @@ static void enum_netdev_default_gids(struct ib_device *ib_dev, u8 port, struct net_device *event_ndev, struct net_device *rdma_ndev) { + unsigned long gid_type_mask; + rcu_read_lock(); if (!rdma_ndev || ((rdma_ndev != event_ndev && - !is_upper_dev_rcu(rdma_ndev, event_ndev)) || + !rdma_is_upper_dev_rcu(rdma_ndev, event_ndev)) || is_eth_active_slave_of_bonding_rcu(rdma_ndev, netdev_master_upper_dev_get_rcu(rdma_ndev)) == BONDING_SLAVE_STATE_INACTIVE)) { @@ -215,7 +241,9 @@ static void enum_netdev_default_gids(struct ib_device *ib_dev, } rcu_read_unlock(); - ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, + gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + + ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_SET); } @@ -234,12 +262,17 @@ static void bond_delete_netdev_default_gids(struct ib_device *ib_dev, rcu_read_lock(); - if (is_upper_dev_rcu(rdma_ndev, event_ndev) && + if (rdma_is_upper_dev_rcu(rdma_ndev, event_ndev) && is_eth_active_slave_of_bonding_rcu(rdma_ndev, real_dev) == BONDING_SLAVE_STATE_INACTIVE) { + unsigned long gid_type_mask; + rcu_read_unlock(); + gid_type_mask = roce_gid_type_mask_support(ib_dev, port); + ib_cache_gid_set_default_gid(ib_dev, port, rdma_ndev, + gid_type_mask, IB_CACHE_GID_DEFAULT_MODE_DELETE); } else { rcu_read_unlock(); diff --git a/drivers/infiniband/core/sa_query.c b/drivers/infiniband/core/sa_query.c index a95a32ba596e..f334090bb612 100644 --- a/drivers/infiniband/core/sa_query.c +++ b/drivers/infiniband/core/sa_query.c @@ -49,7 +49,9 @@ #include <net/netlink.h> #include <uapi/rdma/ib_user_sa.h> #include <rdma/ib_marshall.h> +#include <rdma/ib_addr.h> #include "sa.h" +#include "core_priv.h" MODULE_AUTHOR("Roland Dreier"); MODULE_DESCRIPTION("InfiniBand subnet administration query support"); @@ -715,7 +717,9 @@ static int ib_nl_handle_set_timeout(struct sk_buff *skb, struct nlattr *tb[LS_NLA_TYPE_MAX]; int ret; - if (!netlink_capable(skb, CAP_NET_ADMIN)) + if (!(nlh->nlmsg_flags & NLM_F_REQUEST) || + !(NETLINK_CB(skb).sk) || + !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; ret = nla_parse(tb, LS_NLA_TYPE_MAX - 1, nlmsg_data(nlh), @@ -789,7 +793,9 @@ static int ib_nl_handle_resolve_resp(struct sk_buff *skb, int found = 0; int ret; - if (!netlink_capable(skb, CAP_NET_ADMIN)) + if ((nlh->nlmsg_flags & NLM_F_REQUEST) || + !(NETLINK_CB(skb).sk) || + !netlink_capable(skb, CAP_NET_ADMIN)) return -EPERM; spin_lock_irqsave(&ib_nl_request_lock, flags); @@ -996,7 +1002,8 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, { int ret; u16 gid_index; - int force_grh; + int use_roce; + struct net_device *ndev = NULL; memset(ah_attr, 0, sizeof *ah_attr); ah_attr->dlid = be16_to_cpu(rec->dlid); @@ -1006,16 +1013,71 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, ah_attr->port_num = port_num; ah_attr->static_rate = rec->rate; - force_grh = rdma_cap_eth_ah(device, port_num); + use_roce = rdma_cap_eth_ah(device, port_num); + + if (use_roce) { + struct net_device *idev; + struct net_device *resolved_dev; + struct rdma_dev_addr dev_addr = {.bound_dev_if = rec->ifindex, + .net = rec->net ? 
rec->net : + &init_net}; + union { + struct sockaddr _sockaddr; + struct sockaddr_in _sockaddr_in; + struct sockaddr_in6 _sockaddr_in6; + } sgid_addr, dgid_addr; + + if (!device->get_netdev) + return -EOPNOTSUPP; + + rdma_gid2ip(&sgid_addr._sockaddr, &rec->sgid); + rdma_gid2ip(&dgid_addr._sockaddr, &rec->dgid); + + /* validate the route */ + ret = rdma_resolve_ip_route(&sgid_addr._sockaddr, + &dgid_addr._sockaddr, &dev_addr); + if (ret) + return ret; - if (rec->hop_limit > 1 || force_grh) { - struct net_device *ndev = ib_get_ndev_from_path(rec); + if ((dev_addr.network == RDMA_NETWORK_IPV4 || + dev_addr.network == RDMA_NETWORK_IPV6) && + rec->gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP) + return -EINVAL; + + idev = device->get_netdev(device, port_num); + if (!idev) + return -ENODEV; + + resolved_dev = dev_get_by_index(dev_addr.net, + dev_addr.bound_dev_if); + if (!resolved_dev) { + dev_put(idev); + return -ENODEV; + } + if (resolved_dev->flags & IFF_LOOPBACK) { + dev_put(resolved_dev); + resolved_dev = idev; + dev_hold(resolved_dev); + } + ndev = ib_get_ndev_from_path(rec); + rcu_read_lock(); + if ((ndev && ndev != resolved_dev) || + (resolved_dev != idev && + !rdma_is_upper_dev_rcu(idev, resolved_dev))) + ret = -EHOSTUNREACH; + rcu_read_unlock(); + dev_put(idev); + dev_put(resolved_dev); + if (ret) { + if (ndev) + dev_put(ndev); + return ret; + } + } + if (rec->hop_limit > 1 || use_roce) { ah_attr->ah_flags = IB_AH_GRH; ah_attr->grh.dgid = rec->dgid; - ret = ib_find_cached_gid(device, &rec->sgid, ndev, &port_num, - &gid_index); + ret = ib_find_cached_gid_by_port(device, &rec->sgid, + rec->gid_type, port_num, ndev, + &gid_index); if (ret) { if (ndev) dev_put(ndev); @@ -1029,9 +1091,10 @@ int ib_init_ah_from_path(struct ib_device *device, u8 port_num, if (ndev) dev_put(ndev); } - if (force_grh) { + + if (use_roce) memcpy(ah_attr->dmac, rec->dmac, ETH_ALEN); - } + return 0; } EXPORT_SYMBOL(ib_init_ah_from_path); @@ -1157,6 +1220,7 @@ static void ib_sa_path_rec_callback(struct ib_sa_query *sa_query, mad->data, &rec); rec.net = NULL; rec.ifindex = 0; + rec.gid_type = IB_GID_TYPE_IB; memset(rec.dmac, 0, ETH_ALEN); query->callback(status, &rec, query->context); } else @@ -1609,14 +1673,15 @@ static void send_handler(struct ib_mad_agent *agent, } static void recv_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_sa_query *query; - struct ib_mad_send_buf *mad_buf; - mad_buf = (void *) (unsigned long) mad_recv_wc->wc->wr_id; - query = mad_buf->context[0]; + if (!send_buf) + return; + query = send_buf->context[0]; if (query->callback) { if (mad_recv_wc->wc->status == IB_WC_SUCCESS) query->callback(query, diff --git a/drivers/infiniband/core/sysfs.c b/drivers/infiniband/core/sysfs.c index b1f37d4095fa..3de93517efe4 100644 --- a/drivers/infiniband/core/sysfs.c +++ b/drivers/infiniband/core/sysfs.c @@ -37,15 +37,27 @@ #include <linux/slab.h> #include <linux/stat.h> #include <linux/string.h> +#include <linux/netdevice.h> #include <rdma/ib_mad.h> +#include <rdma/ib_pma.h> +struct ib_port; + +struct gid_attr_group { + struct ib_port *port; + struct kobject kobj; + struct attribute_group ndev; + struct attribute_group type; +}; struct ib_port { struct kobject kobj; struct ib_device *ibdev; + struct gid_attr_group *gid_attr_group; struct attribute_group gid_group; struct attribute_group pkey_group; u8 port_num; + struct attribute_group *pma_table; }; struct port_attribute { @@ -65,6 +77,7 @@ struct port_table_attribute { struct port_attribute attr; char name[8]; int index; + __be16 attr_id; }; static
ssize_t port_attr_show(struct kobject *kobj, @@ -84,6 +97,24 @@ static const struct sysfs_ops port_sysfs_ops = { .show = port_attr_show }; +static ssize_t gid_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct port_attribute *port_attr = + container_of(attr, struct port_attribute, attr); + struct ib_port *p = container_of(kobj, struct gid_attr_group, + kobj)->port; + + if (!port_attr->show) + return -EIO; + + return port_attr->show(p, port_attr, buf); +} + +static const struct sysfs_ops gid_attr_sysfs_ops = { + .show = gid_attr_show +}; + static ssize_t state_show(struct ib_port *p, struct port_attribute *unused, char *buf) { @@ -281,6 +312,46 @@ static struct attribute *port_default_attrs[] = { NULL }; +static ssize_t print_ndev(struct ib_gid_attr *gid_attr, char *buf) +{ + if (!gid_attr->ndev) + return -EINVAL; + + return sprintf(buf, "%s\n", gid_attr->ndev->name); +} + +static ssize_t print_gid_type(struct ib_gid_attr *gid_attr, char *buf) +{ + return sprintf(buf, "%s\n", ib_cache_gid_type_str(gid_attr->gid_type)); +} + +static ssize_t _show_port_gid_attr(struct ib_port *p, + struct port_attribute *attr, + char *buf, + ssize_t (*print)(struct ib_gid_attr *gid_attr, + char *buf)) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + union ib_gid gid; + struct ib_gid_attr gid_attr = {}; + ssize_t ret; + + ret = ib_query_gid(p->ibdev, p->port_num, tab_attr->index, &gid, + &gid_attr); + if (ret) + goto err; + + ret = print(&gid_attr, buf); + +err: + if (gid_attr.ndev) + dev_put(gid_attr.ndev); + return ret; +} + static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr, char *buf) { @@ -296,6 +367,19 @@ static ssize_t show_port_gid(struct ib_port *p, struct port_attribute *attr, return sprintf(buf, "%pI6\n", gid.raw); } +static ssize_t show_port_gid_attr_ndev(struct ib_port *p, + struct port_attribute *attr, char *buf) +{ + return _show_port_gid_attr(p, attr, buf, print_ndev); +} + +static ssize_t show_port_gid_attr_gid_type(struct ib_port *p, + struct port_attribute *attr, + char *buf) +{ + return _show_port_gid_attr(p, attr, buf, print_gid_type); +} + static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr, char *buf) { @@ -314,24 +398,32 @@ static ssize_t show_port_pkey(struct ib_port *p, struct port_attribute *attr, #define PORT_PMA_ATTR(_name, _counter, _width, _offset) \ struct port_table_attribute port_pma_attr_##_name = { \ .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \ - .index = (_offset) | ((_width) << 16) | ((_counter) << 24) \ + .index = (_offset) | ((_width) << 16) | ((_counter) << 24), \ + .attr_id = IB_PMA_PORT_COUNTERS, \ } -static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, - char *buf) +#define PORT_PMA_ATTR_EXT(_name, _width, _offset) \ +struct port_table_attribute port_pma_attr_ext_##_name = { \ + .attr = __ATTR(_name, S_IRUGO, show_pma_counter, NULL), \ + .index = (_offset) | ((_width) << 16), \ + .attr_id = IB_PMA_PORT_COUNTERS_EXT, \ +} + +/* + * Get a Perfmgmt MAD block of data. + * Returns error code or the number of bytes retrieved.
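+ * + * As a worked example of the .index encoding used by the PORT_PMA_ATTR + * macros above: PORT_PMA_ATTR_EXT(port_xmit_data, 64, 64) declares a 64-bit + * counter at bit offset 64 of the extended attribute, so show_pma_counter() + * below calls get_perf_mad(dev, port, IB_PMA_PORT_COUNTERS_EXT, data, + * 40 + 64 / 8, 8) and prints the value with be64_to_cpup().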
+ */ +static int get_perf_mad(struct ib_device *dev, int port_num, __be16 attr, + void *data, int offset, size_t size) { - struct port_table_attribute *tab_attr = - container_of(attr, struct port_table_attribute, attr); - int offset = tab_attr->index & 0xffff; - int width = (tab_attr->index >> 16) & 0xff; - struct ib_mad *in_mad = NULL; - struct ib_mad *out_mad = NULL; + struct ib_mad *in_mad; + struct ib_mad *out_mad; size_t mad_size = sizeof(*out_mad); u16 out_mad_pkey_index = 0; ssize_t ret; - if (!p->ibdev->process_mad) - return sprintf(buf, "N/A (no PMA)\n"); + if (!dev->process_mad) + return -ENOSYS; in_mad = kzalloc(sizeof *in_mad, GFP_KERNEL); out_mad = kmalloc(sizeof *out_mad, GFP_KERNEL); @@ -344,12 +436,13 @@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, in_mad->mad_hdr.mgmt_class = IB_MGMT_CLASS_PERF_MGMT; in_mad->mad_hdr.class_version = 1; in_mad->mad_hdr.method = IB_MGMT_METHOD_GET; - in_mad->mad_hdr.attr_id = cpu_to_be16(0x12); /* PortCounters */ + in_mad->mad_hdr.attr_id = attr; - in_mad->data[41] = p->port_num; /* PortSelect field */ + if (attr != IB_PMA_CLASS_PORT_INFO) + in_mad->data[41] = port_num; /* PortSelect field */ - if ((p->ibdev->process_mad(p->ibdev, IB_MAD_IGNORE_MKEY, - p->port_num, NULL, NULL, + if ((dev->process_mad(dev, IB_MAD_IGNORE_MKEY, + port_num, NULL, NULL, (const struct ib_mad_hdr *)in_mad, mad_size, (struct ib_mad_hdr *)out_mad, &mad_size, &out_mad_pkey_index) & @@ -358,31 +451,54 @@ static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, ret = -EINVAL; goto out; } + memcpy(data, out_mad->data + offset, size); + ret = size; +out: + kfree(in_mad); + kfree(out_mad); + return ret; +} + +static ssize_t show_pma_counter(struct ib_port *p, struct port_attribute *attr, + char *buf) +{ + struct port_table_attribute *tab_attr = + container_of(attr, struct port_table_attribute, attr); + int offset = tab_attr->index & 0xffff; + int width = (tab_attr->index >> 16) & 0xff; + ssize_t ret; + u8 data[8]; + + ret = get_perf_mad(p->ibdev, p->port_num, tab_attr->attr_id, &data, + 40 + offset / 8, sizeof(data)); + if (ret < 0) + return sprintf(buf, "N/A (no PMA)\n"); switch (width) { case 4: - ret = sprintf(buf, "%u\n", (out_mad->data[40 + offset / 8] >> + ret = sprintf(buf, "%u\n", (*data >> (4 - (offset % 8))) & 0xf); break; case 8: - ret = sprintf(buf, "%u\n", out_mad->data[40 + offset / 8]); + ret = sprintf(buf, "%u\n", *data); break; case 16: ret = sprintf(buf, "%u\n", - be16_to_cpup((__be16 *)(out_mad->data + 40 + offset / 8))); + be16_to_cpup((__be16 *)data)); break; case 32: ret = sprintf(buf, "%u\n", - be32_to_cpup((__be32 *)(out_mad->data + 40 + offset / 8))); + be32_to_cpup((__be32 *)data)); + break; + case 64: + ret = sprintf(buf, "%llu\n", + be64_to_cpup((__be64 *)data)); break; + default: ret = 0; } -out: - kfree(in_mad); - kfree(out_mad); - return ret; } @@ -403,6 +519,18 @@ static PORT_PMA_ATTR(port_rcv_data , 13, 32, 224); static PORT_PMA_ATTR(port_xmit_packets , 14, 32, 256); static PORT_PMA_ATTR(port_rcv_packets , 15, 32, 288); +/* + * Counters added by extended set + */ +static PORT_PMA_ATTR_EXT(port_xmit_data , 64, 64); +static PORT_PMA_ATTR_EXT(port_rcv_data , 64, 128); +static PORT_PMA_ATTR_EXT(port_xmit_packets , 64, 192); +static PORT_PMA_ATTR_EXT(port_rcv_packets , 64, 256); +static PORT_PMA_ATTR_EXT(unicast_xmit_packets , 64, 320); +static PORT_PMA_ATTR_EXT(unicast_rcv_packets , 64, 384); +static PORT_PMA_ATTR_EXT(multicast_xmit_packets , 64, 448); +static 
PORT_PMA_ATTR_EXT(multicast_rcv_packets , 64, 512); + static struct attribute *pma_attrs[] = { &port_pma_attr_symbol_error.attr.attr, &port_pma_attr_link_error_recovery.attr.attr, @@ -423,11 +551,65 @@ static struct attribute *pma_attrs[] = { NULL }; +static struct attribute *pma_attrs_ext[] = { + &port_pma_attr_symbol_error.attr.attr, + &port_pma_attr_link_error_recovery.attr.attr, + &port_pma_attr_link_downed.attr.attr, + &port_pma_attr_port_rcv_errors.attr.attr, + &port_pma_attr_port_rcv_remote_physical_errors.attr.attr, + &port_pma_attr_port_rcv_switch_relay_errors.attr.attr, + &port_pma_attr_port_xmit_discards.attr.attr, + &port_pma_attr_port_xmit_constraint_errors.attr.attr, + &port_pma_attr_port_rcv_constraint_errors.attr.attr, + &port_pma_attr_local_link_integrity_errors.attr.attr, + &port_pma_attr_excessive_buffer_overrun_errors.attr.attr, + &port_pma_attr_VL15_dropped.attr.attr, + &port_pma_attr_ext_port_xmit_data.attr.attr, + &port_pma_attr_ext_port_rcv_data.attr.attr, + &port_pma_attr_ext_port_xmit_packets.attr.attr, + &port_pma_attr_ext_port_rcv_packets.attr.attr, + &port_pma_attr_ext_unicast_rcv_packets.attr.attr, + &port_pma_attr_ext_unicast_xmit_packets.attr.attr, + &port_pma_attr_ext_multicast_rcv_packets.attr.attr, + &port_pma_attr_ext_multicast_xmit_packets.attr.attr, + NULL +}; + +static struct attribute *pma_attrs_noietf[] = { + &port_pma_attr_symbol_error.attr.attr, + &port_pma_attr_link_error_recovery.attr.attr, + &port_pma_attr_link_downed.attr.attr, + &port_pma_attr_port_rcv_errors.attr.attr, + &port_pma_attr_port_rcv_remote_physical_errors.attr.attr, + &port_pma_attr_port_rcv_switch_relay_errors.attr.attr, + &port_pma_attr_port_xmit_discards.attr.attr, + &port_pma_attr_port_xmit_constraint_errors.attr.attr, + &port_pma_attr_port_rcv_constraint_errors.attr.attr, + &port_pma_attr_local_link_integrity_errors.attr.attr, + &port_pma_attr_excessive_buffer_overrun_errors.attr.attr, + &port_pma_attr_VL15_dropped.attr.attr, + &port_pma_attr_ext_port_xmit_data.attr.attr, + &port_pma_attr_ext_port_rcv_data.attr.attr, + &port_pma_attr_ext_port_xmit_packets.attr.attr, + &port_pma_attr_ext_port_rcv_packets.attr.attr, + NULL +}; + static struct attribute_group pma_group = { .name = "counters", .attrs = pma_attrs }; +static struct attribute_group pma_group_ext = { + .name = "counters", + .attrs = pma_attrs_ext +}; + +static struct attribute_group pma_group_noietf = { + .name = "counters", + .attrs = pma_attrs_noietf +}; + static void ib_port_release(struct kobject *kobj) { struct ib_port *p = container_of(kobj, struct ib_port, kobj); @@ -451,12 +633,41 @@ static void ib_port_release(struct kobject *kobj) kfree(p); } +static void ib_port_gid_attr_release(struct kobject *kobj) +{ + struct gid_attr_group *g = container_of(kobj, struct gid_attr_group, + kobj); + struct attribute *a; + int i; + + if (g->ndev.attrs) { + for (i = 0; (a = g->ndev.attrs[i]); ++i) + kfree(a); + + kfree(g->ndev.attrs); + } + + if (g->type.attrs) { + for (i = 0; (a = g->type.attrs[i]); ++i) + kfree(a); + + kfree(g->type.attrs); + } + + kfree(g); +} + static struct kobj_type port_type = { .release = ib_port_release, .sysfs_ops = &port_sysfs_ops, .default_attrs = port_default_attrs }; +static struct kobj_type gid_attr_type = { + .sysfs_ops = &gid_attr_sysfs_ops, + .release = ib_port_gid_attr_release +}; + static struct attribute ** alloc_group_attrs(ssize_t (*show)(struct ib_port *, struct port_attribute *, char *buf), @@ -500,6 +711,31 @@ err: return NULL; } +/* + * Figure out which counter table to use 
depending on + * the device capabilities. + */ +static struct attribute_group *get_counter_table(struct ib_device *dev, + int port_num) +{ + struct ib_class_port_info cpi; + + if (get_perf_mad(dev, port_num, IB_PMA_CLASS_PORT_INFO, + &cpi, 40, sizeof(cpi)) >= 0) { + + if (cpi.capability_mask & IB_PMA_CLASS_CAP_EXT_WIDTH) + /* We have extended counters */ + return &pma_group_ext; + + if (cpi.capability_mask & IB_PMA_CLASS_CAP_EXT_WIDTH_NOIETF) + /* But not the IETF ones */ + return &pma_group_noietf; + } + + /* Fall back to normal counters */ + return &pma_group; +} + static int add_port(struct ib_device *device, int port_num, int (*port_callback)(struct ib_device *, u8, struct kobject *)) @@ -528,9 +764,24 @@ static int add_port(struct ib_device *device, int port_num, return ret; } - ret = sysfs_create_group(&p->kobj, &pma_group); - if (ret) + p->gid_attr_group = kzalloc(sizeof(*p->gid_attr_group), GFP_KERNEL); + if (!p->gid_attr_group) { + ret = -ENOMEM; goto err_put; + } + + p->gid_attr_group->port = p; + ret = kobject_init_and_add(&p->gid_attr_group->kobj, &gid_attr_type, + &p->kobj, "gid_attrs"); + if (ret) { + kfree(p->gid_attr_group); + goto err_put; + } + + p->pma_table = get_counter_table(device, port_num); + ret = sysfs_create_group(&p->kobj, p->pma_table); + if (ret) + goto err_put_gid_attrs; p->gid_group.name = "gids"; p->gid_group.attrs = alloc_group_attrs(show_port_gid, attr.gid_tbl_len); @@ -543,12 +794,38 @@ static int add_port(struct ib_device *device, int port_num, if (ret) goto err_free_gid; + p->gid_attr_group->ndev.name = "ndevs"; + p->gid_attr_group->ndev.attrs = alloc_group_attrs(show_port_gid_attr_ndev, + attr.gid_tbl_len); + if (!p->gid_attr_group->ndev.attrs) { + ret = -ENOMEM; + goto err_remove_gid; + } + + ret = sysfs_create_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->ndev); + if (ret) + goto err_free_gid_ndev; + + p->gid_attr_group->type.name = "types"; + p->gid_attr_group->type.attrs = alloc_group_attrs(show_port_gid_attr_gid_type, + attr.gid_tbl_len); + if (!p->gid_attr_group->type.attrs) { + ret = -ENOMEM; + goto err_remove_gid_ndev; + } + + ret = sysfs_create_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->type); + if (ret) + goto err_free_gid_type; + p->pkey_group.name = "pkeys"; p->pkey_group.attrs = alloc_group_attrs(show_port_pkey, attr.pkey_tbl_len); if (!p->pkey_group.attrs) { ret = -ENOMEM; - goto err_remove_gid; + goto err_remove_gid_type; } ret = sysfs_create_group(&p->kobj, &p->pkey_group); @@ -576,6 +853,28 @@ err_free_pkey: kfree(p->pkey_group.attrs); p->pkey_group.attrs = NULL; +err_remove_gid_type: + sysfs_remove_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->type); + +err_free_gid_type: + for (i = 0; i < attr.gid_tbl_len; ++i) + kfree(p->gid_attr_group->type.attrs[i]); + + kfree(p->gid_attr_group->type.attrs); + p->gid_attr_group->type.attrs = NULL; + +err_remove_gid_ndev: + sysfs_remove_group(&p->gid_attr_group->kobj, + &p->gid_attr_group->ndev); + +err_free_gid_ndev: + for (i = 0; i < attr.gid_tbl_len; ++i) + kfree(p->gid_attr_group->ndev.attrs[i]); + + kfree(p->gid_attr_group->ndev.attrs); + p->gid_attr_group->ndev.attrs = NULL; + err_remove_gid: sysfs_remove_group(&p->kobj, &p->gid_group); @@ -587,7 +886,10 @@ err_free_gid: p->gid_group.attrs = NULL; err_remove_pma: - sysfs_remove_group(&p->kobj, &pma_group); + sysfs_remove_group(&p->kobj, p->pma_table); + +err_put_gid_attrs: + kobject_put(&p->gid_attr_group->kobj); err_put: kobject_put(&p->kobj); @@ -614,18 +916,12 @@ static ssize_t show_sys_image_guid(struct
device *device, struct device_attribute *dev_attr, char *buf) { struct ib_device *dev = container_of(device, struct ib_device, dev); - struct ib_device_attr attr; - ssize_t ret; - - ret = ib_query_device(dev, &attr); - if (ret) - return ret; return sprintf(buf, "%04x:%04x:%04x:%04x\n", - be16_to_cpu(((__be16 *) &attr.sys_image_guid)[0]), - be16_to_cpu(((__be16 *) &attr.sys_image_guid)[1]), - be16_to_cpu(((__be16 *) &attr.sys_image_guid)[2]), - be16_to_cpu(((__be16 *) &attr.sys_image_guid)[3])); + be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[0]), + be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[1]), + be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[2]), + be16_to_cpu(((__be16 *) &dev->attrs.sys_image_guid)[3])); } static ssize_t show_node_guid(struct device *device, @@ -800,9 +1096,14 @@ static void free_port_list_attributes(struct ib_device *device) list_for_each_entry_safe(p, t, &device->port_list, entry) { struct ib_port *port = container_of(p, struct ib_port, kobj); list_del(&p->entry); - sysfs_remove_group(p, &pma_group); + sysfs_remove_group(p, port->pma_table); sysfs_remove_group(p, &port->pkey_group); sysfs_remove_group(p, &port->gid_group); + sysfs_remove_group(&port->gid_attr_group->kobj, + &port->gid_attr_group->ndev); + sysfs_remove_group(&port->gid_attr_group->kobj, + &port->gid_attr_group->type); + kobject_put(&port->gid_attr_group->kobj); kobject_put(p); } diff --git a/drivers/infiniband/core/ud_header.c b/drivers/infiniband/core/ud_header.c index 72feee620ebf..19837d270278 100644 --- a/drivers/infiniband/core/ud_header.c +++ b/drivers/infiniband/core/ud_header.c @@ -35,6 +35,7 @@ #include <linux/string.h> #include <linux/export.h> #include <linux/if_ether.h> +#include <linux/ip.h> #include <rdma/ib_pack.h> @@ -116,6 +117,72 @@ static const struct ib_field vlan_table[] = { .size_bits = 16 } }; +static const struct ib_field ip4_table[] = { + { STRUCT_FIELD(ip4, ver), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 4 }, + { STRUCT_FIELD(ip4, hdr_len), + .offset_words = 0, + .offset_bits = 4, + .size_bits = 4 }, + { STRUCT_FIELD(ip4, tos), + .offset_words = 0, + .offset_bits = 8, + .size_bits = 8 }, + { STRUCT_FIELD(ip4, tot_len), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, id), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, frag_off), + .offset_words = 1, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, ttl), + .offset_words = 2, + .offset_bits = 0, + .size_bits = 8 }, + { STRUCT_FIELD(ip4, protocol), + .offset_words = 2, + .offset_bits = 8, + .size_bits = 8 }, + { STRUCT_FIELD(ip4, check), + .offset_words = 2, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(ip4, saddr), + .offset_words = 3, + .offset_bits = 0, + .size_bits = 32 }, + { STRUCT_FIELD(ip4, daddr), + .offset_words = 4, + .offset_bits = 0, + .size_bits = 32 } +}; + +static const struct ib_field udp_table[] = { + { STRUCT_FIELD(udp, sport), + .offset_words = 0, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(udp, dport), + .offset_words = 0, + .offset_bits = 16, + .size_bits = 16 }, + { STRUCT_FIELD(udp, length), + .offset_words = 1, + .offset_bits = 0, + .size_bits = 16 }, + { STRUCT_FIELD(udp, csum), + .offset_words = 1, + .offset_bits = 16, + .size_bits = 16 } +}; + static const struct ib_field grh_table[] = { { STRUCT_FIELD(grh, ip_version), .offset_words = 0, @@ -213,26 +280,57 @@ static const struct ib_field deth_table[] = { .size_bits = 24 } }; +__sum16 ib_ud_ip4_csum(struct 
ib_ud_header *header) +{ + struct iphdr iph; + + iph.ihl = 5; + iph.version = 4; + iph.tos = header->ip4.tos; + iph.tot_len = header->ip4.tot_len; + iph.id = header->ip4.id; + iph.frag_off = header->ip4.frag_off; + iph.ttl = header->ip4.ttl; + iph.protocol = header->ip4.protocol; + iph.check = 0; + iph.saddr = header->ip4.saddr; + iph.daddr = header->ip4.daddr; + + return ip_fast_csum((u8 *)&iph, iph.ihl); +} +EXPORT_SYMBOL(ib_ud_ip4_csum); + /** * ib_ud_header_init - Initialize UD header structure * @payload_bytes:Length of packet payload * @lrh_present: specify if LRH is present * @eth_present: specify if Eth header is present * @vlan_present: packet is tagged vlan - * @grh_present:GRH flag (if non-zero, GRH will be included) + * @grh_present: GRH flag (if non-zero, GRH will be included) + * @ip_version: if non-zero, IP header, V4 or V6, will be included + * @udp_present: if non-zero, UDP header will be included * @immediate_present: specify if immediate data is present * @header:Structure to initialize */ -void ib_ud_header_init(int payload_bytes, - int lrh_present, - int eth_present, - int vlan_present, - int grh_present, - int immediate_present, - struct ib_ud_header *header) +int ib_ud_header_init(int payload_bytes, + int lrh_present, + int eth_present, + int vlan_present, + int grh_present, + int ip_version, + int udp_present, + int immediate_present, + struct ib_ud_header *header) { + grh_present = grh_present && !ip_version; memset(header, 0, sizeof *header); + /* + * UDP header without IP header doesn't make sense + */ + if (udp_present && ip_version != 4 && ip_version != 6) + return -EINVAL; + if (lrh_present) { u16 packet_length; @@ -252,7 +350,7 @@ void ib_ud_header_init(int payload_bytes, if (vlan_present) header->eth.type = cpu_to_be16(ETH_P_8021Q); - if (grh_present) { + if (ip_version == 6 || grh_present) { header->grh.ip_version = 6; header->grh.payload_length = cpu_to_be16((IB_BTH_BYTES + @@ -260,8 +358,30 @@ void ib_ud_header_init(int payload_bytes, payload_bytes + 4 + /* ICRC */ 3) & ~3); /* round up */ - header->grh.next_header = 0x1b; + header->grh.next_header = udp_present ? IPPROTO_UDP : 0x1b; + } + + if (ip_version == 4) { + int udp_bytes = udp_present ?
IB_UDP_BYTES : 0; + + header->ip4.ver = 4; /* version 4 */ + header->ip4.hdr_len = 5; /* 5 words */ + header->ip4.tot_len = + cpu_to_be16(IB_IP4_BYTES + + udp_bytes + + IB_BTH_BYTES + + IB_DETH_BYTES + + payload_bytes + + 4); /* ICRC */ + header->ip4.protocol = IPPROTO_UDP; } + if (udp_present && ip_version) + header->udp.length = + cpu_to_be16(IB_UDP_BYTES + + IB_BTH_BYTES + + IB_DETH_BYTES + + payload_bytes + + 4); /* ICRC */ if (immediate_present) header->bth.opcode = IB_OPCODE_UD_SEND_ONLY_WITH_IMMEDIATE; @@ -273,8 +393,11 @@ void ib_ud_header_init(int payload_bytes, header->lrh_present = lrh_present; header->eth_present = eth_present; header->vlan_present = vlan_present; - header->grh_present = grh_present; + header->grh_present = grh_present || (ip_version == 6); + header->ipv4_present = ip_version == 4; + header->udp_present = udp_present; header->immediate_present = immediate_present; + return 0; } EXPORT_SYMBOL(ib_ud_header_init); @@ -311,6 +434,16 @@ int ib_ud_header_pack(struct ib_ud_header *header, &header->grh, buf + len); len += IB_GRH_BYTES; } + if (header->ipv4_present) { + ib_pack(ip4_table, ARRAY_SIZE(ip4_table), + &header->ip4, buf + len); + len += IB_IP4_BYTES; + } + if (header->udp_present) { + ib_pack(udp_table, ARRAY_SIZE(udp_table), + &header->udp, buf + len); + len += IB_UDP_BYTES; + } ib_pack(bth_table, ARRAY_SIZE(bth_table), &header->bth, buf + len); diff --git a/drivers/infiniband/core/umem_odp.c b/drivers/infiniband/core/umem_odp.c index 40becdb3196e..e69bf266049d 100644 --- a/drivers/infiniband/core/umem_odp.c +++ b/drivers/infiniband/core/umem_odp.c @@ -232,7 +232,7 @@ static void ib_umem_notifier_invalidate_range_end(struct mmu_notifier *mn, ib_ucontext_notifier_end_account(context); } -static struct mmu_notifier_ops ib_umem_notifiers = { +static const struct mmu_notifier_ops ib_umem_notifiers = { .release = ib_umem_notifier_release, .invalidate_page = ib_umem_notifier_invalidate_page, .invalidate_range_start = ib_umem_notifier_invalidate_range_start, diff --git a/drivers/infiniband/core/user_mad.c b/drivers/infiniband/core/user_mad.c index 57f281f8d686..415a3185cde7 100644 --- a/drivers/infiniband/core/user_mad.c +++ b/drivers/infiniband/core/user_mad.c @@ -210,6 +210,7 @@ static void send_handler(struct ib_mad_agent *agent, } static void recv_handler(struct ib_mad_agent *agent, + struct ib_mad_send_buf *send_buf, struct ib_mad_recv_wc *mad_recv_wc) { struct ib_umad_file *file = agent->context; diff --git a/drivers/infiniband/core/uverbs.h b/drivers/infiniband/core/uverbs.h index 94bbd8c155fc..612ccfd39bf9 100644 --- a/drivers/infiniband/core/uverbs.h +++ b/drivers/infiniband/core/uverbs.h @@ -204,6 +204,8 @@ void ib_uverbs_event_handler(struct ib_event_handler *handler, struct ib_event *event); void ib_uverbs_dealloc_xrcd(struct ib_uverbs_device *dev, struct ib_xrcd *xrcd); +int uverbs_dealloc_mw(struct ib_mw *mw); + struct ib_uverbs_flow_spec { union { union { diff --git a/drivers/infiniband/core/uverbs_cmd.c b/drivers/infiniband/core/uverbs_cmd.c index 1c02deab068f..6ffc9c4e93af 100644 --- a/drivers/infiniband/core/uverbs_cmd.c +++ b/drivers/infiniband/core/uverbs_cmd.c @@ -291,9 +291,6 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file *file, struct ib_uverbs_get_context cmd; struct ib_uverbs_get_context_resp resp; struct ib_udata udata; -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - struct ib_device_attr dev_attr; -#endif struct ib_ucontext *ucontext; struct file *filp; int ret; @@ -342,10 +339,7 @@ ssize_t ib_uverbs_get_context(struct ib_uverbs_file 
*file, ucontext->odp_mrs_count = 0; INIT_LIST_HEAD(&ucontext->no_private_counters); - ret = ib_query_device(ib_dev, &dev_attr); - if (ret) - goto err_free; - if (!(dev_attr.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) + if (!(ib_dev->attrs.device_cap_flags & IB_DEVICE_ON_DEMAND_PAGING)) ucontext->invalidate_range = NULL; #endif @@ -447,8 +441,6 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, { struct ib_uverbs_query_device cmd; struct ib_uverbs_query_device_resp resp; - struct ib_device_attr attr; - int ret; if (out_len < sizeof resp) return -ENOSPC; @@ -456,12 +448,8 @@ ssize_t ib_uverbs_query_device(struct ib_uverbs_file *file, if (copy_from_user(&cmd, buf, sizeof cmd)) return -EFAULT; - ret = ib_query_device(ib_dev, &attr); - if (ret) - return ret; - memset(&resp, 0, sizeof resp); - copy_query_dev_fields(file, ib_dev, &resp, &attr); + copy_query_dev_fields(file, ib_dev, &resp, &ib_dev->attrs); if (copy_to_user((void __user *) (unsigned long) cmd.response, &resp, sizeof resp)) @@ -986,11 +974,8 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, } if (cmd.access_flags & IB_ACCESS_ON_DEMAND) { - struct ib_device_attr attr; - - ret = ib_query_device(pd->device, &attr); - if (ret || !(attr.device_cap_flags & - IB_DEVICE_ON_DEMAND_PAGING)) { + if (!(pd->device->attrs.device_cap_flags & + IB_DEVICE_ON_DEMAND_PAGING)) { pr_debug("ODP support not available\n"); ret = -EINVAL; goto err_put; @@ -1008,7 +993,6 @@ ssize_t ib_uverbs_reg_mr(struct ib_uverbs_file *file, mr->pd = pd; mr->uobject = uobj; atomic_inc(&pd->usecnt); - atomic_set(&mr->usecnt, 0); uobj->object = mr; ret = idr_add_uobj(&ib_uverbs_mr_idr, uobj); @@ -1106,11 +1090,6 @@ ssize_t ib_uverbs_rereg_mr(struct ib_uverbs_file *file, } } - if (atomic_read(&mr->usecnt)) { - ret = -EBUSY; - goto put_uobj_pd; - } - old_pd = mr->pd; ret = mr->device->rereg_user_mr(mr, cmd.flags, cmd.start, cmd.length, cmd.hca_va, @@ -1258,7 +1237,7 @@ err_copy: idr_remove_uobj(&ib_uverbs_mw_idr, uobj); err_unalloc: - ib_dealloc_mw(mw); + uverbs_dealloc_mw(mw); err_put: put_pd_read(pd); @@ -1287,7 +1266,7 @@ ssize_t ib_uverbs_dealloc_mw(struct ib_uverbs_file *file, mw = uobj->object; - ret = ib_dealloc_mw(mw); + ret = uverbs_dealloc_mw(mw); if (!ret) uobj->live = 0; @@ -1845,7 +1824,10 @@ static int create_qp(struct ib_uverbs_file *file, sizeof(cmd->create_flags)) attr.create_flags = cmd->create_flags; - if (attr.create_flags & ~IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK) { + if (attr.create_flags & ~(IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK | + IB_QP_CREATE_CROSS_CHANNEL | + IB_QP_CREATE_MANAGED_SEND | + IB_QP_CREATE_MANAGED_RECV)) { ret = -EINVAL; goto err_put; } diff --git a/drivers/infiniband/core/uverbs_main.c b/drivers/infiniband/core/uverbs_main.c index e3ef28861be6..39680aed99dd 100644 --- a/drivers/infiniband/core/uverbs_main.c +++ b/drivers/infiniband/core/uverbs_main.c @@ -133,6 +133,17 @@ static int (*uverbs_ex_cmd_table[])(struct ib_uverbs_file *file, static void ib_uverbs_add_one(struct ib_device *device); static void ib_uverbs_remove_one(struct ib_device *device, void *client_data); +int uverbs_dealloc_mw(struct ib_mw *mw) +{ + struct ib_pd *pd = mw->pd; + int ret; + + ret = mw->device->dealloc_mw(mw); + if (!ret) + atomic_dec(&pd->usecnt); + return ret; +} + static void ib_uverbs_release_dev(struct kobject *kobj) { struct ib_uverbs_device *dev = @@ -224,7 +235,7 @@ static int ib_uverbs_cleanup_ucontext(struct ib_uverbs_file *file, struct ib_mw *mw = uobj->object; idr_remove_uobj(&ib_uverbs_mw_idr, uobj); - ib_dealloc_mw(mw); + 
uverbs_dealloc_mw(mw); kfree(uobj); } diff --git a/drivers/infiniband/core/uverbs_marshall.c b/drivers/infiniband/core/uverbs_marshall.c index 7d2f14c9bbef..af020f80d50f 100644 --- a/drivers/infiniband/core/uverbs_marshall.c +++ b/drivers/infiniband/core/uverbs_marshall.c @@ -144,5 +144,6 @@ void ib_copy_path_rec_from_user(struct ib_sa_path_rec *dst, memset(dst->dmac, 0, sizeof(dst->dmac)); dst->net = NULL; dst->ifindex = 0; + dst->gid_type = IB_GID_TYPE_IB; } EXPORT_SYMBOL(ib_copy_path_rec_from_user); diff --git a/drivers/infiniband/core/verbs.c b/drivers/infiniband/core/verbs.c index 545906dec26d..5af6d024e053 100644 --- a/drivers/infiniband/core/verbs.c +++ b/drivers/infiniband/core/verbs.c @@ -229,12 +229,6 @@ EXPORT_SYMBOL(rdma_port_get_link_layer); struct ib_pd *ib_alloc_pd(struct ib_device *device) { struct ib_pd *pd; - struct ib_device_attr devattr; - int rc; - - rc = ib_query_device(device, &devattr); - if (rc) - return ERR_PTR(rc); pd = device->alloc_pd(device, NULL, NULL); if (IS_ERR(pd)) @@ -245,7 +239,7 @@ struct ib_pd *ib_alloc_pd(struct ib_device *device) pd->local_mr = NULL; atomic_set(&pd->usecnt, 0); - if (devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) + if (device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY) pd->local_dma_lkey = device->local_dma_lkey; else { struct ib_mr *mr; @@ -311,8 +305,61 @@ struct ib_ah *ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) } EXPORT_SYMBOL(ib_create_ah); +static int ib_get_header_version(const union rdma_network_hdr *hdr) +{ + const struct iphdr *ip4h = (struct iphdr *)&hdr->roce4grh; + struct iphdr ip4h_checked; + const struct ipv6hdr *ip6h = (struct ipv6hdr *)&hdr->ibgrh; + + /* If it's IPv6, the version must be 6, otherwise, the first + * 20 bytes (before the IPv4 header) are garbled. + */ + if (ip6h->version != 6) + return (ip4h->version == 4) ? 4 : 0; + /* version may be 6 or 4 because the first 20 bytes could be garbled */ + + /* RoCE v2 requires no options, thus header length + * must be 5 words + */ + if (ip4h->ihl != 5) + return 6; + + /* Verify checksum. + * We can't write on scattered buffers so we need to copy to + * temp buffer. 
+ */ + memcpy(&ip4h_checked, ip4h, sizeof(ip4h_checked)); + ip4h_checked.check = 0; + ip4h_checked.check = ip_fast_csum((u8 *)&ip4h_checked, 5); + /* if IPv4 header checksum is OK, believe it */ + if (ip4h->check == ip4h_checked.check) + return 4; + return 6; +} + +static enum rdma_network_type ib_get_net_type_by_grh(struct ib_device *device, + u8 port_num, + const struct ib_grh *grh) +{ + int grh_version; + + if (rdma_protocol_ib(device, port_num)) + return RDMA_NETWORK_IB; + + grh_version = ib_get_header_version((union rdma_network_hdr *)grh); + + if (grh_version == 4) + return RDMA_NETWORK_IPV4; + + if (grh->next_hdr == IPPROTO_UDP) + return RDMA_NETWORK_IPV6; + + return RDMA_NETWORK_ROCE_V1; +} + struct find_gid_index_context { u16 vlan_id; + enum ib_gid_type gid_type; }; static bool find_gid_index(const union ib_gid *gid, @@ -322,6 +369,9 @@ static bool find_gid_index(const union ib_gid *gid, struct find_gid_index_context *ctx = (struct find_gid_index_context *)context; + if (ctx->gid_type != gid_attr->gid_type) + return false; + if ((!!(ctx->vlan_id != 0xffff) == !is_vlan_dev(gid_attr->ndev)) || (is_vlan_dev(gid_attr->ndev) && vlan_dev_vlan_id(gid_attr->ndev) != ctx->vlan_id)) @@ -332,14 +382,49 @@ static bool find_gid_index(const union ib_gid *gid, static int get_sgid_index_from_eth(struct ib_device *device, u8 port_num, u16 vlan_id, const union ib_gid *sgid, + enum ib_gid_type gid_type, u16 *gid_index) { - struct find_gid_index_context context = {.vlan_id = vlan_id}; + struct find_gid_index_context context = {.vlan_id = vlan_id, + .gid_type = gid_type}; return ib_find_gid_by_filter(device, sgid, port_num, find_gid_index, &context, gid_index); } +static int get_gids_from_rdma_hdr(union rdma_network_hdr *hdr, + enum rdma_network_type net_type, + union ib_gid *sgid, union ib_gid *dgid) +{ + struct sockaddr_in src_in; + struct sockaddr_in dst_in; + __be32 src_saddr, dst_saddr; + + if (!sgid || !dgid) + return -EINVAL; + + if (net_type == RDMA_NETWORK_IPV4) { + memcpy(&src_in.sin_addr.s_addr, + &hdr->roce4grh.saddr, 4); + memcpy(&dst_in.sin_addr.s_addr, + &hdr->roce4grh.daddr, 4); + src_saddr = src_in.sin_addr.s_addr; + dst_saddr = dst_in.sin_addr.s_addr; + ipv6_addr_set_v4mapped(src_saddr, + (struct in6_addr *)sgid); + ipv6_addr_set_v4mapped(dst_saddr, + (struct in6_addr *)dgid); + return 0; + } else if (net_type == RDMA_NETWORK_IPV6 || + net_type == RDMA_NETWORK_IB) { + *dgid = hdr->ibgrh.dgid; + *sgid = hdr->ibgrh.sgid; + return 0; + } else { + return -EINVAL; + } +} + int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, const struct ib_wc *wc, const struct ib_grh *grh, struct ib_ah_attr *ah_attr) @@ -347,33 +432,72 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, u32 flow_class; u16 gid_index; int ret; + enum rdma_network_type net_type = RDMA_NETWORK_IB; + enum ib_gid_type gid_type = IB_GID_TYPE_IB; + int hoplimit = 0xff; + union ib_gid dgid; + union ib_gid sgid; memset(ah_attr, 0, sizeof *ah_attr); if (rdma_cap_eth_ah(device, port_num)) { + if (wc->wc_flags & IB_WC_WITH_NETWORK_HDR_TYPE) + net_type = wc->network_hdr_type; + else + net_type = ib_get_net_type_by_grh(device, port_num, grh); + gid_type = ib_network_to_gid_type(net_type); + } + ret = get_gids_from_rdma_hdr((union rdma_network_hdr *)grh, net_type, + &sgid, &dgid); + if (ret) + return ret; + + if (rdma_protocol_roce(device, port_num)) { + int if_index = 0; u16 vlan_id = wc->wc_flags & IB_WC_WITH_VLAN ? 
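ib_get_header_version() above distinguishes RoCE v2/IPv4 from the other header formats partly by recomputing the IPv4 header checksum over a copy, the same check ip_fast_csum() performs. A minimal userspace stand-in for that verification step (the sample header bytes are a hypothetical example):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* one's-complement sum over ihl 32-bit words, like ip_fast_csum() */
static uint16_t ip_csum(const void *hdr, int ihl)
{
	const uint16_t *p = hdr;
	uint32_t sum = 0;
	int i;

	for (i = 0; i < ihl * 2; i++)
		sum += p[i];
	while (sum >> 16)
		sum = (sum & 0xffff) + (sum >> 16);
	return (uint16_t)~sum;
}

int main(void)
{
	uint8_t h[20] = {                /* hypothetical IPv4 header, ihl = 5 */
		0x45, 0x00, 0x00, 0x54, 0x00, 0x00, 0x40, 0x00,
		0x40, 0x11, 0x00, 0x00, 0xc0, 0x00, 0x02, 0x01,
		0xc0, 0x00, 0x02, 0x02
	};
	uint16_t c = ip_csum(h, 5);      /* checksum field starts zeroed */

	memcpy(&h[10], &c, 2);           /* patch the checksum in */
	printf("0x%04x\n", ip_csum(h, 5)); /* 0x0000: the header verifies */
	return 0;
}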
wc->vlan_id : 0xffff; + struct net_device *idev; + struct net_device *resolved_dev; if (!(wc->wc_flags & IB_WC_GRH)) return -EPROTOTYPE; - if (!(wc->wc_flags & IB_WC_WITH_SMAC) || - !(wc->wc_flags & IB_WC_WITH_VLAN)) { - ret = rdma_addr_find_dmac_by_grh(&grh->dgid, &grh->sgid, - ah_attr->dmac, - wc->wc_flags & IB_WC_WITH_VLAN ? - NULL : &vlan_id, - 0); - if (ret) - return ret; + if (!device->get_netdev) + return -EOPNOTSUPP; + + idev = device->get_netdev(device, port_num); + if (!idev) + return -ENODEV; + + ret = rdma_addr_find_l2_eth_by_grh(&dgid, &sgid, + ah_attr->dmac, + wc->wc_flags & IB_WC_WITH_VLAN ? + NULL : &vlan_id, + &if_index, &hoplimit); + if (ret) { + dev_put(idev); + return ret; } - ret = get_sgid_index_from_eth(device, port_num, vlan_id, - &grh->dgid, &gid_index); + resolved_dev = dev_get_by_index(&init_net, if_index); + if (resolved_dev->flags & IFF_LOOPBACK) { + dev_put(resolved_dev); + resolved_dev = idev; + dev_hold(resolved_dev); + } + rcu_read_lock(); + if (resolved_dev != idev && !rdma_is_upper_dev_rcu(idev, + resolved_dev)) + ret = -EHOSTUNREACH; + rcu_read_unlock(); + dev_put(idev); + dev_put(resolved_dev); if (ret) return ret; - if (wc->wc_flags & IB_WC_WITH_SMAC) - memcpy(ah_attr->dmac, wc->smac, ETH_ALEN); + ret = get_sgid_index_from_eth(device, port_num, vlan_id, + &dgid, gid_type, &gid_index); + if (ret) + return ret; } ah_attr->dlid = wc->slid; @@ -383,10 +507,11 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, if (wc->wc_flags & IB_WC_GRH) { ah_attr->ah_flags = IB_AH_GRH; - ah_attr->grh.dgid = grh->sgid; + ah_attr->grh.dgid = sgid; if (!rdma_cap_eth_ah(device, port_num)) { - ret = ib_find_cached_gid_by_port(device, &grh->dgid, + ret = ib_find_cached_gid_by_port(device, &dgid, + IB_GID_TYPE_IB, port_num, NULL, &gid_index); if (ret) @@ -396,7 +521,7 @@ int ib_init_ah_from_wc(struct ib_device *device, u8 port_num, ah_attr->grh.sgid_index = (u8) gid_index; flow_class = be32_to_cpu(grh->version_tclass_flow); ah_attr->grh.flow_label = flow_class & 0xFFFFF; - ah_attr->grh.hop_limit = 0xFF; + ah_attr->grh.hop_limit = hoplimit; ah_attr->grh.traffic_class = (flow_class >> 20) & 0xFF; } return 0; @@ -1014,6 +1139,7 @@ int ib_resolve_eth_dmac(struct ib_qp *qp, union ib_gid sgid; struct ib_gid_attr sgid_attr; int ifindex; + int hop_limit; ret = ib_query_gid(qp->device, qp_attr->ah_attr.port_num, @@ -1028,12 +1154,14 @@ int ib_resolve_eth_dmac(struct ib_qp *qp, ifindex = sgid_attr.ndev->ifindex; - ret = rdma_addr_find_dmac_by_grh(&sgid, - &qp_attr->ah_attr.grh.dgid, - qp_attr->ah_attr.dmac, - NULL, ifindex); + ret = rdma_addr_find_l2_eth_by_grh(&sgid, + &qp_attr->ah_attr.grh.dgid, + qp_attr->ah_attr.dmac, + NULL, &ifindex, &hop_limit); dev_put(sgid_attr.ndev); + + qp_attr->ah_attr.grh.hop_limit = hop_limit; } } out: @@ -1215,29 +1343,17 @@ struct ib_mr *ib_get_dma_mr(struct ib_pd *pd, int mr_access_flags) mr->pd = pd; mr->uobject = NULL; atomic_inc(&pd->usecnt); - atomic_set(&mr->usecnt, 0); } return mr; } EXPORT_SYMBOL(ib_get_dma_mr); -int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr) -{ - return mr->device->query_mr ? 
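get_gids_from_rdma_hdr() in the hunk above stores RoCE v2/IPv4 addresses as v4-mapped IPv6 addresses, so one 16-byte GID format serves every network type. The mapping itself, sketched in userspace with standard socket headers only (treating the GID as 16 raw bytes is the assumption here):

#include <stdio.h>
#include <string.h>
#include <arpa/inet.h>

int main(void)
{
	unsigned char gid[16] = {0};  /* union ib_gid: 16 raw bytes */
	struct in_addr v4;
	char buf[INET6_ADDRSTRLEN];

	inet_pton(AF_INET, "192.0.2.1", &v4);
	gid[10] = 0xff;               /* the ::ffff: prefix */
	gid[11] = 0xff;
	memcpy(&gid[12], &v4, 4);     /* IPv4 address in the last 4 bytes */

	inet_ntop(AF_INET6, gid, buf, sizeof(buf));
	printf("%s\n", buf);          /* ::ffff:192.0.2.1 */
	return 0;
}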
- mr->device->query_mr(mr, mr_attr) : -ENOSYS; -} -EXPORT_SYMBOL(ib_query_mr); - int ib_dereg_mr(struct ib_mr *mr) { - struct ib_pd *pd; + struct ib_pd *pd = mr->pd; int ret; - if (atomic_read(&mr->usecnt)) - return -EBUSY; - - pd = mr->pd; ret = mr->device->dereg_mr(mr); if (!ret) atomic_dec(&pd->usecnt); @@ -1273,49 +1389,12 @@ struct ib_mr *ib_alloc_mr(struct ib_pd *pd, mr->pd = pd; mr->uobject = NULL; atomic_inc(&pd->usecnt); - atomic_set(&mr->usecnt, 0); } return mr; } EXPORT_SYMBOL(ib_alloc_mr); -/* Memory windows */ - -struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) -{ - struct ib_mw *mw; - - if (!pd->device->alloc_mw) - return ERR_PTR(-ENOSYS); - - mw = pd->device->alloc_mw(pd, type); - if (!IS_ERR(mw)) { - mw->device = pd->device; - mw->pd = pd; - mw->uobject = NULL; - mw->type = type; - atomic_inc(&pd->usecnt); - } - - return mw; -} -EXPORT_SYMBOL(ib_alloc_mw); - -int ib_dealloc_mw(struct ib_mw *mw) -{ - struct ib_pd *pd; - int ret; - - pd = mw->pd; - ret = mw->device->dealloc_mw(mw); - if (!ret) - atomic_dec(&pd->usecnt); - - return ret; -} -EXPORT_SYMBOL(ib_dealloc_mw); - /* "Fast" memory regions */ struct ib_fmr *ib_alloc_fmr(struct ib_pd *pd, @@ -1530,7 +1609,7 @@ int ib_sg_to_pages(struct ib_mr *mr, int (*set_page)(struct ib_mr *, u64)) { struct scatterlist *sg; - u64 last_end_dma_addr = 0, last_page_addr = 0; + u64 last_end_dma_addr = 0; unsigned int last_page_off = 0; u64 page_mask = ~((u64)mr->page_size - 1); int i, ret; @@ -1572,7 +1651,6 @@ next_page: mr->length += dma_len; last_end_dma_addr = end_dma_addr; - last_page_addr = end_dma_addr & page_mask; last_page_off = end_dma_addr & ~page_mask; } diff --git a/drivers/infiniband/hw/cxgb3/iwch_cm.c b/drivers/infiniband/hw/cxgb3/iwch_cm.c index cb78b1e9bcd9..f504ba73e5dc 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_cm.c +++ b/drivers/infiniband/hw/cxgb3/iwch_cm.c @@ -149,7 +149,7 @@ static int iwch_l2t_send(struct t3cdev *tdev, struct sk_buff *skb, struct l2t_en error = l2t_send(tdev, skb, l2e); if (error < 0) kfree_skb(skb); - return error; + return error < 0 ? error : 0; } int iwch_cxgb3_ofld_send(struct t3cdev *tdev, struct sk_buff *skb) @@ -165,7 +165,7 @@ int iwch_cxgb3_ofld_send(struct t3cdev *tdev, struct sk_buff *skb) error = cxgb3_ofld_send(tdev, skb); if (error < 0) kfree_skb(skb); - return error; + return error < 0 ? error : 0; } static void release_tid(struct t3cdev *tdev, u32 hwtid, struct sk_buff *skb) diff --git a/drivers/infiniband/hw/cxgb3/iwch_cq.c b/drivers/infiniband/hw/cxgb3/iwch_cq.c index cfe404925a39..97fbfd2c298e 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_cq.c +++ b/drivers/infiniband/hw/cxgb3/iwch_cq.c @@ -115,10 +115,6 @@ static int iwch_poll_cq_one(struct iwch_dev *rhp, struct iwch_cq *chp, case T3_SEND_WITH_SE_INV: wc->opcode = IB_WC_SEND; break; - case T3_BIND_MW: - wc->opcode = IB_WC_BIND_MW; - break; - case T3_LOCAL_INV: wc->opcode = IB_WC_LOCAL_INV; break; diff --git a/drivers/infiniband/hw/cxgb3/iwch_mem.c b/drivers/infiniband/hw/cxgb3/iwch_mem.c index 5c36ee2809ac..1d04c872c9d5 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_mem.c +++ b/drivers/infiniband/hw/cxgb3/iwch_mem.c @@ -75,37 +75,6 @@ int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, return ret; } -int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php, - struct iwch_mr *mhp, - int shift, - int npages) -{ - u32 stag; - int ret; - - /* We could support this... 
*/ - if (npages > mhp->attr.pbl_size) - return -ENOMEM; - - stag = mhp->attr.stag; - if (cxio_reregister_phys_mem(&rhp->rdev, - &stag, mhp->attr.pdid, - mhp->attr.perms, - mhp->attr.zbva, - mhp->attr.va_fbo, - mhp->attr.len, - shift - 12, - mhp->attr.pbl_size, mhp->attr.pbl_addr)) - return -ENOMEM; - - ret = iwch_finish_mem_reg(mhp, stag); - if (ret) - cxio_dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, - mhp->attr.pbl_addr); - - return ret; -} - int iwch_alloc_pbl(struct iwch_mr *mhp, int npages) { mhp->attr.pbl_addr = cxio_hal_pblpool_alloc(&mhp->rhp->rdev, @@ -130,74 +99,3 @@ int iwch_write_pbl(struct iwch_mr *mhp, __be64 *pages, int npages, int offset) return cxio_write_pbl(&mhp->rhp->rdev, pages, mhp->attr.pbl_addr + (offset << 3), npages); } - -int build_phys_page_list(struct ib_phys_buf *buffer_list, - int num_phys_buf, - u64 *iova_start, - u64 *total_size, - int *npages, - int *shift, - __be64 **page_list) -{ - u64 mask; - int i, j, n; - - mask = 0; - *total_size = 0; - for (i = 0; i < num_phys_buf; ++i) { - if (i != 0 && buffer_list[i].addr & ~PAGE_MASK) - return -EINVAL; - if (i != 0 && i != num_phys_buf - 1 && - (buffer_list[i].size & ~PAGE_MASK)) - return -EINVAL; - *total_size += buffer_list[i].size; - if (i > 0) - mask |= buffer_list[i].addr; - else - mask |= buffer_list[i].addr & PAGE_MASK; - if (i != num_phys_buf - 1) - mask |= buffer_list[i].addr + buffer_list[i].size; - else - mask |= (buffer_list[i].addr + buffer_list[i].size + - PAGE_SIZE - 1) & PAGE_MASK; - } - - if (*total_size > 0xFFFFFFFFULL) - return -ENOMEM; - - /* Find largest page shift we can use to cover buffers */ - for (*shift = PAGE_SHIFT; *shift < 27; ++(*shift)) - if ((1ULL << *shift) & mask) - break; - - buffer_list[0].size += buffer_list[0].addr & ((1ULL << *shift) - 1); - buffer_list[0].addr &= ~0ull << *shift; - - *npages = 0; - for (i = 0; i < num_phys_buf; ++i) - *npages += (buffer_list[i].size + - (1ULL << *shift) - 1) >> *shift; - - if (!*npages) - return -EINVAL; - - *page_list = kmalloc(sizeof(u64) * *npages, GFP_KERNEL); - if (!*page_list) - return -ENOMEM; - - n = 0; - for (i = 0; i < num_phys_buf; ++i) - for (j = 0; - j < (buffer_list[i].size + (1ULL << *shift) - 1) >> *shift; - ++j) - (*page_list)[n++] = cpu_to_be64(buffer_list[i].addr + - ((u64) j << *shift)); - - PDBG("%s va 0x%llx mask 0x%llx shift %d len %lld pbl_size %d\n", - __func__, (unsigned long long) *iova_start, - (unsigned long long) mask, *shift, (unsigned long long) *total_size, - *npages); - - return 0; - -} diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.c b/drivers/infiniband/hw/cxgb3/iwch_provider.c index c34725ca0bb4..2734820d291b 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.c +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.c @@ -458,9 +458,6 @@ static int iwch_dereg_mr(struct ib_mr *ib_mr) u32 mmid; PDBG("%s ib_mr %p\n", __func__, ib_mr); - /* There can be no memory windows */ - if (atomic_read(&ib_mr->usecnt)) - return -EINVAL; mhp = to_iwch_mr(ib_mr); kfree(mhp->pages); @@ -479,24 +476,25 @@ static int iwch_dereg_mr(struct ib_mr *ib_mr) return 0; } -static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd, - struct ib_phys_buf *buffer_list, - int num_phys_buf, - int acc, - u64 *iova_start) +static struct ib_mr *iwch_get_dma_mr(struct ib_pd *pd, int acc) { - __be64 *page_list; - int shift; - u64 total_size; - int npages; - struct iwch_dev *rhp; - struct iwch_pd *php; + const u64 total_size = 0xffffffff; + const u64 mask = (total_size + PAGE_SIZE - 1) & PAGE_MASK; + struct iwch_pd *php = 
to_iwch_pd(pd); + struct iwch_dev *rhp = php->rhp; struct iwch_mr *mhp; - int ret; + __be64 *page_list; + int shift = 26, npages, ret, i; PDBG("%s ib_pd %p\n", __func__, pd); - php = to_iwch_pd(pd); - rhp = php->rhp; + + /* + * T3 only supports 32 bits of size. + */ + if (sizeof(phys_addr_t) > 4) { + pr_warn_once(MOD "Cannot support dma_mrs on this platform.\n"); + return ERR_PTR(-ENOTSUPP); + } mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); if (!mhp) @@ -504,22 +502,23 @@ static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd, mhp->rhp = rhp; - /* First check that we have enough alignment */ - if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK)) { + npages = (total_size + (1ULL << shift) - 1) >> shift; + if (!npages) { ret = -EINVAL; goto err; } - if (num_phys_buf > 1 && - ((buffer_list[0].addr + buffer_list[0].size) & ~PAGE_MASK)) { - ret = -EINVAL; + page_list = kmalloc_array(npages, sizeof(u64), GFP_KERNEL); + if (!page_list) { + ret = -ENOMEM; goto err; } - ret = build_phys_page_list(buffer_list, num_phys_buf, iova_start, - &total_size, &npages, &shift, &page_list); - if (ret) - goto err; + for (i = 0; i < npages; i++) + page_list[i] = cpu_to_be64((u64)i << shift); + + PDBG("%s mask 0x%llx shift %d len %lld pbl_size %d\n", + __func__, mask, shift, total_size, npages); ret = iwch_alloc_pbl(mhp, npages); if (ret) { @@ -536,7 +535,7 @@ static struct ib_mr *iwch_register_phys_mem(struct ib_pd *pd, mhp->attr.zbva = 0; mhp->attr.perms = iwch_ib_to_tpt_access(acc); - mhp->attr.va_fbo = *iova_start; + mhp->attr.va_fbo = 0; mhp->attr.page_size = shift - 12; mhp->attr.len = (u32) total_size; @@ -553,76 +552,8 @@ err_pbl: err: kfree(mhp); return ERR_PTR(ret); - -} - -static int iwch_reregister_phys_mem(struct ib_mr *mr, - int mr_rereg_mask, - struct ib_pd *pd, - struct ib_phys_buf *buffer_list, - int num_phys_buf, - int acc, u64 * iova_start) -{ - - struct iwch_mr mh, *mhp; - struct iwch_pd *php; - struct iwch_dev *rhp; - __be64 *page_list = NULL; - int shift = 0; - u64 total_size; - int npages = 0; - int ret; - - PDBG("%s ib_mr %p ib_pd %p\n", __func__, mr, pd); - - /* There can be no memory windows */ - if (atomic_read(&mr->usecnt)) - return -EINVAL; - - mhp = to_iwch_mr(mr); - rhp = mhp->rhp; - php = to_iwch_pd(mr->pd); - - /* make sure we are on the same adapter */ - if (rhp != php->rhp) - return -EINVAL; - - memcpy(&mh, mhp, sizeof *mhp); - - if (mr_rereg_mask & IB_MR_REREG_PD) - php = to_iwch_pd(pd); - if (mr_rereg_mask & IB_MR_REREG_ACCESS) - mh.attr.perms = iwch_ib_to_tpt_access(acc); - if (mr_rereg_mask & IB_MR_REREG_TRANS) { - ret = build_phys_page_list(buffer_list, num_phys_buf, - iova_start, - &total_size, &npages, - &shift, &page_list); - if (ret) - return ret; - } - - ret = iwch_reregister_mem(rhp, php, &mh, shift, npages); - kfree(page_list); - if (ret) { - return ret; - } - if (mr_rereg_mask & IB_MR_REREG_PD) - mhp->attr.pdid = php->pdid; - if (mr_rereg_mask & IB_MR_REREG_ACCESS) - mhp->attr.perms = iwch_ib_to_tpt_access(acc); - if (mr_rereg_mask & IB_MR_REREG_TRANS) { - mhp->attr.zbva = 0; - mhp->attr.va_fbo = *iova_start; - mhp->attr.page_size = shift - 12; - mhp->attr.len = (u32) total_size; - mhp->attr.pbl_size = npages; - } - - return 0; } - static struct ib_mr *iwch_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt, int acc, struct ib_udata *udata) { @@ -726,28 +657,6 @@ err: return ERR_PTR(err); } -static struct ib_mr *iwch_get_dma_mr(struct ib_pd *pd, int acc) -{ - struct ib_phys_buf bl; - u64 kva; - struct ib_mr *ibmr; - - PDBG("%s ib_pd 
%p\n", __func__, pd); - - /* - * T3 only supports 32 bits of size. - */ - if (sizeof(phys_addr_t) > 4) { - pr_warn_once(MOD "Cannot support dma_mrs on this platform.\n"); - return ERR_PTR(-ENOTSUPP); - } - bl.size = 0xffffffff; - bl.addr = 0; - kva = 0; - ibmr = iwch_register_phys_mem(pd, &bl, 1, acc, &kva); - return ibmr; -} - static struct ib_mw *iwch_alloc_mw(struct ib_pd *pd, enum ib_mw_type type) { struct iwch_dev *rhp; @@ -1452,12 +1361,9 @@ int iwch_register_device(struct iwch_dev *dev) dev->ibdev.resize_cq = iwch_resize_cq; dev->ibdev.poll_cq = iwch_poll_cq; dev->ibdev.get_dma_mr = iwch_get_dma_mr; - dev->ibdev.reg_phys_mr = iwch_register_phys_mem; - dev->ibdev.rereg_phys_mr = iwch_reregister_phys_mem; dev->ibdev.reg_user_mr = iwch_reg_user_mr; dev->ibdev.dereg_mr = iwch_dereg_mr; dev->ibdev.alloc_mw = iwch_alloc_mw; - dev->ibdev.bind_mw = iwch_bind_mw; dev->ibdev.dealloc_mw = iwch_dealloc_mw; dev->ibdev.alloc_mr = iwch_alloc_mr; dev->ibdev.map_mr_sg = iwch_map_mr_sg; diff --git a/drivers/infiniband/hw/cxgb3/iwch_provider.h b/drivers/infiniband/hw/cxgb3/iwch_provider.h index 2ac85b86a680..252c464a09f6 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_provider.h +++ b/drivers/infiniband/hw/cxgb3/iwch_provider.h @@ -330,9 +330,6 @@ int iwch_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, struct ib_send_wr **bad_wr); int iwch_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, struct ib_recv_wr **bad_wr); -int iwch_bind_mw(struct ib_qp *qp, - struct ib_mw *mw, - struct ib_mw_bind *mw_bind); int iwch_poll_cq(struct ib_cq *ibcq, int num_entries, struct ib_wc *wc); int iwch_post_terminate(struct iwch_qp *qhp, struct respQ_msg_t *rsp_msg); int iwch_post_zb_read(struct iwch_ep *ep); @@ -341,21 +338,9 @@ void iwch_unregister_device(struct iwch_dev *dev); void stop_read_rep_timer(struct iwch_qp *qhp); int iwch_register_mem(struct iwch_dev *rhp, struct iwch_pd *php, struct iwch_mr *mhp, int shift); -int iwch_reregister_mem(struct iwch_dev *rhp, struct iwch_pd *php, - struct iwch_mr *mhp, - int shift, - int npages); int iwch_alloc_pbl(struct iwch_mr *mhp, int npages); void iwch_free_pbl(struct iwch_mr *mhp); int iwch_write_pbl(struct iwch_mr *mhp, __be64 *pages, int npages, int offset); -int build_phys_page_list(struct ib_phys_buf *buffer_list, - int num_phys_buf, - u64 *iova_start, - u64 *total_size, - int *npages, - int *shift, - __be64 **page_list); - #define IWCH_NODE_DESC "cxgb3 Chelsio Communications" diff --git a/drivers/infiniband/hw/cxgb3/iwch_qp.c b/drivers/infiniband/hw/cxgb3/iwch_qp.c index d0548fc6395e..d939980a708f 100644 --- a/drivers/infiniband/hw/cxgb3/iwch_qp.c +++ b/drivers/infiniband/hw/cxgb3/iwch_qp.c @@ -526,88 +526,6 @@ out: return err; } -int iwch_bind_mw(struct ib_qp *qp, - struct ib_mw *mw, - struct ib_mw_bind *mw_bind) -{ - struct iwch_dev *rhp; - struct iwch_mw *mhp; - struct iwch_qp *qhp; - union t3_wr *wqe; - u32 pbl_addr; - u8 page_size; - u32 num_wrs; - unsigned long flag; - struct ib_sge sgl; - int err=0; - enum t3_wr_flags t3_wr_flags; - u32 idx; - struct t3_swsq *sqp; - - qhp = to_iwch_qp(qp); - mhp = to_iwch_mw(mw); - rhp = qhp->rhp; - - spin_lock_irqsave(&qhp->lock, flag); - if (qhp->attr.state > IWCH_QP_STATE_RTS) { - spin_unlock_irqrestore(&qhp->lock, flag); - return -EINVAL; - } - num_wrs = Q_FREECNT(qhp->wq.sq_rptr, qhp->wq.sq_wptr, - qhp->wq.sq_size_log2); - if (num_wrs == 0) { - spin_unlock_irqrestore(&qhp->lock, flag); - return -ENOMEM; - } - idx = Q_PTR2IDX(qhp->wq.wptr, qhp->wq.size_log2); - PDBG("%s: idx 0x%0x, mw 0x%p, mw_bind 0x%p\n", 
__func__, idx, - mw, mw_bind); - wqe = (union t3_wr *) (qhp->wq.queue + idx); - - t3_wr_flags = 0; - if (mw_bind->send_flags & IB_SEND_SIGNALED) - t3_wr_flags = T3_COMPLETION_FLAG; - - sgl.addr = mw_bind->bind_info.addr; - sgl.lkey = mw_bind->bind_info.mr->lkey; - sgl.length = mw_bind->bind_info.length; - wqe->bind.reserved = 0; - wqe->bind.type = TPT_VATO; - - /* TBD: check perms */ - wqe->bind.perms = iwch_ib_to_tpt_bind_access( - mw_bind->bind_info.mw_access_flags); - wqe->bind.mr_stag = cpu_to_be32(mw_bind->bind_info.mr->lkey); - wqe->bind.mw_stag = cpu_to_be32(mw->rkey); - wqe->bind.mw_len = cpu_to_be32(mw_bind->bind_info.length); - wqe->bind.mw_va = cpu_to_be64(mw_bind->bind_info.addr); - err = iwch_sgl2pbl_map(rhp, &sgl, 1, &pbl_addr, &page_size); - if (err) { - spin_unlock_irqrestore(&qhp->lock, flag); - return err; - } - wqe->send.wrid.id0.hi = qhp->wq.sq_wptr; - sqp = qhp->wq.sq + Q_PTR2IDX(qhp->wq.sq_wptr, qhp->wq.sq_size_log2); - sqp->wr_id = mw_bind->wr_id; - sqp->opcode = T3_BIND_MW; - sqp->sq_wptr = qhp->wq.sq_wptr; - sqp->complete = 0; - sqp->signaled = (mw_bind->send_flags & IB_SEND_SIGNALED); - wqe->bind.mr_pbl_addr = cpu_to_be32(pbl_addr); - wqe->bind.mr_pagesz = page_size; - build_fw_riwrh((void *)wqe, T3_WR_BIND, t3_wr_flags, - Q_GENBIT(qhp->wq.wptr, qhp->wq.size_log2), 0, - sizeof(struct t3_bind_mw_wr) >> 3, T3_SOPEOP); - ++(qhp->wq.wptr); - ++(qhp->wq.sq_wptr); - spin_unlock_irqrestore(&qhp->lock, flag); - - if (cxio_wq_db_enabled(&qhp->wq)) - ring_doorbell(qhp->wq.doorbell, qhp->wq.qpid); - - return err; -} - static inline void build_term_codes(struct respQ_msg_t *rsp_msg, u8 *layer_type, u8 *ecode) { diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c index 326d07d823a5..cd2ff5f9518a 100644 --- a/drivers/infiniband/hw/cxgb4/cm.c +++ b/drivers/infiniband/hw/cxgb4/cm.c @@ -3271,6 +3271,12 @@ static int create_server6(struct c4iw_dev *dev, struct c4iw_listen_ep *ep) struct sockaddr_in6 *sin6 = (struct sockaddr_in6 *) &ep->com.mapped_local_addr; + if (ipv6_addr_type(&sin6->sin6_addr) != IPV6_ADDR_ANY) { + err = cxgb4_clip_get(ep->com.dev->rdev.lldi.ports[0], + (const u32 *)&sin6->sin6_addr.s6_addr, 1); + if (err) + return err; + } c4iw_init_wr_wait(&ep->com.wr_wait); err = cxgb4_create_server6(ep->com.dev->rdev.lldi.ports[0], ep->stid, &sin6->sin6_addr, @@ -3282,13 +3288,13 @@ static int create_server6(struct c4iw_dev *dev, struct c4iw_listen_ep *ep) 0, 0, __func__); else if (err > 0) err = net_xmit_errno(err); - if (err) + if (err) { + cxgb4_clip_release(ep->com.dev->rdev.lldi.ports[0], + (const u32 *)&sin6->sin6_addr.s6_addr, 1); pr_err("cxgb4_create_server6/filter failed err %d stid %d laddr %pI6 lport %d\n", err, ep->stid, sin6->sin6_addr.s6_addr, ntohs(sin6->sin6_port)); - else - cxgb4_clip_get(ep->com.dev->rdev.lldi.ports[0], - (const u32 *)&sin6->sin6_addr.s6_addr, 1); + } return err; } diff --git a/drivers/infiniband/hw/cxgb4/cq.c b/drivers/infiniband/hw/cxgb4/cq.c index de9cd6901752..cf21df4a8bf5 100644 --- a/drivers/infiniband/hw/cxgb4/cq.c +++ b/drivers/infiniband/hw/cxgb4/cq.c @@ -744,9 +744,6 @@ static int c4iw_poll_cq_one(struct c4iw_cq *chp, struct ib_wc *wc) case FW_RI_SEND_WITH_SE: wc->opcode = IB_WC_SEND; break; - case FW_RI_BIND_MW: - wc->opcode = IB_WC_BIND_MW; - break; case FW_RI_LOCAL_INV: wc->opcode = IB_WC_LOCAL_INV; diff --git a/drivers/infiniband/hw/cxgb4/device.c b/drivers/infiniband/hw/cxgb4/device.c index 58fce1742b8d..8024ea4417b8 100644 --- a/drivers/infiniband/hw/cxgb4/device.c +++ 
b/drivers/infiniband/hw/cxgb4/device.c @@ -315,14 +315,12 @@ static int qp_release(struct inode *inode, struct file *file) static int qp_open(struct inode *inode, struct file *file) { struct c4iw_debugfs_data *qpd; - int ret = 0; int count = 1; qpd = kmalloc(sizeof *qpd, GFP_KERNEL); - if (!qpd) { - ret = -ENOMEM; - goto out; - } + if (!qpd) + return -ENOMEM; + qpd->devp = inode->i_private; qpd->pos = 0; @@ -333,8 +331,8 @@ static int qp_open(struct inode *inode, struct file *file) qpd->bufsize = count * 128; qpd->buf = vmalloc(qpd->bufsize); if (!qpd->buf) { - ret = -ENOMEM; - goto err1; + kfree(qpd); + return -ENOMEM; } spin_lock_irq(&qpd->devp->lock); @@ -343,11 +341,7 @@ static int qp_open(struct inode *inode, struct file *file) qpd->buf[qpd->pos++] = 0; file->private_data = qpd; - goto out; -err1: - kfree(qpd); -out: - return ret; + return 0; } static const struct file_operations qp_debugfs_fops = { @@ -781,8 +775,7 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev) pr_err(MOD "%s: unsupported udb/ucq densities %u/%u\n", pci_name(rdev->lldi.pdev), rdev->lldi.udb_density, rdev->lldi.ucq_density); - err = -EINVAL; - goto err1; + return -EINVAL; } if (rdev->lldi.vr->qp.start != rdev->lldi.vr->cq.start || rdev->lldi.vr->qp.size != rdev->lldi.vr->cq.size) { @@ -791,8 +784,7 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev) pci_name(rdev->lldi.pdev), rdev->lldi.vr->qp.start, rdev->lldi.vr->qp.size, rdev->lldi.vr->cq.size, rdev->lldi.vr->cq.size); - err = -EINVAL; - goto err1; + return -EINVAL; } rdev->qpmask = rdev->lldi.udb_density - 1; @@ -816,10 +808,8 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev) rdev->lldi.db_reg, rdev->lldi.gts_reg, rdev->qpmask, rdev->cqmask); - if (c4iw_num_stags(rdev) == 0) { - err = -EINVAL; - goto err1; - } + if (c4iw_num_stags(rdev) == 0) + return -EINVAL; rdev->stats.pd.total = T4_MAX_NUM_PD; rdev->stats.stag.total = rdev->lldi.vr->stag.size; @@ -831,29 +821,31 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev) err = c4iw_init_resource(rdev, c4iw_num_stags(rdev), T4_MAX_NUM_PD); if (err) { printk(KERN_ERR MOD "error %d initializing resources\n", err); - goto err1; + return err; } err = c4iw_pblpool_create(rdev); if (err) { printk(KERN_ERR MOD "error %d initializing pbl pool\n", err); - goto err2; + goto destroy_resource; } err = c4iw_rqtpool_create(rdev); if (err) { printk(KERN_ERR MOD "error %d initializing rqt pool\n", err); - goto err3; + goto destroy_pblpool; } err = c4iw_ocqp_pool_create(rdev); if (err) { printk(KERN_ERR MOD "error %d initializing ocqp pool\n", err); - goto err4; + goto destroy_rqtpool; } rdev->status_page = (struct t4_dev_status_page *) __get_free_page(GFP_KERNEL); - if (!rdev->status_page) { - pr_err(MOD "error allocating status page\n"); - goto err4; - } + if (!rdev->status_page) + goto destroy_ocqp_pool; + rdev->status_page->qp_start = rdev->lldi.vr->qp.start; + rdev->status_page->qp_size = rdev->lldi.vr->qp.size; + rdev->status_page->cq_start = rdev->lldi.vr->cq.start; + rdev->status_page->cq_size = rdev->lldi.vr->cq.size; if (c4iw_wr_log) { rdev->wr_log = kzalloc((1 << c4iw_wr_log_size_order) * @@ -869,13 +861,14 @@ static int c4iw_rdev_open(struct c4iw_rdev *rdev) rdev->status_page->db_off = 0; return 0; -err4: +destroy_ocqp_pool: + c4iw_ocqp_pool_destroy(rdev); +destroy_rqtpool: c4iw_rqtpool_destroy(rdev); -err3: +destroy_pblpool: c4iw_pblpool_destroy(rdev); -err2: +destroy_resource: c4iw_destroy_resource(&rdev->resource); -err1: return err; } diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h 
b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h index 00e55faa086a..fb2de75a0392 100644 --- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h +++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h @@ -947,8 +947,6 @@ int c4iw_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, struct ib_send_wr **bad_wr); int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, struct ib_recv_wr **bad_wr); -int c4iw_bind_mw(struct ib_qp *qp, struct ib_mw *mw, - struct ib_mw_bind *mw_bind); int c4iw_connect(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param); int c4iw_create_listen(struct iw_cm_id *cm_id, int backlog); int c4iw_destroy_listen(struct iw_cm_id *cm_id); @@ -968,17 +966,6 @@ struct ib_mr *c4iw_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt, int acc, struct ib_udata *udata); struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc); -struct ib_mr *c4iw_register_phys_mem(struct ib_pd *pd, - struct ib_phys_buf *buffer_list, - int num_phys_buf, - int acc, - u64 *iova_start); -int c4iw_reregister_phys_mem(struct ib_mr *mr, - int mr_rereg_mask, - struct ib_pd *pd, - struct ib_phys_buf *buffer_list, - int num_phys_buf, - int acc, u64 *iova_start); int c4iw_dereg_mr(struct ib_mr *ib_mr); int c4iw_destroy_cq(struct ib_cq *ib_cq); struct ib_cq *c4iw_create_cq(struct ib_device *ibdev, diff --git a/drivers/infiniband/hw/cxgb4/mem.c b/drivers/infiniband/hw/cxgb4/mem.c index e1629ab58db7..7849890c4781 100644 --- a/drivers/infiniband/hw/cxgb4/mem.c +++ b/drivers/infiniband/hw/cxgb4/mem.c @@ -392,32 +392,6 @@ static int register_mem(struct c4iw_dev *rhp, struct c4iw_pd *php, return ret; } -static int reregister_mem(struct c4iw_dev *rhp, struct c4iw_pd *php, - struct c4iw_mr *mhp, int shift, int npages) -{ - u32 stag; - int ret; - - if (npages > mhp->attr.pbl_size) - return -ENOMEM; - - stag = mhp->attr.stag; - ret = write_tpt_entry(&rhp->rdev, 0, &stag, 1, mhp->attr.pdid, - FW_RI_STAG_NSMR, mhp->attr.perms, - mhp->attr.mw_bind_enable, mhp->attr.zbva, - mhp->attr.va_fbo, mhp->attr.len, shift - 12, - mhp->attr.pbl_size, mhp->attr.pbl_addr); - if (ret) - return ret; - - ret = finish_mem_reg(mhp, stag); - if (ret) - dereg_mem(&rhp->rdev, mhp->attr.stag, mhp->attr.pbl_size, - mhp->attr.pbl_addr); - - return ret; -} - static int alloc_pbl(struct c4iw_mr *mhp, int npages) { mhp->attr.pbl_addr = c4iw_pblpool_alloc(&mhp->rhp->rdev, @@ -431,228 +405,6 @@ static int alloc_pbl(struct c4iw_mr *mhp, int npages) return 0; } -static int build_phys_page_list(struct ib_phys_buf *buffer_list, - int num_phys_buf, u64 *iova_start, - u64 *total_size, int *npages, - int *shift, __be64 **page_list) -{ - u64 mask; - int i, j, n; - - mask = 0; - *total_size = 0; - for (i = 0; i < num_phys_buf; ++i) { - if (i != 0 && buffer_list[i].addr & ~PAGE_MASK) - return -EINVAL; - if (i != 0 && i != num_phys_buf - 1 && - (buffer_list[i].size & ~PAGE_MASK)) - return -EINVAL; - *total_size += buffer_list[i].size; - if (i > 0) - mask |= buffer_list[i].addr; - else - mask |= buffer_list[i].addr & PAGE_MASK; - if (i != num_phys_buf - 1) - mask |= buffer_list[i].addr + buffer_list[i].size; - else - mask |= (buffer_list[i].addr + buffer_list[i].size + - PAGE_SIZE - 1) & PAGE_MASK; - } - - if (*total_size > 0xFFFFFFFFULL) - return -ENOMEM; - - /* Find largest page shift we can use to cover buffers */ - for (*shift = PAGE_SHIFT; *shift < 27; ++(*shift)) - if ((1ULL << *shift) & mask) - break; - - buffer_list[0].size += buffer_list[0].addr & ((1ULL << *shift) - 1); - buffer_list[0].addr &= ~0ull << *shift; - - *npages = 0; - for (i = 0; i < 
num_phys_buf; ++i) - *npages += (buffer_list[i].size + - (1ULL << *shift) - 1) >> *shift; - - if (!*npages) - return -EINVAL; - - *page_list = kmalloc(sizeof(u64) * *npages, GFP_KERNEL); - if (!*page_list) - return -ENOMEM; - - n = 0; - for (i = 0; i < num_phys_buf; ++i) - for (j = 0; - j < (buffer_list[i].size + (1ULL << *shift) - 1) >> *shift; - ++j) - (*page_list)[n++] = cpu_to_be64(buffer_list[i].addr + - ((u64) j << *shift)); - - PDBG("%s va 0x%llx mask 0x%llx shift %d len %lld pbl_size %d\n", - __func__, (unsigned long long)*iova_start, - (unsigned long long)mask, *shift, (unsigned long long)*total_size, - *npages); - - return 0; - -} - -int c4iw_reregister_phys_mem(struct ib_mr *mr, int mr_rereg_mask, - struct ib_pd *pd, struct ib_phys_buf *buffer_list, - int num_phys_buf, int acc, u64 *iova_start) -{ - - struct c4iw_mr mh, *mhp; - struct c4iw_pd *php; - struct c4iw_dev *rhp; - __be64 *page_list = NULL; - int shift = 0; - u64 total_size; - int npages; - int ret; - - PDBG("%s ib_mr %p ib_pd %p\n", __func__, mr, pd); - - /* There can be no memory windows */ - if (atomic_read(&mr->usecnt)) - return -EINVAL; - - mhp = to_c4iw_mr(mr); - rhp = mhp->rhp; - php = to_c4iw_pd(mr->pd); - - /* make sure we are on the same adapter */ - if (rhp != php->rhp) - return -EINVAL; - - memcpy(&mh, mhp, sizeof *mhp); - - if (mr_rereg_mask & IB_MR_REREG_PD) - php = to_c4iw_pd(pd); - if (mr_rereg_mask & IB_MR_REREG_ACCESS) { - mh.attr.perms = c4iw_ib_to_tpt_access(acc); - mh.attr.mw_bind_enable = (acc & IB_ACCESS_MW_BIND) == - IB_ACCESS_MW_BIND; - } - if (mr_rereg_mask & IB_MR_REREG_TRANS) { - ret = build_phys_page_list(buffer_list, num_phys_buf, - iova_start, - &total_size, &npages, - &shift, &page_list); - if (ret) - return ret; - } - - if (mr_exceeds_hw_limits(rhp, total_size)) { - kfree(page_list); - return -EINVAL; - } - - ret = reregister_mem(rhp, php, &mh, shift, npages); - kfree(page_list); - if (ret) - return ret; - if (mr_rereg_mask & IB_MR_REREG_PD) - mhp->attr.pdid = php->pdid; - if (mr_rereg_mask & IB_MR_REREG_ACCESS) - mhp->attr.perms = c4iw_ib_to_tpt_access(acc); - if (mr_rereg_mask & IB_MR_REREG_TRANS) { - mhp->attr.zbva = 0; - mhp->attr.va_fbo = *iova_start; - mhp->attr.page_size = shift - 12; - mhp->attr.len = (u32) total_size; - mhp->attr.pbl_size = npages; - } - - return 0; -} - -struct ib_mr *c4iw_register_phys_mem(struct ib_pd *pd, - struct ib_phys_buf *buffer_list, - int num_phys_buf, int acc, u64 *iova_start) -{ - __be64 *page_list; - int shift; - u64 total_size; - int npages; - struct c4iw_dev *rhp; - struct c4iw_pd *php; - struct c4iw_mr *mhp; - int ret; - - PDBG("%s ib_pd %p\n", __func__, pd); - php = to_c4iw_pd(pd); - rhp = php->rhp; - - mhp = kzalloc(sizeof(*mhp), GFP_KERNEL); - if (!mhp) - return ERR_PTR(-ENOMEM); - - mhp->rhp = rhp; - - /* First check that we have enough alignment */ - if ((*iova_start & ~PAGE_MASK) != (buffer_list[0].addr & ~PAGE_MASK)) { - ret = -EINVAL; - goto err; - } - - if (num_phys_buf > 1 && - ((buffer_list[0].addr + buffer_list[0].size) & ~PAGE_MASK)) { - ret = -EINVAL; - goto err; - } - - ret = build_phys_page_list(buffer_list, num_phys_buf, iova_start, - &total_size, &npages, &shift, - &page_list); - if (ret) - goto err; - - if (mr_exceeds_hw_limits(rhp, total_size)) { - kfree(page_list); - ret = -EINVAL; - goto err; - } - - ret = alloc_pbl(mhp, npages); - if (ret) { - kfree(page_list); - goto err; - } - - ret = write_pbl(&mhp->rhp->rdev, page_list, mhp->attr.pbl_addr, - npages); - kfree(page_list); - if (ret) - goto err_pbl; - - mhp->attr.pdid = 
php->pdid; - mhp->attr.zbva = 0; - - mhp->attr.perms = c4iw_ib_to_tpt_access(acc); - mhp->attr.va_fbo = *iova_start; - mhp->attr.page_size = shift - 12; - - mhp->attr.len = (u32) total_size; - mhp->attr.pbl_size = npages; - ret = register_mem(rhp, php, mhp, shift); - if (ret) - goto err_pbl; - - return &mhp->ibmr; - -err_pbl: - c4iw_pblpool_free(&mhp->rhp->rdev, mhp->attr.pbl_addr, - mhp->attr.pbl_size << 3); - -err: - kfree(mhp); - return ERR_PTR(ret); - -} - struct ib_mr *c4iw_get_dma_mr(struct ib_pd *pd, int acc) { struct c4iw_dev *rhp; @@ -952,9 +704,6 @@ int c4iw_dereg_mr(struct ib_mr *ib_mr) u32 mmid; PDBG("%s ib_mr %p\n", __func__, ib_mr); - /* There can be no memory windows */ - if (atomic_read(&ib_mr->usecnt)) - return -EINVAL; mhp = to_c4iw_mr(ib_mr); rhp = mhp->rhp; diff --git a/drivers/infiniband/hw/cxgb4/provider.c b/drivers/infiniband/hw/cxgb4/provider.c index 0a7d99818b17..ec04272fbdc2 100644 --- a/drivers/infiniband/hw/cxgb4/provider.c +++ b/drivers/infiniband/hw/cxgb4/provider.c @@ -549,12 +549,9 @@ int c4iw_register_device(struct c4iw_dev *dev) dev->ibdev.resize_cq = c4iw_resize_cq; dev->ibdev.poll_cq = c4iw_poll_cq; dev->ibdev.get_dma_mr = c4iw_get_dma_mr; - dev->ibdev.reg_phys_mr = c4iw_register_phys_mem; - dev->ibdev.rereg_phys_mr = c4iw_reregister_phys_mem; dev->ibdev.reg_user_mr = c4iw_reg_user_mr; dev->ibdev.dereg_mr = c4iw_dereg_mr; dev->ibdev.alloc_mw = c4iw_alloc_mw; - dev->ibdev.bind_mw = c4iw_bind_mw; dev->ibdev.dealloc_mw = c4iw_dealloc_mw; dev->ibdev.alloc_mr = c4iw_alloc_mr; dev->ibdev.map_mr_sg = c4iw_map_mr_sg; diff --git a/drivers/infiniband/hw/cxgb4/qp.c b/drivers/infiniband/hw/cxgb4/qp.c index aa515afee724..e99345eb875a 100644 --- a/drivers/infiniband/hw/cxgb4/qp.c +++ b/drivers/infiniband/hw/cxgb4/qp.c @@ -933,11 +933,6 @@ int c4iw_post_receive(struct ib_qp *ibqp, struct ib_recv_wr *wr, return err; } -int c4iw_bind_mw(struct ib_qp *qp, struct ib_mw *mw, struct ib_mw_bind *mw_bind) -{ - return -ENOSYS; -} - static inline void build_term_codes(struct t4_cqe *err_cqe, u8 *layer_type, u8 *ecode) { diff --git a/drivers/infiniband/hw/cxgb4/t4.h b/drivers/infiniband/hw/cxgb4/t4.h index 1092a2d1f607..6126bbe36095 100644 --- a/drivers/infiniband/hw/cxgb4/t4.h +++ b/drivers/infiniband/hw/cxgb4/t4.h @@ -699,4 +699,11 @@ static inline void t4_set_cq_in_error(struct t4_cq *cq) struct t4_dev_status_page { u8 db_off; + u8 pad1; + u16 pad2; + u32 pad3; + u64 qp_start; + u64 qp_size; + u64 cq_start; + u64 cq_size; }; diff --git a/drivers/infiniband/hw/cxgb4/user.h b/drivers/infiniband/hw/cxgb4/user.h index cbd0ce170728..295f422b9a3a 100644 --- a/drivers/infiniband/hw/cxgb4/user.h +++ b/drivers/infiniband/hw/cxgb4/user.h @@ -32,7 +32,7 @@ #ifndef __C4IW_USER_H__ #define __C4IW_USER_H__ -#define C4IW_UVERBS_ABI_VERSION 2 +#define C4IW_UVERBS_ABI_VERSION 3 /* * Make sure that all structs defined in this file remain laid out so diff --git a/drivers/infiniband/hw/mlx4/ah.c b/drivers/infiniband/hw/mlx4/ah.c index 86af71351d9a..105246fba2e7 100644 --- a/drivers/infiniband/hw/mlx4/ah.c +++ b/drivers/infiniband/hw/mlx4/ah.c @@ -92,7 +92,7 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr ah_attr->grh.sgid_index, &sgid, &gid_attr); if (ret) return ERR_PTR(ret); - memset(ah->av.eth.s_mac, 0, ETH_ALEN); + eth_zero_addr(ah->av.eth.s_mac); if (gid_attr.ndev) { if (is_vlan_dev(gid_attr.ndev)) vlan_tag = vlan_dev_vlan_id(gid_attr.ndev); @@ -104,6 +104,7 @@ static struct ib_ah *create_iboe_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr 
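The t4_dev_status_page hunk above adds explicit pad1/pad2/pad3 fields so qp_start begins at an 8-byte boundary and the page reads identically from 32- and 64-bit userspace; that layout change is also why C4IW_UVERBS_ABI_VERSION moves from 2 to 3. A host-side model of the layout check (field order is taken from the hunk; the offset is what the padding is assumed to guarantee):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

struct status_page_model {        /* mirrors struct t4_dev_status_page */
	uint8_t  db_off;
	uint8_t  pad1;
	uint16_t pad2;
	uint32_t pad3;
	uint64_t qp_start;
	uint64_t qp_size;
	uint64_t cq_start;
	uint64_t cq_size;
};

int main(void)
{
	static_assert(offsetof(struct status_page_model, qp_start) == 8,
		      "padding no longer aligns qp_start");
	printf("qp_start at offset %zu\n",
	       offsetof(struct status_page_model, qp_start));
	return 0;
}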
ah->av.eth.port_pd = cpu_to_be32(to_mpd(pd)->pdn | (ah_attr->port_num << 24)); ah->av.eth.gid_index = mlx4_ib_gid_index_to_real_index(ibdev, ah_attr->port_num, ah_attr->grh.sgid_index); ah->av.eth.vlan = cpu_to_be16(vlan_tag); + ah->av.eth.hop_limit = ah_attr->grh.hop_limit; if (ah_attr->static_rate) { ah->av.eth.stat_rate = ah_attr->static_rate + MLX4_STAT_RATE_OFFSET; while (ah->av.eth.stat_rate > IB_RATE_2_5_GBPS + MLX4_STAT_RATE_OFFSET && diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index b88fc8f5ab18..9f8b516eb2b0 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -811,9 +811,6 @@ repoll: wc->opcode = IB_WC_MASKED_FETCH_ADD; wc->byte_len = 8; break; - case MLX4_OPCODE_BIND_MW: - wc->opcode = IB_WC_BIND_MW; - break; case MLX4_OPCODE_LSO: wc->opcode = IB_WC_LSO; break; diff --git a/drivers/infiniband/hw/mlx4/main.c b/drivers/infiniband/hw/mlx4/main.c index 97d6878f9938..1c7ab6cabbb8 100644 --- a/drivers/infiniband/hw/mlx4/main.c +++ b/drivers/infiniband/hw/mlx4/main.c @@ -154,9 +154,9 @@ static struct net_device *mlx4_ib_get_netdev(struct ib_device *device, u8 port_n return dev; } -static int mlx4_ib_update_gids(struct gid_entry *gids, - struct mlx4_ib_dev *ibdev, - u8 port_num) +static int mlx4_ib_update_gids_v1(struct gid_entry *gids, + struct mlx4_ib_dev *ibdev, + u8 port_num) { struct mlx4_cmd_mailbox *mailbox; int err; @@ -187,6 +187,63 @@ static int mlx4_ib_update_gids(struct gid_entry *gids, return err; } +static int mlx4_ib_update_gids_v1_v2(struct gid_entry *gids, + struct mlx4_ib_dev *ibdev, + u8 port_num) +{ + struct mlx4_cmd_mailbox *mailbox; + int err; + struct mlx4_dev *dev = ibdev->dev; + int i; + struct { + union ib_gid gid; + __be32 rsrvd1[2]; + __be16 rsrvd2; + u8 type; + u8 version; + __be32 rsrvd3; + } *gid_tbl; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if (IS_ERR(mailbox)) + return -ENOMEM; + + gid_tbl = mailbox->buf; + for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) { + memcpy(&gid_tbl[i].gid, &gids[i].gid, sizeof(union ib_gid)); + if (gids[i].gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) { + gid_tbl[i].version = 2; + if (!ipv6_addr_v4mapped((struct in6_addr *)&gids[i].gid)) + gid_tbl[i].type = 1; + else + memset(&gid_tbl[i].gid, 0, 12); + } + } + + err = mlx4_cmd(dev, mailbox->dma, + MLX4_SET_PORT_ROCE_ADDR << 8 | port_num, + 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + if (mlx4_is_bonded(dev)) + err += mlx4_cmd(dev, mailbox->dma, + MLX4_SET_PORT_ROCE_ADDR << 8 | 2, + 1, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, + MLX4_CMD_WRAPPED); + + mlx4_free_cmd_mailbox(dev, mailbox); + return err; +} + +static int mlx4_ib_update_gids(struct gid_entry *gids, + struct mlx4_ib_dev *ibdev, + u8 port_num) +{ + if (ibdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) + return mlx4_ib_update_gids_v1_v2(gids, ibdev, port_num); + + return mlx4_ib_update_gids_v1(gids, ibdev, port_num); +} + static int mlx4_ib_add_gid(struct ib_device *device, u8 port_num, unsigned int index, @@ -215,7 +272,8 @@ static int mlx4_ib_add_gid(struct ib_device *device, port_gid_table = &iboe->gids[port_num - 1]; spin_lock_bh(&iboe->lock); for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) { - if (!memcmp(&port_gid_table->gids[i].gid, gid, sizeof(*gid))) { + if (!memcmp(&port_gid_table->gids[i].gid, gid, sizeof(*gid)) && + (port_gid_table->gids[i].gid_type == attr->gid_type)) { found = i; break; } @@ -233,6 +291,7 @@ static int mlx4_ib_add_gid(struct ib_device *device, } else { *context = port_gid_table->gids[free].ctx; 
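mlx4_ib_update_gids_v1_v2() above pushes a per-entry version/type pair to the firmware GID table: version 2 marks a RoCE v2 GID, type 1 marks a native IPv6 one, and v4-mapped GIDs get their upper 12 bytes cleared so only the IPv4 address remains. A compact host-side model of that per-entry decision (field names follow the anonymous struct in the hunk; this is a sketch, not the device ABI):

#include <stdio.h>
#include <string.h>

struct gid_tbl_model {
	unsigned char gid[16];
	unsigned char type;      /* 1: RoCE v2 over native IPv6 */
	unsigned char version;   /* 2: RoCE v2 */
};

static void fill_entry(struct gid_tbl_model *e, const unsigned char *gid,
		       int is_roce_v2)
{
	memcpy(e->gid, gid, 16);
	if (!is_roce_v2)
		return;                            /* RoCE v1: defaults */
	e->version = 2;
	if (memcmp(gid, "\0\0\0\0\0\0\0\0\0\0\xff\xff", 12))
		e->type = 1;                       /* native IPv6 GID */
	else
		memset(e->gid, 0, 12);             /* keep IPv4 bytes only */
}

int main(void)
{
	unsigned char v4gid[16] = {0, 0, 0, 0, 0, 0, 0, 0,
				   0, 0, 0xff, 0xff, 192, 0, 2, 1};
	struct gid_tbl_model e;

	memset(&e, 0, sizeof(e));
	fill_entry(&e, v4gid, 1);
	printf("version=%u type=%u\n", e.version, e.type); /* version=2 type=0 */
	return 0;
}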
memcpy(&port_gid_table->gids[free].gid, gid, sizeof(*gid)); + port_gid_table->gids[free].gid_type = attr->gid_type; port_gid_table->gids[free].ctx->real_index = free; port_gid_table->gids[free].ctx->refcount = 1; hw_update = 1; @@ -248,8 +307,10 @@ static int mlx4_ib_add_gid(struct ib_device *device, if (!gids) { ret = -ENOMEM; } else { - for (i = 0; i < MLX4_MAX_PORT_GIDS; i++) + for (i = 0; i < MLX4_MAX_PORT_GIDS; i++) { memcpy(&gids[i].gid, &port_gid_table->gids[i].gid, sizeof(union ib_gid)); + gids[i].gid_type = port_gid_table->gids[i].gid_type; + } } } spin_unlock_bh(&iboe->lock); @@ -325,6 +386,7 @@ int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev, int i; int ret; unsigned long flags; + struct ib_gid_attr attr; if (port_num > MLX4_MAX_PORTS) return -EINVAL; @@ -335,10 +397,13 @@ int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev, if (!rdma_cap_roce_gid_table(&ibdev->ib_dev, port_num)) return index; - ret = ib_get_cached_gid(&ibdev->ib_dev, port_num, index, &gid, NULL); + ret = ib_get_cached_gid(&ibdev->ib_dev, port_num, index, &gid, &attr); if (ret) return ret; + if (attr.ndev) + dev_put(attr.ndev); + if (!memcmp(&gid, &zgid, sizeof(gid))) return -EINVAL; @@ -346,7 +411,8 @@ int mlx4_ib_gid_index_to_real_index(struct mlx4_ib_dev *ibdev, port_gid_table = &iboe->gids[port_num - 1]; for (i = 0; i < MLX4_MAX_PORT_GIDS; ++i) - if (!memcmp(&port_gid_table->gids[i].gid, &gid, sizeof(gid))) { + if (!memcmp(&port_gid_table->gids[i].gid, &gid, sizeof(gid)) && + attr.gid_type == port_gid_table->gids[i].gid_type) { ctx = port_gid_table->gids[i].ctx; break; } @@ -2119,6 +2185,7 @@ static int mlx4_port_immutable(struct ib_device *ibdev, u8 port_num, struct ib_port_immutable *immutable) { struct ib_port_attr attr; + struct mlx4_ib_dev *mdev = to_mdev(ibdev); int err; err = mlx4_ib_query_port(ibdev, port_num, &attr); @@ -2128,10 +2195,15 @@ static int mlx4_port_immutable(struct ib_device *ibdev, u8 port_num, immutable->pkey_tbl_len = attr.pkey_tbl_len; immutable->gid_tbl_len = attr.gid_tbl_len; - if (mlx4_ib_port_link_layer(ibdev, port_num) == IB_LINK_LAYER_INFINIBAND) + if (mlx4_ib_port_link_layer(ibdev, port_num) == IB_LINK_LAYER_INFINIBAND) { immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB; - else - immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE; + } else { + if (mdev->dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) + immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE; + if (mdev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) + immutable->core_cap_flags = RDMA_CORE_PORT_IBA_ROCE | + RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + } immutable->max_mad_size = IB_MGMT_MAD_SIZE; @@ -2283,7 +2355,6 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) if (dev->caps.flags & MLX4_DEV_CAP_FLAG_MEM_WINDOW || dev->caps.bmme_flags & MLX4_BMME_FLAG_TYPE_2_WIN) { ibdev->ib_dev.alloc_mw = mlx4_ib_alloc_mw; - ibdev->ib_dev.bind_mw = mlx4_ib_bind_mw; ibdev->ib_dev.dealloc_mw = mlx4_ib_dealloc_mw; ibdev->ib_dev.uverbs_cmd_mask |= @@ -2423,7 +2494,8 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) if (mlx4_ib_init_sriov(ibdev)) goto err_mad; - if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) { + if (dev->caps.flags & MLX4_DEV_CAP_FLAG_IBOE || + dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) { if (!iboe->nb.notifier_call) { iboe->nb.notifier_call = mlx4_ib_netdev_event; err = register_netdevice_notifier(&iboe->nb); @@ -2432,6 +2504,12 @@ static void *mlx4_ib_add(struct mlx4_dev *dev) goto err_notif; } } + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) { + err = mlx4_config_roce_v2_port(dev, 
ROCE_V2_UDP_DPORT); + if (err) { + goto err_notif; + } + } } for (j = 0; j < ARRAY_SIZE(mlx4_class_attributes); ++j) { diff --git a/drivers/infiniband/hw/mlx4/mlx4_ib.h b/drivers/infiniband/hw/mlx4/mlx4_ib.h index 1caa11edac03..52ce7b000044 100644 --- a/drivers/infiniband/hw/mlx4/mlx4_ib.h +++ b/drivers/infiniband/hw/mlx4/mlx4_ib.h @@ -177,11 +177,18 @@ struct mlx4_ib_wq { unsigned tail; }; +enum { + MLX4_IB_QP_CREATE_ROCE_V2_GSI = IB_QP_CREATE_RESERVED_START +}; + enum mlx4_ib_qp_flags { MLX4_IB_QP_LSO = IB_QP_CREATE_IPOIB_UD_LSO, MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK = IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK, MLX4_IB_QP_NETIF = IB_QP_CREATE_NETIF_QP, MLX4_IB_QP_CREATE_USE_GFP_NOIO = IB_QP_CREATE_USE_GFP_NOIO, + + /* Mellanox specific flags start from IB_QP_CREATE_RESERVED_START */ + MLX4_IB_ROCE_V2_GSI_QP = MLX4_IB_QP_CREATE_ROCE_V2_GSI, MLX4_IB_SRIOV_TUNNEL_QP = 1 << 30, MLX4_IB_SRIOV_SQP = 1 << 31, }; @@ -478,6 +485,7 @@ struct gid_cache_context { struct gid_entry { union ib_gid gid; + enum ib_gid_type gid_type; struct gid_cache_context *ctx; }; @@ -704,8 +712,6 @@ struct ib_mr *mlx4_ib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, struct ib_udata *udata); int mlx4_ib_dereg_mr(struct ib_mr *mr); struct ib_mw *mlx4_ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type); -int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw, - struct ib_mw_bind *mw_bind); int mlx4_ib_dealloc_mw(struct ib_mw *mw); struct ib_mr *mlx4_ib_alloc_mr(struct ib_pd *pd, enum ib_mr_type mr_type, diff --git a/drivers/infiniband/hw/mlx4/mr.c b/drivers/infiniband/hw/mlx4/mr.c index 4d1e1c632603..242b94ec105b 100644 --- a/drivers/infiniband/hw/mlx4/mr.c +++ b/drivers/infiniband/hw/mlx4/mr.c @@ -366,28 +366,6 @@ err_free: return ERR_PTR(err); } -int mlx4_ib_bind_mw(struct ib_qp *qp, struct ib_mw *mw, - struct ib_mw_bind *mw_bind) -{ - struct ib_bind_mw_wr wr; - struct ib_send_wr *bad_wr; - int ret; - - memset(&wr, 0, sizeof(wr)); - wr.wr.opcode = IB_WR_BIND_MW; - wr.wr.wr_id = mw_bind->wr_id; - wr.wr.send_flags = mw_bind->send_flags; - wr.mw = mw; - wr.bind_info = mw_bind->bind_info; - wr.rkey = ib_inc_rkey(mw->rkey); - - ret = mlx4_ib_post_send(qp, &wr.wr, &bad_wr); - if (!ret) - mw->rkey = wr.rkey; - - return ret; -} - int mlx4_ib_dealloc_mw(struct ib_mw *ibmw) { struct mlx4_ib_mw *mw = to_mmw(ibmw); diff --git a/drivers/infiniband/hw/mlx4/qp.c b/drivers/infiniband/hw/mlx4/qp.c index 13eaaf45288f..bc5536f00b6c 100644 --- a/drivers/infiniband/hw/mlx4/qp.c +++ b/drivers/infiniband/hw/mlx4/qp.c @@ -32,6 +32,8 @@ */ #include <linux/log2.h> +#include <linux/etherdevice.h> +#include <net/ip.h> #include <linux/slab.h> #include <linux/netdevice.h> #include <linux/vmalloc.h> @@ -85,6 +87,7 @@ struct mlx4_ib_sqp { u32 send_psn; struct ib_ud_header ud_header; u8 header_buf[MLX4_IB_UD_HEADER_SIZE]; + struct ib_qp *roce_v2_gsi; }; enum { @@ -115,7 +118,6 @@ static const __be32 mlx4_ib_opcode[] = { [IB_WR_REG_MR] = cpu_to_be32(MLX4_OPCODE_FMR), [IB_WR_MASKED_ATOMIC_CMP_AND_SWP] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_CS), [IB_WR_MASKED_ATOMIC_FETCH_AND_ADD] = cpu_to_be32(MLX4_OPCODE_MASKED_ATOMIC_FA), - [IB_WR_BIND_MW] = cpu_to_be32(MLX4_OPCODE_BIND_MW), }; static struct mlx4_ib_sqp *to_msqp(struct mlx4_ib_qp *mqp) @@ -154,7 +156,10 @@ static int is_sqp(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) } } } - return proxy_sqp; + if (proxy_sqp) + return 1; + + return !!(qp->flags & MLX4_IB_ROCE_V2_GSI_QP); } /* used for INIT/CLOSE port logic */ @@ -796,11 +801,13 @@ static int create_qp_common(struct mlx4_ib_dev *dev, struct ib_pd 
*pd, if (err) goto err_mtt; - qp->sq.wrid = kmalloc(qp->sq.wqe_cnt * sizeof(u64), gfp); + qp->sq.wrid = kmalloc_array(qp->sq.wqe_cnt, sizeof(u64), + gfp | __GFP_NOWARN); if (!qp->sq.wrid) qp->sq.wrid = __vmalloc(qp->sq.wqe_cnt * sizeof(u64), gfp, PAGE_KERNEL); - qp->rq.wrid = kmalloc(qp->rq.wqe_cnt * sizeof(u64), gfp); + qp->rq.wrid = kmalloc_array(qp->rq.wqe_cnt, sizeof(u64), + gfp | __GFP_NOWARN); if (!qp->rq.wrid) qp->rq.wrid = __vmalloc(qp->rq.wqe_cnt * sizeof(u64), gfp, PAGE_KERNEL); @@ -1099,9 +1106,9 @@ static u32 get_sqp_num(struct mlx4_ib_dev *dev, struct ib_qp_init_attr *attr) return dev->dev->caps.qp1_proxy[attr->port_num - 1]; } -struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, - struct ib_qp_init_attr *init_attr, - struct ib_udata *udata) +static struct ib_qp *_mlx4_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) { struct mlx4_ib_qp *qp = NULL; int err; @@ -1120,6 +1127,7 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, MLX4_IB_SRIOV_TUNNEL_QP | MLX4_IB_SRIOV_SQP | MLX4_IB_QP_NETIF | + MLX4_IB_QP_CREATE_ROCE_V2_GSI | MLX4_IB_QP_CREATE_USE_GFP_NOIO)) return ERR_PTR(-EINVAL); @@ -1128,15 +1136,21 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, return ERR_PTR(-EINVAL); } - if (init_attr->create_flags && - ((udata && init_attr->create_flags & ~(sup_u_create_flags)) || - ((init_attr->create_flags & ~(MLX4_IB_SRIOV_SQP | - MLX4_IB_QP_CREATE_USE_GFP_NOIO | - MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK)) && - init_attr->qp_type != IB_QPT_UD) || - ((init_attr->create_flags & MLX4_IB_SRIOV_SQP) && - init_attr->qp_type > IB_QPT_GSI))) - return ERR_PTR(-EINVAL); + if (init_attr->create_flags) { + if (udata && init_attr->create_flags & ~(sup_u_create_flags)) + return ERR_PTR(-EINVAL); + + if ((init_attr->create_flags & ~(MLX4_IB_SRIOV_SQP | + MLX4_IB_QP_CREATE_USE_GFP_NOIO | + MLX4_IB_QP_CREATE_ROCE_V2_GSI | + MLX4_IB_QP_BLOCK_MULTICAST_LOOPBACK) && + init_attr->qp_type != IB_QPT_UD) || + (init_attr->create_flags & MLX4_IB_SRIOV_SQP && + init_attr->qp_type > IB_QPT_GSI) || + (init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI && + init_attr->qp_type != IB_QPT_GSI)) + return ERR_PTR(-EINVAL); + } switch (init_attr->qp_type) { case IB_QPT_XRC_TGT: @@ -1173,19 +1187,29 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, case IB_QPT_SMI: case IB_QPT_GSI: { + int sqpn; + /* Userspace is not allowed to create special QPs: */ if (udata) return ERR_PTR(-EINVAL); + if (init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI) { + int res = mlx4_qp_reserve_range(to_mdev(pd->device)->dev, 1, 1, &sqpn, 0); + + if (res) + return ERR_PTR(res); + } else { + sqpn = get_sqp_num(to_mdev(pd->device), init_attr); + } err = create_qp_common(to_mdev(pd->device), pd, init_attr, udata, - get_sqp_num(to_mdev(pd->device), init_attr), + sqpn, &qp, gfp); if (err) return ERR_PTR(err); qp->port = init_attr->port_num; - qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : 1; - + qp->ibqp.qp_num = init_attr->qp_type == IB_QPT_SMI ? 0 : + init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI ? sqpn : 1; break; } default: @@ -1196,7 +1220,41 @@ struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, return &qp->ibqp; } -int mlx4_ib_destroy_qp(struct ib_qp *qp) +struct ib_qp *mlx4_ib_create_qp(struct ib_pd *pd, + struct ib_qp_init_attr *init_attr, + struct ib_udata *udata) { + struct ib_device *device = pd ? 
pd->device : init_attr->xrcd->device; + struct ib_qp *ibqp; + struct mlx4_ib_dev *dev = to_mdev(device); + + ibqp = _mlx4_ib_create_qp(pd, init_attr, udata); + + if (!IS_ERR(ibqp) && + (init_attr->qp_type == IB_QPT_GSI) && + !(init_attr->create_flags & MLX4_IB_QP_CREATE_ROCE_V2_GSI)) { + struct mlx4_ib_sqp *sqp = to_msqp((to_mqp(ibqp))); + int is_eth = rdma_cap_eth_ah(&dev->ib_dev, init_attr->port_num); + + if (is_eth && + dev->dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) { + init_attr->create_flags |= MLX4_IB_QP_CREATE_ROCE_V2_GSI; + sqp->roce_v2_gsi = ib_create_qp(pd, init_attr); + + if (IS_ERR(sqp->roce_v2_gsi)) { + pr_err("Failed to create GSI QP for RoCEv2 (%ld)\n", PTR_ERR(sqp->roce_v2_gsi)); + sqp->roce_v2_gsi = NULL; + } else { + sqp = to_msqp(to_mqp(sqp->roce_v2_gsi)); + sqp->qp.flags |= MLX4_IB_ROCE_V2_GSI_QP; + } + + init_attr->create_flags &= ~MLX4_IB_QP_CREATE_ROCE_V2_GSI; + } + } + return ibqp; +} + +static int _mlx4_ib_destroy_qp(struct ib_qp *qp) { struct mlx4_ib_dev *dev = to_mdev(qp->device); struct mlx4_ib_qp *mqp = to_mqp(qp); @@ -1225,6 +1283,20 @@ int mlx4_ib_destroy_qp(struct ib_qp *qp) return 0; } +int mlx4_ib_destroy_qp(struct ib_qp *qp) +{ + struct mlx4_ib_qp *mqp = to_mqp(qp); + + if (mqp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) { + struct mlx4_ib_sqp *sqp = to_msqp(mqp); + + if (sqp->roce_v2_gsi) + ib_destroy_qp(sqp->roce_v2_gsi); + } + + return _mlx4_ib_destroy_qp(qp); +} + static int to_mlx4_st(struct mlx4_ib_dev *dev, enum mlx4_ib_qp_type type) { switch (type) { @@ -1507,6 +1579,24 @@ static int create_qp_lb_counter(struct mlx4_ib_dev *dev, struct mlx4_ib_qp *qp) return 0; } +enum { + MLX4_QPC_ROCE_MODE_1 = 0, + MLX4_QPC_ROCE_MODE_2 = 2, + MLX4_QPC_ROCE_MODE_UNDEFINED = 0xff +}; + +static u8 gid_type_to_qpc(enum ib_gid_type gid_type) +{ + switch (gid_type) { + case IB_GID_TYPE_ROCE: + return MLX4_QPC_ROCE_MODE_1; + case IB_GID_TYPE_ROCE_UDP_ENCAP: + return MLX4_QPC_ROCE_MODE_2; + default: + return MLX4_QPC_ROCE_MODE_UNDEFINED; + } +} + static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, enum ib_qp_state new_state) @@ -1633,6 +1723,14 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, mlx4_ib_steer_qp_reg(dev, qp, 1); steer_qp = 1; } + + if (ibqp->qp_type == IB_QPT_GSI) { + enum ib_gid_type gid_type = qp->flags & MLX4_IB_ROCE_V2_GSI_QP ? 
+ IB_GID_TYPE_ROCE_UDP_ENCAP : IB_GID_TYPE_ROCE; + u8 qpc_roce_mode = gid_type_to_qpc(gid_type); + + context->rlkey_roce_mode |= (qpc_roce_mode << 6); + } } if (attr_mask & IB_QP_PKEY_INDEX) { @@ -1650,9 +1748,10 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, u16 vlan = 0xffff; u8 smac[ETH_ALEN]; int status = 0; + int is_eth = rdma_cap_eth_ah(&dev->ib_dev, port_num) && + attr->ah_attr.ah_flags & IB_AH_GRH; - if (rdma_cap_eth_ah(&dev->ib_dev, port_num) && - attr->ah_attr.ah_flags & IB_AH_GRH) { + if (is_eth) { int index = attr->ah_attr.grh.sgid_index; status = ib_get_cached_gid(ibqp->device, port_num, @@ -1674,6 +1773,18 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, optpar |= (MLX4_QP_OPTPAR_PRIMARY_ADDR_PATH | MLX4_QP_OPTPAR_SCHED_QUEUE); + + if (is_eth && + (cur_state == IB_QPS_INIT && new_state == IB_QPS_RTR)) { + u8 qpc_roce_mode = gid_type_to_qpc(gid_attr.gid_type); + + if (qpc_roce_mode == MLX4_QPC_ROCE_MODE_UNDEFINED) { + err = -EINVAL; + goto out; + } + context->rlkey_roce_mode |= (qpc_roce_mode << 6); + } + } if (attr_mask & IB_QP_TIMEOUT) { @@ -1845,7 +1956,7 @@ static int __mlx4_ib_modify_qp(struct ib_qp *ibqp, sqd_event = 0; if (!ibqp->uobject && cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) - context->rlkey |= (1 << 4); + context->rlkey_roce_mode |= (1 << 4); /* * Before passing a kernel QP to the HW, make sure that the @@ -2022,8 +2133,8 @@ out: return err; } -int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, - int attr_mask, struct ib_udata *udata) +static int _mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) { struct mlx4_ib_dev *dev = to_mdev(ibqp->device); struct mlx4_ib_qp *qp = to_mqp(ibqp); @@ -2126,6 +2237,27 @@ out: return err; } +int mlx4_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, + int attr_mask, struct ib_udata *udata) +{ + struct mlx4_ib_qp *mqp = to_mqp(ibqp); + int ret; + + ret = _mlx4_ib_modify_qp(ibqp, attr, attr_mask, udata); + + if (mqp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) { + struct mlx4_ib_sqp *sqp = to_msqp(mqp); + int err = 0; + + if (sqp->roce_v2_gsi) + err = ib_modify_qp(sqp->roce_v2_gsi, attr, attr_mask); + if (err) + pr_err("Failed to modify GSI QP for RoCEv2 (%d)\n", + err); + } + return ret; +} + static int vf_get_qp0_qkey(struct mlx4_dev *dev, int qpn, u32 *qkey) { int i; @@ -2168,7 +2300,7 @@ static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp, if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) send_size += sizeof (struct mlx4_ib_tunnel_header); - ib_ud_header_init(send_size, 1, 0, 0, 0, 0, &sqp->ud_header); + ib_ud_header_init(send_size, 1, 0, 0, 0, 0, 0, 0, &sqp->ud_header); if (sqp->qp.mlx4_ib_qp_type == MLX4_IB_QPT_PROXY_SMI_OWNER) { sqp->ud_header.lrh.service_level = @@ -2252,16 +2384,7 @@ static int build_sriov_qp0_header(struct mlx4_ib_sqp *sqp, return 0; } -static void mlx4_u64_to_smac(u8 *dst_mac, u64 src_mac) -{ - int i; - - for (i = ETH_ALEN; i; i--) { - dst_mac[i - 1] = src_mac & 0xff; - src_mac >>= 8; - } -} - +#define MLX4_ROCEV2_QP1_SPORT 0xC000 static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr, void *wqe, unsigned *mlx_seg_len) { @@ -2281,6 +2404,8 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr, bool is_eth; bool is_vlan = false; bool is_grh; + bool is_udp = false; + int ip_version = 0; send_size = 0; for (i = 0; i < wr->wr.num_sge; ++i) @@ -2289,6 +2414,8 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr, is_eth = 
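/*
 * The create/modify/destroy verbs in this patch become thin wrappers:
 * the GSI QP optionally carries a hidden companion RoCE v2 QP, and each
 * verb is replayed on it. A minimal sketch of that shadow-object shape
 * (all names below are hypothetical):
 */
#include <stdio.h>

struct qp { int id; };

struct gsi_qp {
        struct qp base;
        struct qp *roce_v2_shadow;      /* NULL unless RoCE v2 is in use */
};

static int qp_modify(struct qp *qp, int attr)
{
        (void)qp; (void)attr;           /* placeholder for the real verb */
        return 0;
}

static int gsi_modify(struct gsi_qp *gsi, int attr)
{
        int ret = qp_modify(&gsi->base, attr);

        /* Mirror onto the shadow, as mlx4_ib_modify_qp() now does; a
         * shadow failure is logged but does not fail the verb. */
        if (gsi->roce_v2_shadow && qp_modify(gsi->roce_v2_shadow, attr))
                fprintf(stderr, "shadow modify failed\n");
        return ret;
}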
rdma_port_get_link_layer(sqp->qp.ibqp.device, sqp->qp.port) == IB_LINK_LAYER_ETHERNET; is_grh = mlx4_ib_ah_grh_present(ah); if (is_eth) { + struct ib_gid_attr gid_attr; + if (mlx4_is_mfunc(to_mdev(ib_dev)->dev)) { /* When multi-function is enabled, the ib_core gid * indexes don't necessarily match the hw ones, so @@ -2302,19 +2429,35 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr, err = ib_get_cached_gid(ib_dev, be32_to_cpu(ah->av.ib.port_pd) >> 24, ah->av.ib.gid_index, &sgid, - NULL); - if (!err && !memcmp(&sgid, &zgid, sizeof(sgid))) - err = -ENOENT; - if (err) + &gid_attr); + if (!err) { + if (gid_attr.ndev) + dev_put(gid_attr.ndev); + if (!memcmp(&sgid, &zgid, sizeof(sgid))) + err = -ENOENT; + } + if (!err) { + is_udp = gid_attr.gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP; + if (is_udp) { + if (ipv6_addr_v4mapped((struct in6_addr *)&sgid)) + ip_version = 4; + else + ip_version = 6; + is_grh = false; + } + } else { return err; + } } - if (ah->av.eth.vlan != cpu_to_be16(0xffff)) { vlan = be16_to_cpu(ah->av.eth.vlan) & 0x0fff; is_vlan = 1; } } - ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, 0, &sqp->ud_header); + err = ib_ud_header_init(send_size, !is_eth, is_eth, is_vlan, is_grh, + ip_version, is_udp, 0, &sqp->ud_header); + if (err) + return err; if (!is_eth) { sqp->ud_header.lrh.service_level = @@ -2323,7 +2466,7 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr, sqp->ud_header.lrh.source_lid = cpu_to_be16(ah->av.ib.g_slid & 0x7f); } - if (is_grh) { + if (is_grh || (ip_version == 6)) { sqp->ud_header.grh.traffic_class = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff; sqp->ud_header.grh.flow_label = @@ -2352,6 +2495,25 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr, ah->av.ib.dgid, 16); } + if (ip_version == 4) { + sqp->ud_header.ip4.tos = + (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 20) & 0xff; + sqp->ud_header.ip4.id = 0; + sqp->ud_header.ip4.frag_off = htons(IP_DF); + sqp->ud_header.ip4.ttl = ah->av.eth.hop_limit; + + memcpy(&sqp->ud_header.ip4.saddr, + sgid.raw + 12, 4); + memcpy(&sqp->ud_header.ip4.daddr, ah->av.ib.dgid + 12, 4); + sqp->ud_header.ip4.check = ib_ud_ip4_csum(&sqp->ud_header); + } + + if (is_udp) { + sqp->ud_header.udp.dport = htons(ROCE_V2_UDP_DPORT); + sqp->ud_header.udp.sport = htons(MLX4_ROCEV2_QP1_SPORT); + sqp->ud_header.udp.csum = 0; + } + mlx->flags &= cpu_to_be32(MLX4_WQE_CTRL_CQ_UPDATE); if (!is_eth) { @@ -2380,34 +2542,27 @@ static int build_mlx_header(struct mlx4_ib_sqp *sqp, struct ib_ud_wr *wr, if (is_eth) { struct in6_addr in6; - + u16 ether_type; u16 pcp = (be32_to_cpu(ah->av.ib.sl_tclass_flowlabel) >> 29) << 13; + ether_type = (!is_udp) ? MLX4_IB_IBOE_ETHERTYPE : + (ip_version == 4 ? ETH_P_IP : ETH_P_IPV6); + mlx->sched_prio = cpu_to_be16(pcp); + ether_addr_copy(sqp->ud_header.eth.smac_h, ah->av.eth.s_mac); memcpy(sqp->ud_header.eth.dmac_h, ah->av.eth.mac, 6); - /* FIXME: cache smac value? 
*/ memcpy(&ctrl->srcrb_flags16[0], ah->av.eth.mac, 2); memcpy(&ctrl->imm, ah->av.eth.mac + 2, 4); memcpy(&in6, sgid.raw, sizeof(in6)); - if (!mlx4_is_mfunc(to_mdev(ib_dev)->dev)) { - u64 mac = atomic64_read(&to_mdev(ib_dev)->iboe.mac[sqp->qp.port - 1]); - u8 smac[ETH_ALEN]; - - mlx4_u64_to_smac(smac, mac); - memcpy(sqp->ud_header.eth.smac_h, smac, ETH_ALEN); - } else { - /* use the src mac of the tunnel */ - memcpy(sqp->ud_header.eth.smac_h, ah->av.eth.s_mac, ETH_ALEN); - } if (!memcmp(sqp->ud_header.eth.smac_h, sqp->ud_header.eth.dmac_h, 6)) mlx->flags |= cpu_to_be32(MLX4_WQE_CTRL_FORCE_LOOPBACK); if (!is_vlan) { - sqp->ud_header.eth.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE); + sqp->ud_header.eth.type = cpu_to_be16(ether_type); } else { - sqp->ud_header.vlan.type = cpu_to_be16(MLX4_IB_IBOE_ETHERTYPE); + sqp->ud_header.vlan.type = cpu_to_be16(ether_type); sqp->ud_header.vlan.tag = cpu_to_be16(vlan | pcp); } } else { @@ -2528,25 +2683,6 @@ static void set_reg_seg(struct mlx4_wqe_fmr_seg *fseg, fseg->reserved[1] = 0; } -static void set_bind_seg(struct mlx4_wqe_bind_seg *bseg, - struct ib_bind_mw_wr *wr) -{ - bseg->flags1 = - convert_access(wr->bind_info.mw_access_flags) & - cpu_to_be32(MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_READ | - MLX4_WQE_FMR_AND_BIND_PERM_REMOTE_WRITE | - MLX4_WQE_FMR_AND_BIND_PERM_ATOMIC); - bseg->flags2 = 0; - if (wr->mw->type == IB_MW_TYPE_2) - bseg->flags2 |= cpu_to_be32(MLX4_WQE_BIND_TYPE_2); - if (wr->bind_info.mw_access_flags & IB_ZERO_BASED) - bseg->flags2 |= cpu_to_be32(MLX4_WQE_BIND_ZERO_BASED); - bseg->new_rkey = cpu_to_be32(wr->rkey); - bseg->lkey = cpu_to_be32(wr->bind_info.mr->lkey); - bseg->addr = cpu_to_be64(wr->bind_info.addr); - bseg->length = cpu_to_be64(wr->bind_info.length); -} - static void set_local_inv_seg(struct mlx4_wqe_local_inval_seg *iseg, u32 rkey) { memset(iseg, 0, sizeof(*iseg)); @@ -2766,6 +2902,29 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, int i; struct mlx4_ib_dev *mdev = to_mdev(ibqp->device); + if (qp->mlx4_ib_qp_type == MLX4_IB_QPT_GSI) { + struct mlx4_ib_sqp *sqp = to_msqp(qp); + + if (sqp->roce_v2_gsi) { + struct mlx4_ib_ah *ah = to_mah(ud_wr(wr)->ah); + struct ib_gid_attr gid_attr; + union ib_gid gid; + + if (!ib_get_cached_gid(ibqp->device, + be32_to_cpu(ah->av.ib.port_pd) >> 24, + ah->av.ib.gid_index, &gid, + &gid_attr)) { + if (gid_attr.ndev) + dev_put(gid_attr.ndev); + qp = (gid_attr.gid_type == IB_GID_TYPE_ROCE_UDP_ENCAP) ? + to_mqp(sqp->roce_v2_gsi) : qp; + } else { + pr_err("Failed to get gid at index %d. 
RoCEv2 will not work properly\n", + ah->av.ib.gid_index); + } + } + } + spin_lock_irqsave(&qp->sq.lock, flags); if (mdev->dev->persist->state & MLX4_DEVICE_STATE_INTERNAL_ERROR) { err = -EIO; @@ -2867,13 +3026,6 @@ int mlx4_ib_post_send(struct ib_qp *ibqp, struct ib_send_wr *wr, size += sizeof(struct mlx4_wqe_fmr_seg) / 16; break; - case IB_WR_BIND_MW: - ctrl->srcrb_flags |= - cpu_to_be32(MLX4_WQE_CTRL_STRONG_ORDER); - set_bind_seg(wqe, bind_mw_wr(wr)); - wqe += sizeof(struct mlx4_wqe_bind_seg); - size += sizeof(struct mlx4_wqe_bind_seg) / 16; - break; default: /* No extra segments required for sends */ break; diff --git a/drivers/infiniband/hw/mlx4/srq.c b/drivers/infiniband/hw/mlx4/srq.c index c394376ebe06..0597f3eef5d0 100644 --- a/drivers/infiniband/hw/mlx4/srq.c +++ b/drivers/infiniband/hw/mlx4/srq.c @@ -171,7 +171,8 @@ struct ib_srq *mlx4_ib_create_srq(struct ib_pd *pd, if (err) goto err_mtt; - srq->wrid = kmalloc(srq->msrq.max * sizeof (u64), GFP_KERNEL); + srq->wrid = kmalloc_array(srq->msrq.max, sizeof(u64), + GFP_KERNEL | __GFP_NOWARN); if (!srq->wrid) { srq->wrid = __vmalloc(srq->msrq.max * sizeof(u64), GFP_KERNEL, PAGE_KERNEL); diff --git a/drivers/infiniband/hw/mlx5/ah.c b/drivers/infiniband/hw/mlx5/ah.c index 66080580e24d..745efa4cfc71 100644 --- a/drivers/infiniband/hw/mlx5/ah.c +++ b/drivers/infiniband/hw/mlx5/ah.c @@ -32,8 +32,10 @@ #include "mlx5_ib.h" -struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr, - struct mlx5_ib_ah *ah) +static struct ib_ah *create_ib_ah(struct mlx5_ib_dev *dev, + struct mlx5_ib_ah *ah, + struct ib_ah_attr *ah_attr, + enum rdma_link_layer ll) { if (ah_attr->ah_flags & IB_AH_GRH) { memcpy(ah->av.rgid, &ah_attr->grh.dgid, 16); @@ -44,9 +46,20 @@ struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr, ah->av.tclass = ah_attr->grh.traffic_class; } - ah->av.rlid = cpu_to_be16(ah_attr->dlid); - ah->av.fl_mlid = ah_attr->src_path_bits & 0x7f; - ah->av.stat_rate_sl = (ah_attr->static_rate << 4) | (ah_attr->sl & 0xf); + ah->av.stat_rate_sl = (ah_attr->static_rate << 4); + + if (ll == IB_LINK_LAYER_ETHERNET) { + memcpy(ah->av.rmac, ah_attr->dmac, sizeof(ah_attr->dmac)); + ah->av.udp_sport = + mlx5_get_roce_udp_sport(dev, + ah_attr->port_num, + ah_attr->grh.sgid_index); + ah->av.stat_rate_sl |= (ah_attr->sl & 0x7) << 1; + } else { + ah->av.rlid = cpu_to_be16(ah_attr->dlid); + ah->av.fl_mlid = ah_attr->src_path_bits & 0x7f; + ah->av.stat_rate_sl |= (ah_attr->sl & 0xf); + } return &ah->ibah; } @@ -54,12 +67,19 @@ struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr, struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr) { struct mlx5_ib_ah *ah; + struct mlx5_ib_dev *dev = to_mdev(pd->device); + enum rdma_link_layer ll; + + ll = pd->device->get_link_layer(pd->device, ah_attr->port_num); + + if (ll == IB_LINK_LAYER_ETHERNET && !(ah_attr->ah_flags & IB_AH_GRH)) + return ERR_PTR(-EINVAL); ah = kzalloc(sizeof(*ah), GFP_ATOMIC); if (!ah) return ERR_PTR(-ENOMEM); - return create_ib_ah(ah_attr, ah); /* never fails */ + return create_ib_ah(dev, ah, ah_attr, ll); /* never fails */ } int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr) diff --git a/drivers/infiniband/hw/mlx5/cq.c b/drivers/infiniband/hw/mlx5/cq.c index 92ddae101ecc..fd1de31e0611 100644 --- a/drivers/infiniband/hw/mlx5/cq.c +++ b/drivers/infiniband/hw/mlx5/cq.c @@ -154,9 +154,6 @@ static void handle_good_req(struct ib_wc *wc, struct mlx5_cqe64 *cqe, wc->opcode = IB_WC_MASKED_FETCH_ADD; wc->byte_len = 8; break; - case MLX5_OPCODE_BIND_MW: - wc->opcode = 
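/*
 * create_ib_ah() above packs the mlx5 AV's stat_rate_sl byte two ways:
 * the static rate always lands in bits 7:4; on InfiniBand the 4-bit SL
 * fills bits 3:0, while on Ethernet only a 3-bit SL is carried, shifted
 * into bits 3:1. A checked restatement:
 */
#include <assert.h>
#include <stdint.h>

static uint8_t stat_rate_sl(uint8_t static_rate, uint8_t sl, int is_eth)
{
        uint8_t v = (uint8_t)(static_rate << 4);

        if (is_eth)
                v |= (uint8_t)((sl & 0x7) << 1);
        else
                v |= sl & 0xf;
        return v;
}

int main(void)
{
        assert(stat_rate_sl(3, 5, 1) == 0x3a);  /* 0x30 | (5 << 1) */
        assert(stat_rate_sl(3, 5, 0) == 0x35);  /* 0x30 | 5 */
        return 0;
}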
IB_WC_BIND_MW; - break; case MLX5_OPCODE_UMR: wc->opcode = get_umr_comp(wq, idx); break; @@ -171,6 +168,7 @@ enum { static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe, struct mlx5_ib_qp *qp) { + enum rdma_link_layer ll = rdma_port_get_link_layer(qp->ibqp.device, 1); struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.device); struct mlx5_ib_srq *srq; struct mlx5_ib_wq *wq; @@ -236,6 +234,22 @@ static void handle_responder(struct ib_wc *wc, struct mlx5_cqe64 *cqe, } else { wc->pkey_index = 0; } + + if (ll != IB_LINK_LAYER_ETHERNET) + return; + + switch (wc->sl & 0x3) { + case MLX5_CQE_ROCE_L3_HEADER_TYPE_GRH: + wc->network_hdr_type = RDMA_NETWORK_IB; + break; + case MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV6: + wc->network_hdr_type = RDMA_NETWORK_IPV6; + break; + case MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV4: + wc->network_hdr_type = RDMA_NETWORK_IPV4; + break; + } + wc->wc_flags |= IB_WC_WITH_NETWORK_HDR_TYPE; } static void dump_cqe(struct mlx5_ib_dev *dev, struct mlx5_err_cqe *cqe) @@ -760,12 +774,12 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, int eqn; int err; - if (attr->flags) - return ERR_PTR(-EINVAL); - if (entries < 0) return ERR_PTR(-EINVAL); + if (check_cq_create_flags(attr->flags)) + return ERR_PTR(-EOPNOTSUPP); + entries = roundup_pow_of_two(entries + 1); if (entries > (1 << MLX5_CAP_GEN(dev->mdev, log_max_cq_sz))) return ERR_PTR(-EINVAL); @@ -779,6 +793,7 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, spin_lock_init(&cq->lock); cq->resize_buf = NULL; cq->resize_umem = NULL; + cq->create_flags = attr->flags; if (context) { err = create_cq_user(dev, udata, context, cq, entries, @@ -796,6 +811,10 @@ struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, cq->cqe_size = cqe_size; cqb->ctx.cqe_sz_flags = cqe_sz_to_mlx_sz(cqe_size) << 5; + + if (cq->create_flags & IB_CQ_FLAGS_IGNORE_OVERRUN) + cqb->ctx.cqe_sz_flags |= (1 << 1); + cqb->ctx.log_sz_usr_page = cpu_to_be32((ilog2(entries) << 24) | index); err = mlx5_vector2eqn(dev->mdev, vector, &eqn, &irqn); if (err) diff --git a/drivers/infiniband/hw/mlx5/main.c b/drivers/infiniband/hw/mlx5/main.c index b0ec175cc6ba..ec737e2287fe 100644 --- a/drivers/infiniband/hw/mlx5/main.c +++ b/drivers/infiniband/hw/mlx5/main.c @@ -40,6 +40,8 @@ #include <linux/io-mapping.h> #include <linux/sched.h> #include <rdma/ib_user_verbs.h> +#include <rdma/ib_addr.h> +#include <rdma/ib_cache.h> #include <linux/mlx5/vport.h> #include <rdma/ib_smi.h> #include <rdma/ib_umem.h> @@ -66,12 +68,14 @@ static char mlx5_version[] = DRIVER_NAME ": Mellanox Connect-IB Infiniband driver v" DRIVER_VERSION " (" DRIVER_RELDATE ")\n"; +enum { + MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3, +}; + static enum rdma_link_layer -mlx5_ib_port_link_layer(struct ib_device *device) +mlx5_port_type_cap_to_rdma_ll(int port_type_cap) { - struct mlx5_ib_dev *dev = to_mdev(device); - - switch (MLX5_CAP_GEN(dev->mdev, port_type)) { + switch (port_type_cap) { case MLX5_CAP_PORT_TYPE_IB: return IB_LINK_LAYER_INFINIBAND; case MLX5_CAP_PORT_TYPE_ETH: @@ -81,6 +85,202 @@ mlx5_ib_port_link_layer(struct ib_device *device) } } +static enum rdma_link_layer +mlx5_ib_port_link_layer(struct ib_device *device, u8 port_num) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + int port_type_cap = MLX5_CAP_GEN(dev->mdev, port_type); + + return mlx5_port_type_cap_to_rdma_ll(port_type_cap); +} + +static int mlx5_netdev_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + struct net_device *ndev = netdev_notifier_info_to_dev(ptr); + struct mlx5_ib_dev *ibdev = 
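/*
 * handle_responder() above reports which L3 header a RoCE packet
 * carried by decoding the low two bits of the CQE's sl field. A
 * standalone restatement of that mapping; the enum values here are
 * illustrative, the kernel takes the real ones from the mlx5 CQE
 * layout:
 */
enum l3_hdr_type { L3_GRH, L3_IPV6, L3_IPV4 };
enum network_type { NET_IB, NET_IPV6, NET_IPV4 };

static enum network_type network_hdr_type(unsigned int cqe_sl)
{
        switch (cqe_sl & 0x3) {
        case L3_IPV6:
                return NET_IPV6;
        case L3_IPV4:
                return NET_IPV4;
        case L3_GRH:
        default:
                return NET_IB;
        }
}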
container_of(this, struct mlx5_ib_dev, + roce.nb); + + if ((event != NETDEV_UNREGISTER) && (event != NETDEV_REGISTER)) + return NOTIFY_DONE; + + write_lock(&ibdev->roce.netdev_lock); + if (ndev->dev.parent == &ibdev->mdev->pdev->dev) + ibdev->roce.netdev = (event == NETDEV_UNREGISTER) ? NULL : ndev; + write_unlock(&ibdev->roce.netdev_lock); + + return NOTIFY_DONE; +} + +static struct net_device *mlx5_ib_get_netdev(struct ib_device *device, + u8 port_num) +{ + struct mlx5_ib_dev *ibdev = to_mdev(device); + struct net_device *ndev; + + /* Ensure ndev does not disappear before we invoke dev_hold() + */ + read_lock(&ibdev->roce.netdev_lock); + ndev = ibdev->roce.netdev; + if (ndev) + dev_hold(ndev); + read_unlock(&ibdev->roce.netdev_lock); + + return ndev; +} + +static int mlx5_query_port_roce(struct ib_device *device, u8 port_num, + struct ib_port_attr *props) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + struct net_device *ndev; + enum ib_mtu ndev_ib_mtu; + u16 qkey_viol_cntr; + + memset(props, 0, sizeof(*props)); + + props->port_cap_flags |= IB_PORT_CM_SUP; + props->port_cap_flags |= IB_PORT_IP_BASED_GIDS; + + props->gid_tbl_len = MLX5_CAP_ROCE(dev->mdev, + roce_address_table_size); + props->max_mtu = IB_MTU_4096; + props->max_msg_sz = 1 << MLX5_CAP_GEN(dev->mdev, log_max_msg); + props->pkey_tbl_len = 1; + props->state = IB_PORT_DOWN; + props->phys_state = 3; + + mlx5_query_nic_vport_qkey_viol_cntr(dev->mdev, &qkey_viol_cntr); + props->qkey_viol_cntr = qkey_viol_cntr; + + ndev = mlx5_ib_get_netdev(device, port_num); + if (!ndev) + return 0; + + if (netif_running(ndev) && netif_carrier_ok(ndev)) { + props->state = IB_PORT_ACTIVE; + props->phys_state = 5; + } + + ndev_ib_mtu = iboe_get_mtu(ndev->mtu); + + dev_put(ndev); + + props->active_mtu = min(props->max_mtu, ndev_ib_mtu); + + props->active_width = IB_WIDTH_4X; /* TODO */ + props->active_speed = IB_SPEED_QDR; /* TODO */ + + return 0; +} + +static void ib_gid_to_mlx5_roce_addr(const union ib_gid *gid, + const struct ib_gid_attr *attr, + void *mlx5_addr) +{ +#define MLX5_SET_RA(p, f, v) MLX5_SET(roce_addr_layout, p, f, v) + char *mlx5_addr_l3_addr = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr, + source_l3_address); + void *mlx5_addr_mac = MLX5_ADDR_OF(roce_addr_layout, mlx5_addr, + source_mac_47_32); + + if (!gid) + return; + + ether_addr_copy(mlx5_addr_mac, attr->ndev->dev_addr); + + if (is_vlan_dev(attr->ndev)) { + MLX5_SET_RA(mlx5_addr, vlan_valid, 1); + MLX5_SET_RA(mlx5_addr, vlan_id, vlan_dev_vlan_id(attr->ndev)); + } + + switch (attr->gid_type) { + case IB_GID_TYPE_IB: + MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_1); + break; + case IB_GID_TYPE_ROCE_UDP_ENCAP: + MLX5_SET_RA(mlx5_addr, roce_version, MLX5_ROCE_VERSION_2); + break; + + default: + WARN_ON(true); + } + + if (attr->gid_type != IB_GID_TYPE_IB) { + if (ipv6_addr_v4mapped((void *)gid)) + MLX5_SET_RA(mlx5_addr, roce_l3_type, + MLX5_ROCE_L3_TYPE_IPV4); + else + MLX5_SET_RA(mlx5_addr, roce_l3_type, + MLX5_ROCE_L3_TYPE_IPV6); + } + + if ((attr->gid_type == IB_GID_TYPE_IB) || + !ipv6_addr_v4mapped((void *)gid)) + memcpy(mlx5_addr_l3_addr, gid, sizeof(*gid)); + else + memcpy(&mlx5_addr_l3_addr[12], &gid->raw[12], 4); +} + +static int set_roce_addr(struct ib_device *device, u8 port_num, + unsigned int index, + const union ib_gid *gid, + const struct ib_gid_attr *attr) +{ + struct mlx5_ib_dev *dev = to_mdev(device); + u32 in[MLX5_ST_SZ_DW(set_roce_address_in)]; + u32 out[MLX5_ST_SZ_DW(set_roce_address_out)]; + void *in_addr = MLX5_ADDR_OF(set_roce_address_in, in, 
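/*
 * mlx5_ib_get_netdev() above pairs a notifier that updates a cached
 * pointer under a write lock with readers that take a reference while
 * still holding the read lock, so the device cannot be freed between
 * the load and the hold. A userspace sketch with pthreads; the _Atomic
 * counter stands in for dev_hold():
 */
#include <pthread.h>
#include <stddef.h>

struct netdev { _Atomic int refcnt; };

static pthread_rwlock_t nd_lock = PTHREAD_RWLOCK_INITIALIZER;
static struct netdev *cached_ndev;

static void netdev_event(struct netdev *nd, int registered)
{
        pthread_rwlock_wrlock(&nd_lock);
        cached_ndev = registered ? nd : NULL;   /* NETDEV_(UN)REGISTER */
        pthread_rwlock_unlock(&nd_lock);
}

static struct netdev *get_netdev(void)
{
        struct netdev *nd;

        pthread_rwlock_rdlock(&nd_lock);        /* pin the pointer... */
        nd = cached_ndev;
        if (nd)
                nd->refcnt++;                   /* ...until we hold a ref */
        pthread_rwlock_unlock(&nd_lock);
        return nd;
}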
roce_address); + enum rdma_link_layer ll = mlx5_ib_port_link_layer(device, port_num); + + if (ll != IB_LINK_LAYER_ETHERNET) + return -EINVAL; + + memset(in, 0, sizeof(in)); + + ib_gid_to_mlx5_roce_addr(gid, attr, in_addr); + + MLX5_SET(set_roce_address_in, in, roce_address_index, index); + MLX5_SET(set_roce_address_in, in, opcode, MLX5_CMD_OP_SET_ROCE_ADDRESS); + + memset(out, 0, sizeof(out)); + return mlx5_cmd_exec(dev->mdev, in, sizeof(in), out, sizeof(out)); +} + +static int mlx5_ib_add_gid(struct ib_device *device, u8 port_num, + unsigned int index, const union ib_gid *gid, + const struct ib_gid_attr *attr, + __always_unused void **context) +{ + return set_roce_addr(device, port_num, index, gid, attr); +} + +static int mlx5_ib_del_gid(struct ib_device *device, u8 port_num, + unsigned int index, __always_unused void **context) +{ + return set_roce_addr(device, port_num, index, NULL, NULL); +} + +__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num, + int index) +{ + struct ib_gid_attr attr; + union ib_gid gid; + + if (ib_get_cached_gid(&dev->ib_dev, port_num, index, &gid, &attr)) + return 0; + + if (!attr.ndev) + return 0; + + dev_put(attr.ndev); + + if (attr.gid_type != IB_GID_TYPE_ROCE_UDP_ENCAP) + return 0; + + return cpu_to_be16(MLX5_CAP_ROCE(dev->mdev, r_roce_min_src_udp_port)); +} + static int mlx5_use_mad_ifc(struct mlx5_ib_dev *dev) { return !dev->mdev->issi; @@ -97,13 +297,35 @@ static int mlx5_get_vport_access_method(struct ib_device *ibdev) if (mlx5_use_mad_ifc(to_mdev(ibdev))) return MLX5_VPORT_ACCESS_METHOD_MAD; - if (mlx5_ib_port_link_layer(ibdev) == + if (mlx5_ib_port_link_layer(ibdev, 1) == IB_LINK_LAYER_ETHERNET) return MLX5_VPORT_ACCESS_METHOD_NIC; return MLX5_VPORT_ACCESS_METHOD_HCA; } +static void get_atomic_caps(struct mlx5_ib_dev *dev, + struct ib_device_attr *props) +{ + u8 tmp; + u8 atomic_operations = MLX5_CAP_ATOMIC(dev->mdev, atomic_operations); + u8 atomic_size_qp = MLX5_CAP_ATOMIC(dev->mdev, atomic_size_qp); + u8 atomic_req_8B_endianness_mode = + MLX5_CAP_ATOMIC(dev->mdev, atomic_req_8B_endianess_mode); + + /* Check if HW supports 8 bytes standard atomic operations and capable + * of host endianness respond + */ + tmp = MLX5_ATOMIC_OPS_CMP_SWAP | MLX5_ATOMIC_OPS_FETCH_ADD; + if (((atomic_operations & tmp) == tmp) && + (atomic_size_qp & MLX5_ATOMIC_SIZE_QP_8BYTES) && + (atomic_req_8B_endianness_mode)) { + props->atomic_cap = IB_ATOMIC_HCA; + } else { + props->atomic_cap = IB_ATOMIC_NONE; + } +} + static int mlx5_query_system_image_guid(struct ib_device *ibdev, __be64 *sys_image_guid) { @@ -119,13 +341,21 @@ static int mlx5_query_system_image_guid(struct ib_device *ibdev, case MLX5_VPORT_ACCESS_METHOD_HCA: err = mlx5_query_hca_vport_system_image_guid(mdev, &tmp); - if (!err) - *sys_image_guid = cpu_to_be64(tmp); - return err; + break; + + case MLX5_VPORT_ACCESS_METHOD_NIC: + err = mlx5_query_nic_vport_system_image_guid(mdev, &tmp); + break; default: return -EINVAL; } + + if (!err) + *sys_image_guid = cpu_to_be64(tmp); + + return err; + } static int mlx5_query_max_pkeys(struct ib_device *ibdev, @@ -179,13 +409,20 @@ static int mlx5_query_node_guid(struct mlx5_ib_dev *dev, case MLX5_VPORT_ACCESS_METHOD_HCA: err = mlx5_query_hca_vport_node_guid(dev->mdev, &tmp); - if (!err) - *node_guid = cpu_to_be64(tmp); - return err; + break; + + case MLX5_VPORT_ACCESS_METHOD_NIC: + err = mlx5_query_nic_vport_node_guid(dev->mdev, &tmp); + break; default: return -EINVAL; } + + if (!err) + *node_guid = cpu_to_be64(tmp); + + return err; } struct mlx5_reg_node_desc 
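/*
 * get_atomic_caps() above advertises IB_ATOMIC_HCA only when every
 * required bit is present: both atomic ops, 8-byte operands, and
 * host-endian responses. Note "(ops & want) == want" checks all wanted
 * bits, not just any. MLX5_ATOMIC_SIZE_QP_8BYTES = 1 << 3 is visible in
 * the patch; the op bit values below are illustrative:
 */
#include <stdbool.h>

#define OPS_CMP_SWAP    (1u << 0)
#define OPS_FETCH_ADD   (1u << 1)
#define SIZE_QP_8BYTES  (1u << 3)

static bool atomic_cap_hca(unsigned int ops, unsigned int sizes,
                           bool host_endian_resp)
{
        unsigned int want = OPS_CMP_SWAP | OPS_FETCH_ADD;

        return (ops & want) == want &&
               (sizes & SIZE_QP_8BYTES) &&
               host_endian_resp;
}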
{ @@ -263,6 +500,10 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, if (MLX5_CAP_GEN(mdev, block_lb_mc)) props->device_cap_flags |= IB_DEVICE_BLOCK_MULTICAST_LOOPBACK; + if (MLX5_CAP_GEN(dev->mdev, eth_net_offloads) && + (MLX5_CAP_ETH(dev->mdev, csum_cap))) + props->device_cap_flags |= IB_DEVICE_RAW_IP_CSUM; + props->vendor_part_id = mdev->pdev->device; props->hw_ver = mdev->pdev->revision; @@ -278,7 +519,7 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->max_sge = min(max_rq_sg, max_sq_sg); props->max_sge_rd = props->max_sge; props->max_cq = 1 << MLX5_CAP_GEN(mdev, log_max_cq); - props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_eq_sz)) - 1; + props->max_cqe = (1 << MLX5_CAP_GEN(mdev, log_max_cq_sz)) - 1; props->max_mr = 1 << MLX5_CAP_GEN(mdev, log_max_mkey); props->max_pd = 1 << MLX5_CAP_GEN(mdev, log_max_pd); props->max_qp_rd_atom = 1 << MLX5_CAP_GEN(mdev, log_max_ra_req_qp); @@ -289,13 +530,15 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->max_res_rd_atom = props->max_qp_rd_atom * props->max_qp; props->max_srq_sge = max_rq_sg - 1; props->max_fast_reg_page_list_len = (unsigned int)-1; - props->atomic_cap = IB_ATOMIC_NONE; + get_atomic_caps(dev, props); props->masked_atomic_cap = IB_ATOMIC_NONE; props->max_mcast_grp = 1 << MLX5_CAP_GEN(mdev, log_max_mcg); props->max_mcast_qp_attach = MLX5_CAP_GEN(mdev, max_qp_mcg); props->max_total_mcast_qp_attach = props->max_mcast_qp_attach * props->max_mcast_grp; props->max_map_per_fmr = INT_MAX; /* no limit in ConnectIB */ + props->hca_core_clock = MLX5_CAP_GEN(mdev, device_frequency_khz); + props->timestamp_mask = 0x7FFFFFFFFFFFFFFFULL; #ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING if (MLX5_CAP_GEN(mdev, pg)) @@ -303,6 +546,9 @@ static int mlx5_ib_query_device(struct ib_device *ibdev, props->odp_caps = dev->odp_caps; #endif + if (MLX5_CAP_GEN(mdev, cd)) + props->device_cap_flags |= IB_DEVICE_CROSS_CHANNEL; + return 0; } @@ -483,6 +729,9 @@ int mlx5_ib_query_port(struct ib_device *ibdev, u8 port, case MLX5_VPORT_ACCESS_METHOD_HCA: return mlx5_query_hca_port(ibdev, port, props); + case MLX5_VPORT_ACCESS_METHOD_NIC: + return mlx5_query_port_roce(ibdev, port, props); + default: return -EINVAL; } @@ -583,8 +832,8 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, struct ib_udata *udata) { struct mlx5_ib_dev *dev = to_mdev(ibdev); - struct mlx5_ib_alloc_ucontext_req_v2 req; - struct mlx5_ib_alloc_ucontext_resp resp; + struct mlx5_ib_alloc_ucontext_req_v2 req = {}; + struct mlx5_ib_alloc_ucontext_resp resp = {}; struct mlx5_ib_ucontext *context; struct mlx5_uuar_info *uuari; struct mlx5_uar *uars; @@ -599,20 +848,22 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, if (!dev->ib_active) return ERR_PTR(-EAGAIN); - memset(&req, 0, sizeof(req)); + if (udata->inlen < sizeof(struct ib_uverbs_cmd_hdr)) + return ERR_PTR(-EINVAL); + reqlen = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr); if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req)) ver = 0; - else if (reqlen == sizeof(struct mlx5_ib_alloc_ucontext_req_v2)) + else if (reqlen >= sizeof(struct mlx5_ib_alloc_ucontext_req_v2)) ver = 2; else return ERR_PTR(-EINVAL); - err = ib_copy_from_udata(&req, udata, reqlen); + err = ib_copy_from_udata(&req, udata, min(reqlen, sizeof(req))); if (err) return ERR_PTR(err); - if (req.flags || req.reserved) + if (req.flags) return ERR_PTR(-EINVAL); if (req.total_num_uuars > MLX5_MAX_UUARS) @@ -621,6 +872,14 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct 
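/*
 * The rework of mlx5_ib_alloc_ucontext() here accepts requests both
 * shorter and longer than the structure the kernel knows: it copies
 * min(reqlen, sizeof(req)) and rejects a longer request only if the
 * bytes it cannot interpret are non-zero, which is what
 * ib_is_udata_cleared() verifies just below. A self-contained model of
 * that contract:
 */
#include <stddef.h>
#include <string.h>

struct ucontext_req { unsigned int flags, total_num_uuars; };

static int parse_ucontext_req(const unsigned char *udata, size_t reqlen,
                              struct ucontext_req *req)
{
        size_t copy = reqlen < sizeof(*req) ? reqlen : sizeof(*req);
        size_t i;

        memset(req, 0, sizeof(*req));
        memcpy(req, udata, copy);

        for (i = sizeof(*req); i < reqlen; i++) /* ib_is_udata_cleared() */
                if (udata[i])
                        return -1;              /* -EOPNOTSUPP in the patch */
        return 0;
}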
ib_device *ibdev, if (req.total_num_uuars == 0) return ERR_PTR(-EINVAL); + if (req.comp_mask || req.reserved0 || req.reserved1 || req.reserved2) + return ERR_PTR(-EOPNOTSUPP); + + if (reqlen > sizeof(req) && + !ib_is_udata_cleared(udata, sizeof(req), + reqlen - sizeof(req))) + return ERR_PTR(-EOPNOTSUPP); + req.total_num_uuars = ALIGN(req.total_num_uuars, MLX5_NON_FP_BF_REGS_PER_PAGE); if (req.num_low_latency_uuars > req.total_num_uuars - 1) @@ -636,6 +895,11 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, resp.max_send_wqebb = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); resp.max_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_qp_sz); resp.max_srq_recv_wr = 1 << MLX5_CAP_GEN(dev->mdev, log_max_srq_sz); + resp.cqe_version = min_t(__u8, + (__u8)MLX5_CAP_GEN(dev->mdev, cqe_version), + req.max_cqe_version); + resp.response_length = min(offsetof(typeof(resp), response_length) + + sizeof(resp.response_length), udata->outlen); context = kzalloc(sizeof(*context), GFP_KERNEL); if (!context) @@ -681,22 +945,49 @@ static struct ib_ucontext *mlx5_ib_alloc_ucontext(struct ib_device *ibdev, context->ibucontext.invalidate_range = &mlx5_ib_invalidate_range; #endif + if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) { + err = mlx5_core_alloc_transport_domain(dev->mdev, + &context->tdn); + if (err) + goto out_uars; + } + INIT_LIST_HEAD(&context->db_page_list); mutex_init(&context->db_page_mutex); resp.tot_uuars = req.total_num_uuars; resp.num_ports = MLX5_CAP_GEN(dev->mdev, num_ports); - err = ib_copy_to_udata(udata, &resp, - sizeof(resp) - sizeof(resp.reserved)); + + if (field_avail(typeof(resp), cqe_version, udata->outlen)) + resp.response_length += sizeof(resp.cqe_version); + + if (field_avail(typeof(resp), hca_core_clock_offset, udata->outlen)) { + resp.comp_mask |= + MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET; + resp.hca_core_clock_offset = + offsetof(struct mlx5_init_seg, internal_timer_h) % + PAGE_SIZE; + resp.response_length += sizeof(resp.hca_core_clock_offset) + + sizeof(resp.reserved2) + + sizeof(resp.reserved3); + } + + err = ib_copy_to_udata(udata, &resp, resp.response_length); if (err) - goto out_uars; + goto out_td; uuari->ver = ver; uuari->num_low_latency_uuars = req.num_low_latency_uuars; uuari->uars = uars; uuari->num_uars = num_uars; + context->cqe_version = resp.cqe_version; + return &context->ibucontext; +out_td: + if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) + mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn); + out_uars: for (i--; i >= 0; i--) mlx5_cmd_free_uar(dev->mdev, uars[i].index); @@ -721,6 +1012,9 @@ static int mlx5_ib_dealloc_ucontext(struct ib_ucontext *ibcontext) struct mlx5_uuar_info *uuari = &context->uuari; int i; + if (MLX5_CAP_GEN(dev->mdev, log_max_transport_domain)) + mlx5_core_dealloc_transport_domain(dev->mdev, context->tdn); + for (i = 0; i < uuari->num_uars; i++) { if (mlx5_cmd_free_uar(dev->mdev, uuari->uars[i].index)) mlx5_ib_warn(dev, "failed to free UAR 0x%x\n", uuari->uars[i].index); @@ -790,6 +1084,30 @@ static int mlx5_ib_mmap(struct ib_ucontext *ibcontext, struct vm_area_struct *vm case MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES: return -ENOSYS; + case MLX5_IB_MMAP_CORE_CLOCK: + if (vma->vm_end - vma->vm_start != PAGE_SIZE) + return -EINVAL; + + if (vma->vm_flags & (VM_WRITE | VM_EXEC)) + return -EPERM; + + /* Don't expose to user-space information it shouldn't have */ + if (PAGE_SIZE > 4096) + return -EOPNOTSUPP; + + vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot); + pfn = 
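/*
 * The response building above only writes fields the caller's output
 * buffer can hold, using field_avail() (defined later in mlx5_ib.h in
 * this patch) and reporting the bytes actually produced via
 * response_length. A simplified standalone model of that growth
 * pattern:
 */
#include <stddef.h>

#define field_avail(type, fld, sz) \
        (offsetof(type, fld) + sizeof(((type *)0)->fld) <= (sz))

struct uresp {
        size_t response_length;
        unsigned int tot_uuars;
        unsigned int cqe_version;       /* optional: newer-ABI field */
};

static void build_uresp(struct uresp *r, size_t outlen)
{
        size_t mandatory = offsetof(struct uresp, cqe_version);

        r->response_length = mandatory < outlen ? mandatory : outlen;
        if (field_avail(struct uresp, cqe_version, outlen)) {
                r->cqe_version = 1;
                r->response_length += sizeof(r->cqe_version);
        }
}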
(dev->mdev->iseg_base + + offsetof(struct mlx5_init_seg, internal_timer_h)) >> + PAGE_SHIFT; + if (io_remap_pfn_range(vma, vma->vm_start, pfn, + PAGE_SIZE, vma->vm_page_prot)) + return -EAGAIN; + + mlx5_ib_dbg(dev, "mapped internal timer at 0x%lx, PA 0x%llx\n", + vma->vm_start, + (unsigned long long)pfn << PAGE_SHIFT); + break; + default: return -EINVAL; } @@ -1758,6 +2076,32 @@ static void destroy_dev_resources(struct mlx5_ib_resources *devr) mlx5_ib_dealloc_pd(devr->p0); } +static u32 get_core_cap_flags(struct ib_device *ibdev) +{ + struct mlx5_ib_dev *dev = to_mdev(ibdev); + enum rdma_link_layer ll = mlx5_ib_port_link_layer(ibdev, 1); + u8 l3_type_cap = MLX5_CAP_ROCE(dev->mdev, l3_type); + u8 roce_version_cap = MLX5_CAP_ROCE(dev->mdev, roce_version); + u32 ret = 0; + + if (ll == IB_LINK_LAYER_INFINIBAND) + return RDMA_CORE_PORT_IBA_IB; + + if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV4_CAP)) + return 0; + + if (!(l3_type_cap & MLX5_ROCE_L3_TYPE_IPV6_CAP)) + return 0; + + if (roce_version_cap & MLX5_ROCE_VERSION_1_CAP) + ret |= RDMA_CORE_PORT_IBA_ROCE; + + if (roce_version_cap & MLX5_ROCE_VERSION_2_CAP) + ret |= RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP; + + return ret; +} + static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num, struct ib_port_immutable *immutable) { @@ -1770,20 +2114,50 @@ static int mlx5_port_immutable(struct ib_device *ibdev, u8 port_num, immutable->pkey_tbl_len = attr.pkey_tbl_len; immutable->gid_tbl_len = attr.gid_tbl_len; - immutable->core_cap_flags = RDMA_CORE_PORT_IBA_IB; + immutable->core_cap_flags = get_core_cap_flags(ibdev); immutable->max_mad_size = IB_MGMT_MAD_SIZE; return 0; } +static int mlx5_enable_roce(struct mlx5_ib_dev *dev) +{ + int err; + + dev->roce.nb.notifier_call = mlx5_netdev_event; + err = register_netdevice_notifier(&dev->roce.nb); + if (err) + return err; + + err = mlx5_nic_vport_enable_roce(dev->mdev); + if (err) + goto err_unregister_netdevice_notifier; + + return 0; + +err_unregister_netdevice_notifier: + unregister_netdevice_notifier(&dev->roce.nb); + return err; +} + +static void mlx5_disable_roce(struct mlx5_ib_dev *dev) +{ + mlx5_nic_vport_disable_roce(dev->mdev); + unregister_netdevice_notifier(&dev->roce.nb); +} + static void *mlx5_ib_add(struct mlx5_core_dev *mdev) { struct mlx5_ib_dev *dev; + enum rdma_link_layer ll; + int port_type_cap; int err; int i; - /* don't create IB instance over Eth ports, no RoCE yet! 
*/ - if (MLX5_CAP_GEN(mdev, port_type) == MLX5_CAP_PORT_TYPE_ETH) + port_type_cap = MLX5_CAP_GEN(mdev, port_type); + ll = mlx5_port_type_cap_to_rdma_ll(port_type_cap); + + if ((ll == IB_LINK_LAYER_ETHERNET) && !MLX5_CAP_GEN(mdev, roce)) return NULL; printk_once(KERN_INFO "%s", mlx5_version); @@ -1794,6 +2168,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) dev->mdev = mdev; + rwlock_init(&dev->roce.netdev_lock); err = get_port_caps(dev); if (err) goto err_dealloc; @@ -1843,7 +2218,12 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) dev->ib_dev.query_device = mlx5_ib_query_device; dev->ib_dev.query_port = mlx5_ib_query_port; + dev->ib_dev.get_link_layer = mlx5_ib_port_link_layer; + if (ll == IB_LINK_LAYER_ETHERNET) + dev->ib_dev.get_netdev = mlx5_ib_get_netdev; dev->ib_dev.query_gid = mlx5_ib_query_gid; + dev->ib_dev.add_gid = mlx5_ib_add_gid; + dev->ib_dev.del_gid = mlx5_ib_del_gid; dev->ib_dev.query_pkey = mlx5_ib_query_pkey; dev->ib_dev.modify_device = mlx5_ib_modify_device; dev->ib_dev.modify_port = mlx5_ib_modify_port; @@ -1893,7 +2273,7 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) (1ull << IB_USER_VERBS_CMD_CLOSE_XRCD); } - if (mlx5_ib_port_link_layer(&dev->ib_dev) == + if (mlx5_ib_port_link_layer(&dev->ib_dev, 1) == IB_LINK_LAYER_ETHERNET) { dev->ib_dev.create_flow = mlx5_ib_create_flow; dev->ib_dev.destroy_flow = mlx5_ib_destroy_flow; @@ -1908,9 +2288,15 @@ static void *mlx5_ib_add(struct mlx5_core_dev *mdev) mutex_init(&dev->flow_db.lock); mutex_init(&dev->cap_mask_mutex); + if (ll == IB_LINK_LAYER_ETHERNET) { + err = mlx5_enable_roce(dev); + if (err) + goto err_dealloc; + } + err = create_dev_resources(&dev->devr); if (err) - goto err_dealloc; + goto err_disable_roce; err = mlx5_ib_odp_init_one(dev); if (err) @@ -1947,6 +2333,10 @@ err_odp: err_rsrc: destroy_dev_resources(&dev->devr); +err_disable_roce: + if (ll == IB_LINK_LAYER_ETHERNET) + mlx5_disable_roce(dev); + err_dealloc: ib_dealloc_device((struct ib_device *)dev); @@ -1956,11 +2346,14 @@ err_dealloc: static void mlx5_ib_remove(struct mlx5_core_dev *mdev, void *context) { struct mlx5_ib_dev *dev = context; + enum rdma_link_layer ll = mlx5_ib_port_link_layer(&dev->ib_dev, 1); ib_unregister_device(&dev->ib_dev); destroy_umrc_res(dev); mlx5_ib_odp_remove_one(dev); destroy_dev_resources(&dev->devr); + if (ll == IB_LINK_LAYER_ETHERNET) + mlx5_disable_roce(dev); ib_dealloc_device(&dev->ib_dev); } diff --git a/drivers/infiniband/hw/mlx5/mlx5_ib.h b/drivers/infiniband/hw/mlx5/mlx5_ib.h index 1474cccd1e0f..d2b9737baa36 100644 --- a/drivers/infiniband/hw/mlx5/mlx5_ib.h +++ b/drivers/infiniband/hw/mlx5/mlx5_ib.h @@ -42,6 +42,7 @@ #include <linux/mlx5/qp.h> #include <linux/mlx5/srq.h> #include <linux/types.h> +#include <linux/mlx5/transobj.h> #define mlx5_ib_dbg(dev, format, arg...) 
\ pr_debug("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ @@ -55,6 +56,11 @@ pr_err("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ pr_warn("%s:%s:%d:(pid %d): " format, (dev)->ib_dev.name, __func__, \ __LINE__, current->pid, ##arg) +#define field_avail(type, fld, sz) (offsetof(type, fld) + \ + sizeof(((type *)0)->fld) <= (sz)) +#define MLX5_IB_DEFAULT_UIDX 0xffffff +#define MLX5_USER_ASSIGNED_UIDX_MASK __mlx5_mask(qpc, user_index) + enum { MLX5_IB_MMAP_CMD_SHIFT = 8, MLX5_IB_MMAP_CMD_MASK = 0xff, @@ -62,7 +68,9 @@ enum { enum mlx5_ib_mmap_cmd { MLX5_IB_MMAP_REGULAR_PAGE = 0, - MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES = 1, /* always last */ + MLX5_IB_MMAP_GET_CONTIGUOUS_PAGES = 1, + /* 5 is chosen in order to be compatible with old versions of libmlx5 */ + MLX5_IB_MMAP_CORE_CLOCK = 5, }; enum { @@ -85,6 +93,15 @@ enum mlx5_ib_mad_ifc_flags { MLX5_MAD_IFC_NET_VIEW = 4, }; +enum { + MLX5_CROSS_CHANNEL_UUAR = 0, +}; + +enum { + MLX5_CQE_VERSION_V0, + MLX5_CQE_VERSION_V1, +}; + struct mlx5_ib_ucontext { struct ib_ucontext ibucontext; struct list_head db_page_list; @@ -93,6 +110,9 @@ struct mlx5_ib_ucontext { */ struct mutex db_page_mutex; struct mlx5_uuar_info uuari; + u8 cqe_version; + /* Transport Domain number */ + u32 tdn; }; static inline struct mlx5_ib_ucontext *to_mucontext(struct ib_ucontext *ibucontext) @@ -201,47 +221,70 @@ struct mlx5_ib_pfault { struct mlx5_pagefault mpfault; }; +struct mlx5_ib_ubuffer { + struct ib_umem *umem; + int buf_size; + u64 buf_addr; +}; + +struct mlx5_ib_qp_base { + struct mlx5_ib_qp *container_mibqp; + struct mlx5_core_qp mqp; + struct mlx5_ib_ubuffer ubuffer; +}; + +struct mlx5_ib_qp_trans { + struct mlx5_ib_qp_base base; + u16 xrcdn; + u8 alt_port; + u8 atomic_rd_en; + u8 resp_depth; +}; + struct mlx5_ib_rq { + struct mlx5_ib_qp_base base; + struct mlx5_ib_wq *rq; + struct mlx5_ib_ubuffer ubuffer; + struct mlx5_db *doorbell; u32 tirn; + u8 state; +}; + +struct mlx5_ib_sq { + struct mlx5_ib_qp_base base; + struct mlx5_ib_wq *sq; + struct mlx5_ib_ubuffer ubuffer; + struct mlx5_db *doorbell; + u32 tisn; + u8 state; }; struct mlx5_ib_raw_packet_qp { + struct mlx5_ib_sq sq; struct mlx5_ib_rq rq; }; struct mlx5_ib_qp { struct ib_qp ibqp; union { - struct mlx5_core_qp mqp; - struct mlx5_ib_raw_packet_qp raw_packet_qp; + struct mlx5_ib_qp_trans trans_qp; + struct mlx5_ib_raw_packet_qp raw_packet_qp; }; - struct mlx5_buf buf; struct mlx5_db db; struct mlx5_ib_wq rq; - u32 doorbell_qpn; u8 sq_signal_bits; u8 fm_cache; - int sq_max_wqes_per_wr; - int sq_spare_wqes; struct mlx5_ib_wq sq; - struct ib_umem *umem; - int buf_size; - /* serialize qp state modifications */ struct mutex mutex; - u16 xrcdn; u32 flags; u8 port; - u8 alt_port; - u8 atomic_rd_en; - u8 resp_depth; u8 state; - int mlx_type; int wq_sig; int scat_cqe; int max_inline_data; @@ -284,6 +327,9 @@ struct mlx5_ib_cq_buf { enum mlx5_ib_qp_flags { MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK = 1 << 0, MLX5_IB_QP_SIGNATURE_HANDLING = 1 << 1, + MLX5_IB_QP_CROSS_CHANNEL = 1 << 2, + MLX5_IB_QP_MANAGED_SEND = 1 << 3, + MLX5_IB_QP_MANAGED_RECV = 1 << 4, }; struct mlx5_umr_wr { @@ -326,6 +372,7 @@ struct mlx5_ib_cq { struct mlx5_ib_cq_buf *resize_buf; struct ib_umem *resize_umem; int cqe_size; + u32 create_flags; }; struct mlx5_ib_srq { @@ -449,9 +496,19 @@ struct mlx5_ib_resources { struct ib_srq *s1; }; +struct mlx5_roce { + /* Protect mlx5_ib_get_netdev from invoking dev_hold() with a NULL + * netdev pointer + */ + rwlock_t netdev_lock; + struct net_device *netdev; + struct notifier_block nb; 
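/*
 * The mlx5_ib_qp_base introduced above replaces a direct container_of()
 * with an explicit owner back-pointer: a raw-packet QP owns *two* bases
 * (one for the SQ, one for the RQ), so a single container_of() to
 * struct mlx5_ib_qp can no longer be valid for both. A checked model of
 * the indirection that to_mibqp() switches to:
 */
#include <assert.h>
#include <stddef.h>

#define container_of(ptr, type, member) \
        ((type *)((char *)(ptr) - offsetof(type, member)))

struct core_qp { int qpn; };
struct qp;

struct qp_base {
        struct qp *container_mibqp;     /* explicit owner back-pointer */
        struct core_qp mqp;
};

struct qp {
        struct qp_base sq_base;         /* a raw-packet QP owns two bases */
        struct qp_base rq_base;
};

static struct qp *to_qp(struct core_qp *mqp)
{
        return container_of(mqp, struct qp_base, mqp)->container_mibqp;
}

int main(void)
{
        struct qp q;

        q.sq_base.container_mibqp = &q;
        q.rq_base.container_mibqp = &q;
        assert(to_qp(&q.sq_base.mqp) == &q);
        assert(to_qp(&q.rq_base.mqp) == &q);
        return 0;
}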
+}; + struct mlx5_ib_dev { struct ib_device ib_dev; struct mlx5_core_dev *mdev; + struct mlx5_roce roce; MLX5_DECLARE_DOORBELL_LOCK(uar_lock); int num_ports; /* serialize update of capability mask @@ -498,7 +555,7 @@ static inline struct mlx5_ib_cq *to_mcq(struct ib_cq *ibcq) static inline struct mlx5_ib_qp *to_mibqp(struct mlx5_core_qp *mqp) { - return container_of(mqp, struct mlx5_ib_qp, mqp); + return container_of(mqp, struct mlx5_ib_qp_base, mqp)->container_mibqp; } static inline struct mlx5_ib_mr *to_mibmr(struct mlx5_core_mr *mmr) @@ -550,8 +607,6 @@ void mlx5_ib_free_srq_wqe(struct mlx5_ib_srq *srq, int wqe_index); int mlx5_MAD_IFC(struct mlx5_ib_dev *dev, int ignore_mkey, int ignore_bkey, u8 port, const struct ib_wc *in_wc, const struct ib_grh *in_grh, const void *in_mad, void *response_mad); -struct ib_ah *create_ib_ah(struct ib_ah_attr *ah_attr, - struct mlx5_ib_ah *ah); struct ib_ah *mlx5_ib_create_ah(struct ib_pd *pd, struct ib_ah_attr *ah_attr); int mlx5_ib_query_ah(struct ib_ah *ibah, struct ib_ah_attr *ah_attr); int mlx5_ib_destroy_ah(struct ib_ah *ah); @@ -578,7 +633,8 @@ int mlx5_ib_post_recv(struct ib_qp *ibqp, struct ib_recv_wr *wr, struct ib_recv_wr **bad_wr); void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n); int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index, - void *buffer, u32 length); + void *buffer, u32 length, + struct mlx5_ib_qp_base *base); struct ib_cq *mlx5_ib_create_cq(struct ib_device *ibdev, const struct ib_cq_init_attr *attr, struct ib_ucontext *context, @@ -680,6 +736,9 @@ static inline void mlx5_ib_qp_enable_pagefaults(struct mlx5_ib_qp *qp) {} #endif /* CONFIG_INFINIBAND_ON_DEMAND_PAGING */ +__be16 mlx5_get_roce_udp_sport(struct mlx5_ib_dev *dev, u8 port_num, + int index); + static inline void init_query_mad(struct ib_smp *mad) { mad->base_version = 1; @@ -705,4 +764,28 @@ static inline int is_qp1(enum ib_qp_type qp_type) #define MLX5_MAX_UMR_SHIFT 16 #define MLX5_MAX_UMR_PAGES (1 << MLX5_MAX_UMR_SHIFT) +static inline u32 check_cq_create_flags(u32 flags) +{ + /* + * It returns non-zero value for unsupported CQ + * create flags, otherwise it returns zero. 
+ */ + return (flags & ~(IB_CQ_FLAGS_IGNORE_OVERRUN | + IB_CQ_FLAGS_TIMESTAMP_COMPLETION)); +} + +static inline int verify_assign_uidx(u8 cqe_version, u32 cmd_uidx, + u32 *user_index) +{ + if (cqe_version) { + if ((cmd_uidx == MLX5_IB_DEFAULT_UIDX) || + (cmd_uidx & ~MLX5_USER_ASSIGNED_UIDX_MASK)) + return -EINVAL; + *user_index = cmd_uidx; + } else { + *user_index = MLX5_IB_DEFAULT_UIDX; + } + + return 0; +} #endif /* MLX5_IB_H */ diff --git a/drivers/infiniband/hw/mlx5/odp.c b/drivers/infiniband/hw/mlx5/odp.c index aa8391e75385..b8d76361a48d 100644 --- a/drivers/infiniband/hw/mlx5/odp.c +++ b/drivers/infiniband/hw/mlx5/odp.c @@ -153,14 +153,16 @@ static struct mlx5_ib_mr *mlx5_ib_odp_find_mr_lkey(struct mlx5_ib_dev *dev, static void mlx5_ib_page_fault_resume(struct mlx5_ib_qp *qp, struct mlx5_ib_pfault *pfault, - int error) { + int error) +{ struct mlx5_ib_dev *dev = to_mdev(qp->ibqp.pd->device); - int ret = mlx5_core_page_fault_resume(dev->mdev, qp->mqp.qpn, + u32 qpn = qp->trans_qp.base.mqp.qpn; + int ret = mlx5_core_page_fault_resume(dev->mdev, + qpn, pfault->mpfault.flags, error); if (ret) - pr_err("Failed to resolve the page fault on QP 0x%x\n", - qp->mqp.qpn); + pr_err("Failed to resolve the page fault on QP 0x%x\n", qpn); } /* @@ -391,6 +393,7 @@ static int mlx5_ib_mr_initiator_pfault_handler( #if defined(DEBUG) u32 ctrl_wqe_index, ctrl_qpn; #endif + u32 qpn = qp->trans_qp.base.mqp.qpn; ds = be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_DS_MASK; if (ds * MLX5_WQE_DS_UNITS > wqe_length) { @@ -401,7 +404,7 @@ static int mlx5_ib_mr_initiator_pfault_handler( if (ds == 0) { mlx5_ib_err(dev, "Got WQE with zero DS. wqe_index=%x, qpn=%x\n", - wqe_index, qp->mqp.qpn); + wqe_index, qpn); return -EFAULT; } @@ -411,16 +414,16 @@ static int mlx5_ib_mr_initiator_pfault_handler( MLX5_WQE_CTRL_WQE_INDEX_SHIFT; if (wqe_index != ctrl_wqe_index) { mlx5_ib_err(dev, "Got WQE with invalid wqe_index. wqe_index=0x%x, qpn=0x%x ctrl->wqe_index=0x%x\n", - wqe_index, qp->mqp.qpn, + wqe_index, qpn, ctrl_wqe_index); return -EFAULT; } ctrl_qpn = (be32_to_cpu(ctrl->qpn_ds) & MLX5_WQE_CTRL_QPN_MASK) >> MLX5_WQE_CTRL_QPN_SHIFT; - if (qp->mqp.qpn != ctrl_qpn) { + if (qpn != ctrl_qpn) { mlx5_ib_err(dev, "Got WQE with incorrect QP number. wqe_index=0x%x, qpn=0x%x ctrl->qpn=0x%x\n", - wqe_index, qp->mqp.qpn, + wqe_index, qpn, ctrl_qpn); return -EFAULT; } @@ -537,6 +540,7 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp, int resume_with_error = 0; u16 wqe_index = pfault->mpfault.wqe.wqe_index; int requestor = pfault->mpfault.flags & MLX5_PFAULT_REQUESTOR; + u32 qpn = qp->trans_qp.base.mqp.qpn; buffer = (char *)__get_free_page(GFP_KERNEL); if (!buffer) { @@ -546,10 +550,10 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp, } ret = mlx5_ib_read_user_wqe(qp, requestor, wqe_index, buffer, - PAGE_SIZE); + PAGE_SIZE, &qp->trans_qp.base); if (ret < 0) { mlx5_ib_err(dev, "Failed reading a WQE following page fault, error=%x, wqe_index=%x, qpn=%x\n", - -ret, wqe_index, qp->mqp.qpn); + -ret, wqe_index, qpn); resume_with_error = 1; goto resolve_page_fault; } @@ -586,7 +590,8 @@ static void mlx5_ib_mr_wqe_pfault_handler(struct mlx5_ib_qp *qp, resolve_page_fault: mlx5_ib_page_fault_resume(qp, pfault, resume_with_error); mlx5_ib_dbg(dev, "PAGE FAULT completed. 
QP 0x%x resume_with_error=%d, flags: 0x%x\n", - qp->mqp.qpn, resume_with_error, pfault->mpfault.flags); + qpn, resume_with_error, + pfault->mpfault.flags); free_page((unsigned long)buffer); } @@ -753,7 +758,7 @@ void mlx5_ib_odp_create_qp(struct mlx5_ib_qp *qp) qp->disable_page_faults = 1; spin_lock_init(&qp->disable_page_faults_lock); - qp->mqp.pfault_handler = mlx5_ib_pfault_handler; + qp->trans_qp.base.mqp.pfault_handler = mlx5_ib_pfault_handler; for (i = 0; i < MLX5_IB_PAGEFAULT_CONTEXTS; ++i) INIT_WORK(&qp->pagefaults[i].work, mlx5_ib_qp_pfault_action); diff --git a/drivers/infiniband/hw/mlx5/qp.c b/drivers/infiniband/hw/mlx5/qp.c index 307bdbca8938..8fb9c27485e1 100644 --- a/drivers/infiniband/hw/mlx5/qp.c +++ b/drivers/infiniband/hw/mlx5/qp.c @@ -32,6 +32,8 @@ #include <linux/module.h> #include <rdma/ib_umem.h> +#include <rdma/ib_cache.h> +#include <rdma/ib_user_verbs.h> #include "mlx5_ib.h" #include "user.h" @@ -114,14 +116,15 @@ void *mlx5_get_send_wqe(struct mlx5_ib_qp *qp, int n) * Return: the number of bytes copied, or an error code. */ int mlx5_ib_read_user_wqe(struct mlx5_ib_qp *qp, int send, int wqe_index, - void *buffer, u32 length) + void *buffer, u32 length, + struct mlx5_ib_qp_base *base) { struct ib_device *ibdev = qp->ibqp.device; struct mlx5_ib_dev *dev = to_mdev(ibdev); struct mlx5_ib_wq *wq = send ? &qp->sq : &qp->rq; size_t offset; size_t wq_end; - struct ib_umem *umem = qp->umem; + struct ib_umem *umem = base->ubuffer.umem; u32 first_copy_length; int wqe_length; int ret; @@ -172,8 +175,10 @@ static void mlx5_ib_qp_event(struct mlx5_core_qp *qp, int type) struct ib_qp *ibqp = &to_mibqp(qp)->ibqp; struct ib_event event; - if (type == MLX5_EVENT_TYPE_PATH_MIG) - to_mibqp(qp)->port = to_mibqp(qp)->alt_port; + if (type == MLX5_EVENT_TYPE_PATH_MIG) { + /* This event is only valid for trans_qps */ + to_mibqp(qp)->port = to_mibqp(qp)->trans_qp.alt_port; + } if (ibqp->event_handler) { event.device = ibqp->device; @@ -366,7 +371,9 @@ static int calc_sq_size(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *attr, static int set_user_buf_size(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, - struct mlx5_ib_create_qp *ucmd) + struct mlx5_ib_create_qp *ucmd, + struct mlx5_ib_qp_base *base, + struct ib_qp_init_attr *attr) { int desc_sz = 1 << qp->sq.wqe_shift; @@ -391,8 +398,13 @@ static int set_user_buf_size(struct mlx5_ib_dev *dev, return -EINVAL; } - qp->buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + - (qp->sq.wqe_cnt << 6); + if (attr->qp_type == IB_QPT_RAW_PACKET) { + base->ubuffer.buf_size = qp->rq.wqe_cnt << qp->rq.wqe_shift; + qp->raw_packet_qp.sq.ubuffer.buf_size = qp->sq.wqe_cnt << 6; + } else { + base->ubuffer.buf_size = (qp->rq.wqe_cnt << qp->rq.wqe_shift) + + (qp->sq.wqe_cnt << 6); + } return 0; } @@ -578,8 +590,8 @@ static int to_mlx5_st(enum ib_qp_type type) case IB_QPT_SMI: return MLX5_QP_ST_QP0; case IB_QPT_GSI: return MLX5_QP_ST_QP1; case IB_QPT_RAW_IPV6: return MLX5_QP_ST_RAW_IPV6; - case IB_QPT_RAW_ETHERTYPE: return MLX5_QP_ST_RAW_ETHERTYPE; case IB_QPT_RAW_PACKET: + case IB_QPT_RAW_ETHERTYPE: return MLX5_QP_ST_RAW_ETHERTYPE; case IB_QPT_MAX: default: return -EINVAL; } @@ -590,13 +602,51 @@ static int uuarn_to_uar_index(struct mlx5_uuar_info *uuari, int uuarn) return uuari->uars[uuarn / MLX5_BF_REGS_PER_PAGE].index; } +static int mlx5_ib_umem_get(struct mlx5_ib_dev *dev, + struct ib_pd *pd, + unsigned long addr, size_t size, + struct ib_umem **umem, + int *npages, int *page_shift, int *ncont, + u32 *offset) +{ + int err; + + *umem = 
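/*
 * The set_user_buf_size() change above splits the user buffer for
 * IB_QPT_RAW_PACKET: the RQ stays in the base ubuffer while the SQ gets
 * its own buffer (each send WQE basic block is 64 bytes, hence the
 * "<< 6"); every other QP type keeps the single combined buffer. A
 * checked restatement of the arithmetic:
 */
#include <assert.h>
#include <stddef.h>

#define SEND_WQE_BB_SHIFT 6             /* 64-byte send WQE basic blocks */

static void buf_sizes(int raw_packet, size_t rq_cnt, int rq_shift,
                      size_t sq_cnt, size_t *base_sz, size_t *sq_sz)
{
        if (raw_packet) {
                *base_sz = rq_cnt << rq_shift;
                *sq_sz = sq_cnt << SEND_WQE_BB_SHIFT;
        } else {
                *base_sz = (rq_cnt << rq_shift) +
                           (sq_cnt << SEND_WQE_BB_SHIFT);
                *sq_sz = 0;
        }
}

int main(void)
{
        size_t base, sq;

        buf_sizes(1, 256, 6, 128, &base, &sq);
        assert(base == 16384 && sq == 8192);
        return 0;
}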
ib_umem_get(pd->uobject->context, addr, size, 0, 0); + if (IS_ERR(*umem)) { + mlx5_ib_dbg(dev, "umem_get failed\n"); + return PTR_ERR(*umem); + } + + mlx5_ib_cont_pages(*umem, addr, npages, page_shift, ncont, NULL); + + err = mlx5_ib_get_buf_offset(addr, *page_shift, offset); + if (err) { + mlx5_ib_warn(dev, "bad offset\n"); + goto err_umem; + } + + mlx5_ib_dbg(dev, "addr 0x%lx, size %zu, npages %d, page_shift %d, ncont %d, offset %d\n", + addr, size, *npages, *page_shift, *ncont, *offset); + + return 0; + +err_umem: + ib_umem_release(*umem); + *umem = NULL; + + return err; +} + static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct mlx5_ib_qp *qp, struct ib_udata *udata, + struct ib_qp_init_attr *attr, struct mlx5_create_qp_mbox_in **in, - struct mlx5_ib_create_qp_resp *resp, int *inlen) + struct mlx5_ib_create_qp_resp *resp, int *inlen, + struct mlx5_ib_qp_base *base) { struct mlx5_ib_ucontext *context; struct mlx5_ib_create_qp ucmd; + struct mlx5_ib_ubuffer *ubuffer = &base->ubuffer; int page_shift = 0; int uar_index; int npages; @@ -615,18 +665,23 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, /* * TBD: should come from the verbs when we have the API */ - uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_HIGH); - if (uuarn < 0) { - mlx5_ib_dbg(dev, "failed to allocate low latency UUAR\n"); - mlx5_ib_dbg(dev, "reverting to medium latency\n"); - uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_MEDIUM); + if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) + /* In CROSS_CHANNEL CQ and QP must use the same UAR */ + uuarn = MLX5_CROSS_CHANNEL_UUAR; + else { + uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_HIGH); if (uuarn < 0) { - mlx5_ib_dbg(dev, "failed to allocate medium latency UUAR\n"); - mlx5_ib_dbg(dev, "reverting to high latency\n"); - uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW); + mlx5_ib_dbg(dev, "failed to allocate low latency UUAR\n"); + mlx5_ib_dbg(dev, "reverting to medium latency\n"); + uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_MEDIUM); if (uuarn < 0) { - mlx5_ib_warn(dev, "uuar allocation failed\n"); - return uuarn; + mlx5_ib_dbg(dev, "failed to allocate medium latency UUAR\n"); + mlx5_ib_dbg(dev, "reverting to high latency\n"); + uuarn = alloc_uuar(&context->uuari, MLX5_IB_LATENCY_CLASS_LOW); + if (uuarn < 0) { + mlx5_ib_warn(dev, "uuar allocation failed\n"); + return uuarn; + } } } } @@ -638,32 +693,20 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, qp->sq.wqe_shift = ilog2(MLX5_SEND_WQE_BB); qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; - err = set_user_buf_size(dev, qp, &ucmd); + err = set_user_buf_size(dev, qp, &ucmd, base, attr); if (err) goto err_uuar; - if (ucmd.buf_addr && qp->buf_size) { - qp->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, - qp->buf_size, 0, 0); - if (IS_ERR(qp->umem)) { - mlx5_ib_dbg(dev, "umem_get failed\n"); - err = PTR_ERR(qp->umem); + if (ucmd.buf_addr && ubuffer->buf_size) { + ubuffer->buf_addr = ucmd.buf_addr; + err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr, + ubuffer->buf_size, + &ubuffer->umem, &npages, &page_shift, + &ncont, &offset); + if (err) goto err_uuar; - } } else { - qp->umem = NULL; - } - - if (qp->umem) { - mlx5_ib_cont_pages(qp->umem, ucmd.buf_addr, &npages, &page_shift, - &ncont, NULL); - err = mlx5_ib_get_buf_offset(ucmd.buf_addr, page_shift, &offset); - if (err) { - mlx5_ib_warn(dev, "bad offset\n"); - goto err_umem; - } - mlx5_ib_dbg(dev, "addr 0x%llx, size %d, npages %d, page_shift 
%d, ncont %d, offset %d\n", - ucmd.buf_addr, qp->buf_size, npages, page_shift, ncont, offset); + ubuffer->umem = NULL; } *inlen = sizeof(**in) + sizeof(*(*in)->pas) * ncont; @@ -672,8 +715,9 @@ static int create_user_qp(struct mlx5_ib_dev *dev, struct ib_pd *pd, err = -ENOMEM; goto err_umem; } - if (qp->umem) - mlx5_ib_populate_pas(dev, qp->umem, page_shift, (*in)->pas, 0); + if (ubuffer->umem) + mlx5_ib_populate_pas(dev, ubuffer->umem, page_shift, + (*in)->pas, 0); (*in)->ctx.log_pg_sz_remote_qpn = cpu_to_be32((page_shift - MLX5_ADAPTER_PAGE_SHIFT) << 24); (*in)->ctx.params2 = cpu_to_be32(offset << 6); @@ -704,29 +748,31 @@ err_free: kvfree(*in); err_umem: - if (qp->umem) - ib_umem_release(qp->umem); + if (ubuffer->umem) + ib_umem_release(ubuffer->umem); err_uuar: free_uuar(&context->uuari, uuarn); return err; } -static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp) +static void destroy_qp_user(struct ib_pd *pd, struct mlx5_ib_qp *qp, + struct mlx5_ib_qp_base *base) { struct mlx5_ib_ucontext *context; context = to_mucontext(pd->uobject->context); mlx5_ib_db_unmap_user(context, &qp->db); - if (qp->umem) - ib_umem_release(qp->umem); + if (base->ubuffer.umem) + ib_umem_release(base->ubuffer.umem); free_uuar(&context->uuari, qp->uuarn); } static int create_kernel_qp(struct mlx5_ib_dev *dev, struct ib_qp_init_attr *init_attr, struct mlx5_ib_qp *qp, - struct mlx5_create_qp_mbox_in **in, int *inlen) + struct mlx5_create_qp_mbox_in **in, int *inlen, + struct mlx5_ib_qp_base *base) { enum mlx5_ib_latency_class lc = MLX5_IB_LATENCY_CLASS_LOW; struct mlx5_uuar_info *uuari; @@ -758,9 +804,9 @@ static int create_kernel_qp(struct mlx5_ib_dev *dev, qp->rq.offset = 0; qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; - qp->buf_size = err + (qp->rq.wqe_cnt << qp->rq.wqe_shift); + base->ubuffer.buf_size = err + (qp->rq.wqe_cnt << qp->rq.wqe_shift); - err = mlx5_buf_alloc(dev->mdev, qp->buf_size, &qp->buf); + err = mlx5_buf_alloc(dev->mdev, base->ubuffer.buf_size, &qp->buf); if (err) { mlx5_ib_dbg(dev, "err %d\n", err); goto err_uuar; @@ -853,19 +899,304 @@ static int is_connected(enum ib_qp_type qp_type) return 0; } +static int create_raw_packet_qp_tis(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq, u32 tdn) +{ + u32 in[MLX5_ST_SZ_DW(create_tis_in)]; + void *tisc = MLX5_ADDR_OF(create_tis_in, in, ctx); + + memset(in, 0, sizeof(in)); + + MLX5_SET(tisc, tisc, transport_domain, tdn); + + return mlx5_core_create_tis(dev->mdev, in, sizeof(in), &sq->tisn); +} + +static void destroy_raw_packet_qp_tis(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq) +{ + mlx5_core_destroy_tis(dev->mdev, sq->tisn); +} + +static int create_raw_packet_qp_sq(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq, void *qpin, + struct ib_pd *pd) +{ + struct mlx5_ib_ubuffer *ubuffer = &sq->ubuffer; + __be64 *pas; + void *in; + void *sqc; + void *qpc = MLX5_ADDR_OF(create_qp_in, qpin, qpc); + void *wq; + int inlen; + int err; + int page_shift = 0; + int npages; + int ncont = 0; + u32 offset = 0; + + err = mlx5_ib_umem_get(dev, pd, ubuffer->buf_addr, ubuffer->buf_size, + &sq->ubuffer.umem, &npages, &page_shift, + &ncont, &offset); + if (err) + return err; + + inlen = MLX5_ST_SZ_BYTES(create_sq_in) + sizeof(u64) * ncont; + in = mlx5_vzalloc(inlen); + if (!in) { + err = -ENOMEM; + goto err_umem; + } + + sqc = MLX5_ADDR_OF(create_sq_in, in, ctx); + MLX5_SET(sqc, sqc, flush_in_error_en, 1); + MLX5_SET(sqc, sqc, state, MLX5_SQC_STATE_RST); + MLX5_SET(sqc, sqc, user_index, MLX5_GET(qpc, qpc, user_index)); + MLX5_SET(sqc, sqc, cqn, 
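/*
 * A compact model of the UUAR selection rewritten in create_user_qp()
 * above: a cross-channel QP must share UUAR 0 with its CQs, every other
 * QP walks a best-effort ladder of latency classes until one allocation
 * succeeds. alloc_uuar() below is a stub standing in for the driver's
 * allocator.
 */
enum lat_class { CLASS_HIGH, CLASS_MEDIUM, CLASS_LOW, CLASS_COUNT };

static int alloc_uuar(enum lat_class c)
{
        return c == CLASS_LOW ? 1 : -1; /* stub: only the last class works */
}

static int pick_uuarn(int cross_channel)
{
        int c, uuarn = -1;

        if (cross_channel)
                return 0;               /* MLX5_CROSS_CHANNEL_UUAR */

        for (c = CLASS_HIGH; c < CLASS_COUNT && uuarn < 0; c++)
                uuarn = alloc_uuar(c);
        return uuarn;
}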
MLX5_GET(qpc, qpc, cqn_snd)); + MLX5_SET(sqc, sqc, tis_lst_sz, 1); + MLX5_SET(sqc, sqc, tis_num_0, sq->tisn); + + wq = MLX5_ADDR_OF(sqc, sqc, wq); + MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); + MLX5_SET(wq, wq, pd, MLX5_GET(qpc, qpc, pd)); + MLX5_SET(wq, wq, uar_page, MLX5_GET(qpc, qpc, uar_page)); + MLX5_SET64(wq, wq, dbr_addr, MLX5_GET64(qpc, qpc, dbr_addr)); + MLX5_SET(wq, wq, log_wq_stride, ilog2(MLX5_SEND_WQE_BB)); + MLX5_SET(wq, wq, log_wq_sz, MLX5_GET(qpc, qpc, log_sq_size)); + MLX5_SET(wq, wq, log_wq_pg_sz, page_shift - MLX5_ADAPTER_PAGE_SHIFT); + MLX5_SET(wq, wq, page_offset, offset); + + pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); + mlx5_ib_populate_pas(dev, sq->ubuffer.umem, page_shift, pas, 0); + + err = mlx5_core_create_sq_tracked(dev->mdev, in, inlen, &sq->base.mqp); + + kvfree(in); + + if (err) + goto err_umem; + + return 0; + +err_umem: + ib_umem_release(sq->ubuffer.umem); + sq->ubuffer.umem = NULL; + + return err; +} + +static void destroy_raw_packet_qp_sq(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq) +{ + mlx5_core_destroy_sq_tracked(dev->mdev, &sq->base.mqp); + ib_umem_release(sq->ubuffer.umem); +} + +static int get_rq_pas_size(void *qpc) +{ + u32 log_page_size = MLX5_GET(qpc, qpc, log_page_size) + 12; + u32 log_rq_stride = MLX5_GET(qpc, qpc, log_rq_stride); + u32 log_rq_size = MLX5_GET(qpc, qpc, log_rq_size); + u32 page_offset = MLX5_GET(qpc, qpc, page_offset); + u32 po_quanta = 1 << (log_page_size - 6); + u32 rq_sz = 1 << (log_rq_size + 4 + log_rq_stride); + u32 page_size = 1 << log_page_size; + u32 rq_sz_po = rq_sz + (page_offset * po_quanta); + u32 rq_num_pas = (rq_sz_po + page_size - 1) / page_size; + + return rq_num_pas * sizeof(u64); +} + +static int create_raw_packet_qp_rq(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq, void *qpin) +{ + __be64 *pas; + __be64 *qp_pas; + void *in; + void *rqc; + void *wq; + void *qpc = MLX5_ADDR_OF(create_qp_in, qpin, qpc); + int inlen; + int err; + u32 rq_pas_size = get_rq_pas_size(qpc); + + inlen = MLX5_ST_SZ_BYTES(create_rq_in) + rq_pas_size; + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + rqc = MLX5_ADDR_OF(create_rq_in, in, ctx); + MLX5_SET(rqc, rqc, vsd, 1); + MLX5_SET(rqc, rqc, mem_rq_type, MLX5_RQC_MEM_RQ_TYPE_MEMORY_RQ_INLINE); + MLX5_SET(rqc, rqc, state, MLX5_RQC_STATE_RST); + MLX5_SET(rqc, rqc, flush_in_error_en, 1); + MLX5_SET(rqc, rqc, user_index, MLX5_GET(qpc, qpc, user_index)); + MLX5_SET(rqc, rqc, cqn, MLX5_GET(qpc, qpc, cqn_rcv)); + + wq = MLX5_ADDR_OF(rqc, rqc, wq); + MLX5_SET(wq, wq, wq_type, MLX5_WQ_TYPE_CYCLIC); + MLX5_SET(wq, wq, end_padding_mode, + MLX5_GET64(qpc, qpc, end_padding_mode)); + MLX5_SET(wq, wq, page_offset, MLX5_GET(qpc, qpc, page_offset)); + MLX5_SET(wq, wq, pd, MLX5_GET(qpc, qpc, pd)); + MLX5_SET64(wq, wq, dbr_addr, MLX5_GET64(qpc, qpc, dbr_addr)); + MLX5_SET(wq, wq, log_wq_stride, MLX5_GET(qpc, qpc, log_rq_stride) + 4); + MLX5_SET(wq, wq, log_wq_pg_sz, MLX5_GET(qpc, qpc, log_page_size)); + MLX5_SET(wq, wq, log_wq_sz, MLX5_GET(qpc, qpc, log_rq_size)); + + pas = (__be64 *)MLX5_ADDR_OF(wq, wq, pas); + qp_pas = (__be64 *)MLX5_ADDR_OF(create_qp_in, qpin, pas); + memcpy(pas, qp_pas, rq_pas_size); + + err = mlx5_core_create_rq_tracked(dev->mdev, in, inlen, &rq->base.mqp); + + kvfree(in); + + return err; +} + +static void destroy_raw_packet_qp_rq(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq) +{ + mlx5_core_destroy_rq_tracked(dev->mdev, &rq->base.mqp); +} + +static int create_raw_packet_qp_tir(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq, u32 tdn) +{ + u32 *in; + void 
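/*
 * get_rq_pas_size() above, restated with a worked example: with the
 * qpc log_page_size field 0 (so 4 KiB pages), log_rq_stride 2 and
 * log_rq_size 8, the RQ spans 1 << (8 + 4 + 2) = 16 KiB; with
 * page_offset 0 that is 4 pages, hence 4 PAS entries of 8 bytes each.
 */
#include <assert.h>
#include <stdint.h>

static uint32_t rq_pas_size(uint32_t log_pg, uint32_t log_rq_stride,
                            uint32_t log_rq_size, uint32_t page_offset)
{
        uint32_t log_page_size = log_pg + 12;
        uint32_t po_quanta = 1u << (log_page_size - 6);
        uint32_t rq_sz = 1u << (log_rq_size + 4 + log_rq_stride);
        uint32_t page_size = 1u << log_page_size;
        uint32_t rq_sz_po = rq_sz + page_offset * po_quanta;
        uint32_t num_pas = (rq_sz_po + page_size - 1) / page_size;

        return num_pas * (uint32_t)sizeof(uint64_t);
}

int main(void)
{
        assert(rq_pas_size(0, 2, 8, 0) == 4 * 8);
        return 0;
}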
*tirc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(create_tir_in); + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + tirc = MLX5_ADDR_OF(create_tir_in, in, ctx); + MLX5_SET(tirc, tirc, disp_type, MLX5_TIRC_DISP_TYPE_DIRECT); + MLX5_SET(tirc, tirc, inline_rqn, rq->base.mqp.qpn); + MLX5_SET(tirc, tirc, transport_domain, tdn); + + err = mlx5_core_create_tir(dev->mdev, in, inlen, &rq->tirn); + + kvfree(in); + + return err; +} + +static void destroy_raw_packet_qp_tir(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq) +{ + mlx5_core_destroy_tir(dev->mdev, rq->tirn); +} + +static int create_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + struct mlx5_create_qp_mbox_in *in, + struct ib_pd *pd) +{ + struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + struct ib_uobject *uobj = pd->uobject; + struct ib_ucontext *ucontext = uobj->context; + struct mlx5_ib_ucontext *mucontext = to_mucontext(ucontext); + int err; + u32 tdn = mucontext->tdn; + + if (qp->sq.wqe_cnt) { + err = create_raw_packet_qp_tis(dev, sq, tdn); + if (err) + return err; + + err = create_raw_packet_qp_sq(dev, sq, in, pd); + if (err) + goto err_destroy_tis; + + sq->base.container_mibqp = qp; + } + + if (qp->rq.wqe_cnt) { + err = create_raw_packet_qp_rq(dev, rq, in); + if (err) + goto err_destroy_sq; + + rq->base.container_mibqp = qp; + + err = create_raw_packet_qp_tir(dev, rq, tdn); + if (err) + goto err_destroy_rq; + } + + qp->trans_qp.base.mqp.qpn = qp->sq.wqe_cnt ? sq->base.mqp.qpn : + rq->base.mqp.qpn; + + return 0; + +err_destroy_rq: + destroy_raw_packet_qp_rq(dev, rq); +err_destroy_sq: + if (!qp->sq.wqe_cnt) + return err; + destroy_raw_packet_qp_sq(dev, sq); +err_destroy_tis: + destroy_raw_packet_qp_tis(dev, sq); + + return err; +} + +static void destroy_raw_packet_qp(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp) +{ + struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + + if (qp->rq.wqe_cnt) { + destroy_raw_packet_qp_tir(dev, rq); + destroy_raw_packet_qp_rq(dev, rq); + } + + if (qp->sq.wqe_cnt) { + destroy_raw_packet_qp_sq(dev, sq); + destroy_raw_packet_qp_tis(dev, sq); + } +} + +static void raw_packet_qp_copy_info(struct mlx5_ib_qp *qp, + struct mlx5_ib_raw_packet_qp *raw_packet_qp) +{ + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + + sq->sq = &qp->sq; + rq->rq = &qp->rq; + sq->doorbell = &qp->db; + rq->doorbell = &qp->db; +} + static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, struct ib_qp_init_attr *init_attr, struct ib_udata *udata, struct mlx5_ib_qp *qp) { struct mlx5_ib_resources *devr = &dev->devr; struct mlx5_core_dev *mdev = dev->mdev; + struct mlx5_ib_qp_base *base; struct mlx5_ib_create_qp_resp resp; struct mlx5_create_qp_mbox_in *in; struct mlx5_ib_create_qp ucmd; int inlen = sizeof(*in); int err; + u32 uidx = MLX5_IB_DEFAULT_UIDX; + void *qpc; + + base = init_attr->qp_type == IB_QPT_RAW_PACKET ? 
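/*
 * For raw packet QPs there is no single QPC, so the RQ's mlx5_ib_qp_base is
 * taken as the canonical one: it carries the user-buffer bookkeeping
 * (ubuffer) for the receive ring, while the send ring arrives separately via
 * ucmd.sq_buf_addr later in this function. Every other QP type keeps its
 * state in trans_qp.base.
 */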
+ &qp->raw_packet_qp.rq.base : + &qp->trans_qp.base; - mlx5_ib_odp_create_qp(qp); + if (init_attr->qp_type != IB_QPT_RAW_PACKET) + mlx5_ib_odp_create_qp(qp); mutex_init(&qp->mutex); spin_lock_init(&qp->sq.lock); @@ -880,6 +1211,21 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, } } + if (init_attr->create_flags & + (IB_QP_CREATE_CROSS_CHANNEL | + IB_QP_CREATE_MANAGED_SEND | + IB_QP_CREATE_MANAGED_RECV)) { + if (!MLX5_CAP_GEN(mdev, cd)) { + mlx5_ib_dbg(dev, "cross-channel isn't supported\n"); + return -EINVAL; + } + if (init_attr->create_flags & IB_QP_CREATE_CROSS_CHANNEL) + qp->flags |= MLX5_IB_QP_CROSS_CHANNEL; + if (init_attr->create_flags & IB_QP_CREATE_MANAGED_SEND) + qp->flags |= MLX5_IB_QP_MANAGED_SEND; + if (init_attr->create_flags & IB_QP_CREATE_MANAGED_RECV) + qp->flags |= MLX5_IB_QP_MANAGED_RECV; + } if (init_attr->sq_sig_type == IB_SIGNAL_ALL_WR) qp->sq_signal_bits = MLX5_WQE_CTRL_CQ_UPDATE; @@ -889,6 +1235,11 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, return -EFAULT; } + err = get_qp_user_index(to_mucontext(pd->uobject->context), + &ucmd, udata->inlen, &uidx); + if (err) + return err; + qp->wq_sig = !!(ucmd.flags & MLX5_QP_FLAG_SIGNATURE); qp->scat_cqe = !!(ucmd.flags & MLX5_QP_FLAG_SCATTER_CQE); } else { @@ -918,11 +1269,13 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, ucmd.sq_wqe_count, max_wqes); return -EINVAL; } - err = create_user_qp(dev, pd, qp, udata, &in, &resp, &inlen); + err = create_user_qp(dev, pd, qp, udata, init_attr, &in, + &resp, &inlen, base); if (err) mlx5_ib_dbg(dev, "err %d\n", err); } else { - err = create_kernel_qp(dev, init_attr, qp, &in, &inlen); + err = create_kernel_qp(dev, init_attr, qp, &in, &inlen, + base); if (err) mlx5_ib_dbg(dev, "err %d\n", err); } @@ -954,6 +1307,13 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) in->ctx.flags_pd |= cpu_to_be32(MLX5_QP_BLOCK_MCAST); + if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) + in->ctx.params2 |= cpu_to_be32(MLX5_QP_BIT_CC_MASTER); + if (qp->flags & MLX5_IB_QP_MANAGED_SEND) + in->ctx.params2 |= cpu_to_be32(MLX5_QP_BIT_CC_SLAVE_SEND); + if (qp->flags & MLX5_IB_QP_MANAGED_RECV) + in->ctx.params2 |= cpu_to_be32(MLX5_QP_BIT_CC_SLAVE_RECV); + if (qp->scat_cqe && is_connected(init_attr->qp_type)) { int rcqe_sz; int scqe_sz; @@ -1018,26 +1378,35 @@ static int create_qp_common(struct mlx5_ib_dev *dev, struct ib_pd *pd, in->ctx.db_rec_addr = cpu_to_be64(qp->db.dma); - err = mlx5_core_create_qp(dev->mdev, &qp->mqp, in, inlen); + if (MLX5_CAP_GEN(mdev, cqe_version) == MLX5_CQE_VERSION_V1) { + qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); + /* 0xffffff means we ask to work with cqe version 0 */ + MLX5_SET(qpc, qpc, user_index, uidx); + } + + if (init_attr->qp_type == IB_QPT_RAW_PACKET) { + qp->raw_packet_qp.sq.ubuffer.buf_addr = ucmd.sq_buf_addr; + raw_packet_qp_copy_info(qp, &qp->raw_packet_qp); + err = create_raw_packet_qp(dev, qp, in, pd); + } else { + err = mlx5_core_create_qp(dev->mdev, &base->mqp, in, inlen); + } + if (err) { mlx5_ib_dbg(dev, "create qp failed\n"); goto err_create; } kvfree(in); - /* Hardware wants QPN written in big-endian order (after - * shifting) for send doorbell. Precompute this value to save - * a little bit when posting sends. 
- */ - qp->doorbell_qpn = swab32(qp->mqp.qpn << 8); - qp->mqp.event = mlx5_ib_qp_event; + base->container_mibqp = qp; + base->mqp.event = mlx5_ib_qp_event; return 0; err_create: if (qp->create_type == MLX5_QP_USER) - destroy_qp_user(pd, qp); + destroy_qp_user(pd, qp, base); else if (qp->create_type == MLX5_QP_KERNEL) destroy_qp_kernel(dev, qp); @@ -1129,11 +1498,11 @@ static void get_cqs(struct mlx5_ib_qp *qp, case IB_QPT_UD: case IB_QPT_RAW_IPV6: case IB_QPT_RAW_ETHERTYPE: + case IB_QPT_RAW_PACKET: *send_cq = to_mcq(qp->ibqp.send_cq); *recv_cq = to_mcq(qp->ibqp.recv_cq); break; - case IB_QPT_RAW_PACKET: case IB_QPT_MAX: default: *send_cq = NULL; @@ -1142,45 +1511,66 @@ static void get_cqs(struct mlx5_ib_qp *qp, } } +static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + u16 operation); + static void destroy_qp_common(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp) { struct mlx5_ib_cq *send_cq, *recv_cq; + struct mlx5_ib_qp_base *base = &qp->trans_qp.base; struct mlx5_modify_qp_mbox_in *in; int err; + base = qp->ibqp.qp_type == IB_QPT_RAW_PACKET ? + &qp->raw_packet_qp.rq.base : + &qp->trans_qp.base; + in = kzalloc(sizeof(*in), GFP_KERNEL); if (!in) return; if (qp->state != IB_QPS_RESET) { - mlx5_ib_qp_disable_pagefaults(qp); - if (mlx5_core_qp_modify(dev->mdev, to_mlx5_state(qp->state), - MLX5_QP_STATE_RST, in, 0, &qp->mqp)) - mlx5_ib_warn(dev, "mlx5_ib: modify QP %06x to RESET failed\n", - qp->mqp.qpn); + if (qp->ibqp.qp_type != IB_QPT_RAW_PACKET) { + mlx5_ib_qp_disable_pagefaults(qp); + err = mlx5_core_qp_modify(dev->mdev, + MLX5_CMD_OP_2RST_QP, in, 0, + &base->mqp); + } else { + err = modify_raw_packet_qp(dev, qp, + MLX5_CMD_OP_2RST_QP); + } + if (err) + mlx5_ib_warn(dev, "mlx5_ib: modify QP 0x%06x to RESET failed\n", + base->mqp.qpn); } get_cqs(qp, &send_cq, &recv_cq); if (qp->create_type == MLX5_QP_KERNEL) { mlx5_ib_lock_cqs(send_cq, recv_cq); - __mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn, + __mlx5_ib_cq_clean(recv_cq, base->mqp.qpn, qp->ibqp.srq ? 
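/*
 * Completions are keyed by the hardware qpn, so stale CQEs are matched
 * against base->mqp.qpn rather than qp->ibqp.qp_num (the two differ for
 * special QPs such as QP0/QP1).
 */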
to_msrq(qp->ibqp.srq) : NULL); if (send_cq != recv_cq) - __mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + __mlx5_ib_cq_clean(send_cq, base->mqp.qpn, + NULL); mlx5_ib_unlock_cqs(send_cq, recv_cq); } - err = mlx5_core_destroy_qp(dev->mdev, &qp->mqp); - if (err) - mlx5_ib_warn(dev, "failed to destroy QP 0x%x\n", qp->mqp.qpn); - kfree(in); + if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) { + destroy_raw_packet_qp(dev, qp); + } else { + err = mlx5_core_destroy_qp(dev->mdev, &base->mqp); + if (err) + mlx5_ib_warn(dev, "failed to destroy QP 0x%x\n", + base->mqp.qpn); + } + kfree(in); if (qp->create_type == MLX5_QP_KERNEL) destroy_qp_kernel(dev, qp); else if (qp->create_type == MLX5_QP_USER) - destroy_qp_user(&get_pd(qp)->ibpd, qp); + destroy_qp_user(&get_pd(qp)->ibpd, qp, base); } static const char *ib_qp_type_str(enum ib_qp_type type) @@ -1234,6 +1624,16 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, return ERR_PTR(-EINVAL); } dev = to_mdev(to_mxrcd(init_attr->xrcd)->ibxrcd.device); + + if (init_attr->qp_type == IB_QPT_RAW_PACKET) { + if (!pd->uobject) { + mlx5_ib_dbg(dev, "Raw Packet QP is not supported for kernel consumers\n"); + return ERR_PTR(-EINVAL); + } else if (!to_mucontext(pd->uobject->context)->cqe_version) { + mlx5_ib_dbg(dev, "Raw Packet QP is only supported for CQE version > 0\n"); + return ERR_PTR(-EINVAL); + } + } } switch (init_attr->qp_type) { @@ -1250,6 +1650,7 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, } /* fall through */ + case IB_QPT_RAW_PACKET: case IB_QPT_RC: case IB_QPT_UC: case IB_QPT_UD: @@ -1272,19 +1673,19 @@ struct ib_qp *mlx5_ib_create_qp(struct ib_pd *pd, else if (is_qp1(init_attr->qp_type)) qp->ibqp.qp_num = 1; else - qp->ibqp.qp_num = qp->mqp.qpn; + qp->ibqp.qp_num = qp->trans_qp.base.mqp.qpn; mlx5_ib_dbg(dev, "ib qpnum 0x%x, mlx qpn 0x%x, rcqn 0x%x, scqn 0x%x\n", - qp->ibqp.qp_num, qp->mqp.qpn, to_mcq(init_attr->recv_cq)->mcq.cqn, + qp->ibqp.qp_num, qp->trans_qp.base.mqp.qpn, + to_mcq(init_attr->recv_cq)->mcq.cqn, to_mcq(init_attr->send_cq)->mcq.cqn); - qp->xrcdn = xrcdn; + qp->trans_qp.xrcdn = xrcdn; break; case IB_QPT_RAW_IPV6: case IB_QPT_RAW_ETHERTYPE: - case IB_QPT_RAW_PACKET: case IB_QPT_MAX: default: mlx5_ib_dbg(dev, "unsupported qp type %d\n", @@ -1318,12 +1719,12 @@ static __be32 to_mlx5_access_flags(struct mlx5_ib_qp *qp, const struct ib_qp_att if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) dest_rd_atomic = attr->max_dest_rd_atomic; else - dest_rd_atomic = qp->resp_depth; + dest_rd_atomic = qp->trans_qp.resp_depth; if (attr_mask & IB_QP_ACCESS_FLAGS) access_flags = attr->qp_access_flags; else - access_flags = qp->atomic_rd_en; + access_flags = qp->trans_qp.atomic_rd_en; if (!dest_rd_atomic) access_flags &= IB_ACCESS_REMOTE_WRITE; @@ -1360,21 +1761,42 @@ static int ib_rate_to_mlx5(struct mlx5_ib_dev *dev, u8 rate) return rate + MLX5_STAT_RATE_OFFSET; } -static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah, +static int modify_raw_packet_eth_prio(struct mlx5_core_dev *dev, + struct mlx5_ib_sq *sq, u8 sl) +{ + void *in; + void *tisc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_tis_in); + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_tis_in, in, bitmask.prio, 1); + + tisc = MLX5_ADDR_OF(modify_tis_in, in, ctx); + MLX5_SET(tisc, tisc, prio, ((sl & 0x7) << 1)); + + err = mlx5_core_modify_tis(dev, sq->tisn, in, inlen); + + kvfree(in); + + return err; +} + +static int mlx5_set_path(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + const struct ib_ah_attr *ah, struct mlx5_qp_path 
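/*
 * mlx5_set_path() now takes the QP as well: a raw packet QP has no QPC to
 * carry the IB service level, so the SL is applied by modifying the TIS
 * instead; modify_raw_packet_eth_prio() above folds the low three SL bits
 * into the TIS prio field.
 */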
*path, u8 port, int attr_mask, u32 path_flags, const struct ib_qp_attr *attr) { + enum rdma_link_layer ll = rdma_port_get_link_layer(&dev->ib_dev, port); int err; - path->fl = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0; - path->free_ar = (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x80 : 0; - if (attr_mask & IB_QP_PKEY_INDEX) path->pkey_index = attr->pkey_index; - path->grh_mlid = ah->src_path_bits & 0x7f; - path->rlid = cpu_to_be16(ah->dlid); - if (ah->ah_flags & IB_AH_GRH) { if (ah->grh.sgid_index >= dev->mdev->port_caps[port - 1].gid_table_len) { @@ -1383,7 +1805,27 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah, dev->mdev->port_caps[port - 1].gid_table_len); return -EINVAL; } - path->grh_mlid |= 1 << 7; + } + + if (ll == IB_LINK_LAYER_ETHERNET) { + if (!(ah->ah_flags & IB_AH_GRH)) + return -EINVAL; + memcpy(path->rmac, ah->dmac, sizeof(ah->dmac)); + path->udp_sport = mlx5_get_roce_udp_sport(dev, port, + ah->grh.sgid_index); + path->dci_cfi_prio_sl = (ah->sl & 0x7) << 4; + } else { + path->fl = (path_flags & MLX5_PATH_FLAG_FL) ? 0x80 : 0; + path->free_ar = (path_flags & MLX5_PATH_FLAG_FREE_AR) ? 0x80 : + 0; + path->rlid = cpu_to_be16(ah->dlid); + path->grh_mlid = ah->src_path_bits & 0x7f; + if (ah->ah_flags & IB_AH_GRH) + path->grh_mlid |= 1 << 7; + path->dci_cfi_prio_sl = ah->sl & 0xf; + } + + if (ah->ah_flags & IB_AH_GRH) { path->mgid_index = ah->grh.sgid_index; path->hop_limit = ah->grh.hop_limit; path->tclass_flowlabel = @@ -1401,7 +1843,10 @@ static int mlx5_set_path(struct mlx5_ib_dev *dev, const struct ib_ah_attr *ah, if (attr_mask & IB_QP_TIMEOUT) path->ackto_lt = attr->timeout << 3; - path->sl = ah->sl & 0xf; + if ((qp->ibqp.qp_type == IB_QPT_RAW_PACKET) && qp->sq.wqe_cnt) + return modify_raw_packet_eth_prio(dev->mdev, + &qp->raw_packet_qp.sq, + ah->sl & 0xf); return 0; } @@ -1549,12 +1994,154 @@ static int ib_mask_to_mlx5_opt(int ib_mask) return result; } +static int modify_raw_packet_qp_rq(struct mlx5_core_dev *dev, + struct mlx5_ib_rq *rq, int new_state) +{ + void *in; + void *rqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_rq_in); + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_rq_in, in, rq_state, rq->state); + + rqc = MLX5_ADDR_OF(modify_rq_in, in, ctx); + MLX5_SET(rqc, rqc, state, new_state); + + err = mlx5_core_modify_rq(dev, rq->base.mqp.qpn, in, inlen); + if (err) + goto out; + + rq->state = new_state; + +out: + kvfree(in); + return err; +} + +static int modify_raw_packet_qp_sq(struct mlx5_core_dev *dev, + struct mlx5_ib_sq *sq, int new_state) +{ + void *in; + void *sqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(modify_sq_in); + in = mlx5_vzalloc(inlen); + if (!in) + return -ENOMEM; + + MLX5_SET(modify_sq_in, in, sq_state, sq->state); + + sqc = MLX5_ADDR_OF(modify_sq_in, in, ctx); + MLX5_SET(sqc, sqc, state, new_state); + + err = mlx5_core_modify_sq(dev, sq->base.mqp.qpn, in, inlen); + if (err) + goto out; + + sq->state = new_state; + +out: + kvfree(in); + return err; +} + +static int modify_raw_packet_qp(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + u16 operation) +{ + struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + int rq_state; + int sq_state; + int err; + + switch (operation) { + case MLX5_CMD_OP_RST2INIT_QP: + rq_state = MLX5_RQC_STATE_RDY; + sq_state = MLX5_SQC_STATE_RDY; + break; + case MLX5_CMD_OP_2ERR_QP: + rq_state = MLX5_RQC_STATE_ERR; + sq_state = 
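/*
 * Each verbs-level transition is decomposed into matching SQ/RQ states:
 * RST2INIT readies both rings, the 2ERR/2RST commands flush or reset them,
 * and the INIT->RTR->RTS legs below are no-ops because RDY already means
 * "able to send/receive" for an SQ or RQ.
 */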
MLX5_SQC_STATE_ERR; + break; + case MLX5_CMD_OP_2RST_QP: + rq_state = MLX5_RQC_STATE_RST; + sq_state = MLX5_SQC_STATE_RST; + break; + case MLX5_CMD_OP_INIT2INIT_QP: + case MLX5_CMD_OP_INIT2RTR_QP: + case MLX5_CMD_OP_RTR2RTS_QP: + case MLX5_CMD_OP_RTS2RTS_QP: + /* Nothing to do here... */ + return 0; + default: + WARN_ON(1); + return -EINVAL; + } + + if (qp->rq.wqe_cnt) { + err = modify_raw_packet_qp_rq(dev->mdev, rq, rq_state); + if (err) + return err; + } + + if (qp->sq.wqe_cnt) + return modify_raw_packet_qp_sq(dev->mdev, sq, sq_state); + + return 0; +} + static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, const struct ib_qp_attr *attr, int attr_mask, enum ib_qp_state cur_state, enum ib_qp_state new_state) { + static const u16 optab[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE] = { + [MLX5_QP_STATE_RST] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_INIT] = MLX5_CMD_OP_RST2INIT_QP, + }, + [MLX5_QP_STATE_INIT] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_INIT] = MLX5_CMD_OP_INIT2INIT_QP, + [MLX5_QP_STATE_RTR] = MLX5_CMD_OP_INIT2RTR_QP, + }, + [MLX5_QP_STATE_RTR] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTR2RTS_QP, + }, + [MLX5_QP_STATE_RTS] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTS2RTS_QP, + }, + [MLX5_QP_STATE_SQD] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + }, + [MLX5_QP_STATE_SQER] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_SQERR2RTS_QP, + }, + [MLX5_QP_STATE_ERR] = { + [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, + [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, + } + }; + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); struct mlx5_ib_qp *qp = to_mqp(ibqp); + struct mlx5_ib_qp_base *base = &qp->trans_qp.base; struct mlx5_ib_cq *send_cq, *recv_cq; struct mlx5_qp_context *context; struct mlx5_modify_qp_mbox_in *in; @@ -1564,6 +2151,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, int sqd_event; int mlx5_st; int err; + u16 op; in = kzalloc(sizeof(*in), GFP_KERNEL); if (!in) @@ -1623,7 +2211,7 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, context->pri_path.port = attr->port_num; if (attr_mask & IB_QP_AV) { - err = mlx5_set_path(dev, &attr->ah_attr, &context->pri_path, + err = mlx5_set_path(dev, qp, &attr->ah_attr, &context->pri_path, attr_mask & IB_QP_PORT ? attr->port_num : qp->port, attr_mask, 0, attr); if (err) @@ -1634,7 +2222,8 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, context->pri_path.ackto_lt |= attr->timeout << 3; if (attr_mask & IB_QP_ALT_PATH) { - err = mlx5_set_path(dev, &attr->alt_ah_attr, &context->alt_path, + err = mlx5_set_path(dev, qp, &attr->alt_ah_attr, + &context->alt_path, attr->alt_port_num, attr_mask, 0, attr); if (err) goto out; @@ -1706,41 +2295,51 @@ static int __mlx5_ib_modify_qp(struct ib_qp *ibqp, * again to RTS, and may cause the driver and the device to get out of * sync. 
*/ if (cur_state != IB_QPS_RESET && cur_state != IB_QPS_ERR && - (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR)) + (new_state == IB_QPS_RESET || new_state == IB_QPS_ERR) && + (qp->ibqp.qp_type != IB_QPT_RAW_PACKET)) mlx5_ib_qp_disable_pagefaults(qp); + if (mlx5_cur >= MLX5_QP_NUM_STATE || mlx5_new >= MLX5_QP_NUM_STATE || + !optab[mlx5_cur][mlx5_new]) + goto out; + + op = optab[mlx5_cur][mlx5_new]; optpar = ib_mask_to_mlx5_opt(attr_mask); optpar &= opt_mask[mlx5_cur][mlx5_new][mlx5_st]; in->optparam = cpu_to_be32(optpar); - err = mlx5_core_qp_modify(dev->mdev, to_mlx5_state(cur_state), - to_mlx5_state(new_state), in, sqd_event, - &qp->mqp); + + if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) + err = modify_raw_packet_qp(dev, qp, op); + else + err = mlx5_core_qp_modify(dev->mdev, op, in, sqd_event, + &base->mqp); if (err) goto out; - if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT) + if (cur_state == IB_QPS_RESET && new_state == IB_QPS_INIT && + (qp->ibqp.qp_type != IB_QPT_RAW_PACKET)) mlx5_ib_qp_enable_pagefaults(qp); qp->state = new_state; if (attr_mask & IB_QP_ACCESS_FLAGS) - qp->atomic_rd_en = attr->qp_access_flags; + qp->trans_qp.atomic_rd_en = attr->qp_access_flags; if (attr_mask & IB_QP_MAX_DEST_RD_ATOMIC) - qp->resp_depth = attr->max_dest_rd_atomic; + qp->trans_qp.resp_depth = attr->max_dest_rd_atomic; if (attr_mask & IB_QP_PORT) qp->port = attr->port_num; if (attr_mask & IB_QP_ALT_PATH) - qp->alt_port = attr->alt_port_num; + qp->trans_qp.alt_port = attr->alt_port_num; /* * If we moved a kernel QP to RESET, clean up all old CQ * entries and reinitialize the QP. */ if (new_state == IB_QPS_RESET && !ibqp->uobject) { - mlx5_ib_cq_clean(recv_cq, qp->mqp.qpn, + mlx5_ib_cq_clean(recv_cq, base->mqp.qpn, ibqp->srq ? to_msrq(ibqp->srq) : NULL); if (send_cq != recv_cq) - mlx5_ib_cq_clean(send_cq, qp->mqp.qpn, NULL); + mlx5_ib_cq_clean(send_cq, base->mqp.qpn, NULL); qp->rq.head = 0; qp->rq.tail = 0; @@ -1765,15 +2364,21 @@ int mlx5_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, enum ib_qp_state cur_state, new_state; int err = -EINVAL; int port; + enum rdma_link_layer ll = IB_LINK_LAYER_UNSPECIFIED; mutex_lock(&qp->mutex); cur_state = attr_mask & IB_QP_CUR_STATE ? attr->cur_qp_state : qp->state; new_state = attr_mask & IB_QP_STATE ? attr->qp_state : cur_state; + if (!(cur_state == new_state && cur_state == IB_QPS_RESET)) { + port = attr_mask & IB_QP_PORT ? 
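/*
 * The link layer is now resolved per port (a dual-port HCA can run IB on one
 * port and Ethernet on the other) because ib_modify_qp_is_ok() applies
 * link-layer specific attribute rules; the RESET->RESET case skips the
 * lookup since no transition is validated there.
 */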
attr->port_num : qp->port; + ll = dev->ib_dev.get_link_layer(&dev->ib_dev, port); + } + if (ibqp->qp_type != MLX5_IB_QPT_REG_UMR && !ib_modify_qp_is_ok(cur_state, new_state, ibqp->qp_type, attr_mask, - IB_LINK_LAYER_UNSPECIFIED)) + ll)) goto out; if ((attr_mask & IB_QP_PORT) && @@ -2570,7 +3175,7 @@ static void finish_wqe(struct mlx5_ib_qp *qp, ctrl->opmod_idx_opcode = cpu_to_be32(((u32)(qp->sq.cur_post) << 8) | mlx5_opcode | ((u32)opmod << 24)); - ctrl->qpn_ds = cpu_to_be32(size | (qp->mqp.qpn << 8)); + ctrl->qpn_ds = cpu_to_be32(size | (qp->trans_qp.base.mqp.qpn << 8)); ctrl->fm_ce_se |= fence; qp->fm_cache = next_fence; if (unlikely(qp->wq_sig)) @@ -3003,7 +3608,7 @@ static void to_ib_ah_attr(struct mlx5_ib_dev *ibdev, struct ib_ah_attr *ib_ah_at ib_ah_attr->port_num > MLX5_CAP_GEN(dev, num_ports)) return; - ib_ah_attr->sl = path->sl & 0xf; + ib_ah_attr->sl = path->dci_cfi_prio_sl & 0xf; ib_ah_attr->dlid = be16_to_cpu(path->rlid); ib_ah_attr->src_path_bits = path->grh_mlid & 0x7f; @@ -3021,39 +3626,153 @@ static void to_ib_ah_attr(struct mlx5_ib_dev *ibdev, struct ib_ah_attr *ib_ah_at } } -int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr_mask, - struct ib_qp_init_attr *qp_init_attr) +static int query_raw_packet_qp_sq_state(struct mlx5_ib_dev *dev, + struct mlx5_ib_sq *sq, + u8 *sq_state) +{ + void *out; + void *sqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(query_sq_out); + out = mlx5_vzalloc(inlen); + if (!out) + return -ENOMEM; + + err = mlx5_core_query_sq(dev->mdev, sq->base.mqp.qpn, out); + if (err) + goto out; + + sqc = MLX5_ADDR_OF(query_sq_out, out, sq_context); + *sq_state = MLX5_GET(sqc, sqc, state); + sq->state = *sq_state; + +out: + kvfree(out); + return err; +} + +static int query_raw_packet_qp_rq_state(struct mlx5_ib_dev *dev, + struct mlx5_ib_rq *rq, + u8 *rq_state) +{ + void *out; + void *rqc; + int inlen; + int err; + + inlen = MLX5_ST_SZ_BYTES(query_rq_out); + out = mlx5_vzalloc(inlen); + if (!out) + return -ENOMEM; + + err = mlx5_core_query_rq(dev->mdev, rq->base.mqp.qpn, out); + if (err) + goto out; + + rqc = MLX5_ADDR_OF(query_rq_out, out, rq_context); + *rq_state = MLX5_GET(rqc, rqc, state); + rq->state = *rq_state; + +out: + kvfree(out); + return err; +} + +static int sqrq_state_to_qp_state(u8 sq_state, u8 rq_state, + struct mlx5_ib_qp *qp, u8 *qp_state) +{ + static const u8 sqrq_trans[MLX5_RQ_NUM_STATE][MLX5_SQ_NUM_STATE] = { + [MLX5_RQC_STATE_RST] = { + [MLX5_SQC_STATE_RST] = IB_QPS_RESET, + [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE_BAD, + [MLX5_SQC_STATE_ERR] = MLX5_QP_STATE_BAD, + [MLX5_SQ_STATE_NA] = IB_QPS_RESET, + }, + [MLX5_RQC_STATE_RDY] = { + [MLX5_SQC_STATE_RST] = MLX5_QP_STATE_BAD, + [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE, + [MLX5_SQC_STATE_ERR] = IB_QPS_SQE, + [MLX5_SQ_STATE_NA] = MLX5_QP_STATE, + }, + [MLX5_RQC_STATE_ERR] = { + [MLX5_SQC_STATE_RST] = MLX5_QP_STATE_BAD, + [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE_BAD, + [MLX5_SQC_STATE_ERR] = IB_QPS_ERR, + [MLX5_SQ_STATE_NA] = IB_QPS_ERR, + }, + [MLX5_RQ_STATE_NA] = { + [MLX5_SQC_STATE_RST] = IB_QPS_RESET, + [MLX5_SQC_STATE_RDY] = MLX5_QP_STATE, + [MLX5_SQC_STATE_ERR] = MLX5_QP_STATE, + [MLX5_SQ_STATE_NA] = MLX5_QP_STATE_BAD, + }, + }; + + *qp_state = sqrq_trans[rq_state][sq_state]; + + if (*qp_state == MLX5_QP_STATE_BAD) { + WARN(1, "Buggy Raw Packet QP state, SQ 0x%x state: 0x%x, RQ 0x%x state: 0x%x", + qp->raw_packet_qp.sq.base.mqp.qpn, sq_state, + qp->raw_packet_qp.rq.base.mqp.qpn, rq_state); + return -EINVAL; + } + + if (*qp_state == MLX5_QP_STATE) + *qp_state = 
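/*
 * MLX5_QP_STATE acts as a "no opinion" sentinel in sqrq_trans above: the
 * RDY/NA combinations cannot distinguish INIT, RTR and RTS, so the state
 * last cached in software is kept.
 */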
qp->state; + + return 0; +} + +static int query_raw_packet_qp_state(struct mlx5_ib_dev *dev, + struct mlx5_ib_qp *qp, + u8 *raw_packet_qp_state) +{ + struct mlx5_ib_raw_packet_qp *raw_packet_qp = &qp->raw_packet_qp; + struct mlx5_ib_sq *sq = &raw_packet_qp->sq; + struct mlx5_ib_rq *rq = &raw_packet_qp->rq; + int err; + u8 sq_state = MLX5_SQ_STATE_NA; + u8 rq_state = MLX5_RQ_STATE_NA; + + if (qp->sq.wqe_cnt) { + err = query_raw_packet_qp_sq_state(dev, sq, &sq_state); + if (err) + return err; + } + + if (qp->rq.wqe_cnt) { + err = query_raw_packet_qp_rq_state(dev, rq, &rq_state); + if (err) + return err; + } + + return sqrq_state_to_qp_state(sq_state, rq_state, qp, + raw_packet_qp_state); +} + +static int query_qp_attr(struct mlx5_ib_dev *dev, struct mlx5_ib_qp *qp, + struct ib_qp_attr *qp_attr) { - struct mlx5_ib_dev *dev = to_mdev(ibqp->device); - struct mlx5_ib_qp *qp = to_mqp(ibqp); struct mlx5_query_qp_mbox_out *outb; struct mlx5_qp_context *context; int mlx5_state; int err = 0; -#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING - /* - * Wait for any outstanding page faults, in case the user frees memory - * based upon this query's result. - */ - flush_workqueue(mlx5_ib_page_fault_wq); -#endif - - mutex_lock(&qp->mutex); outb = kzalloc(sizeof(*outb), GFP_KERNEL); - if (!outb) { - err = -ENOMEM; - goto out; - } + if (!outb) + return -ENOMEM; + context = &outb->ctx; - err = mlx5_core_qp_query(dev->mdev, &qp->mqp, outb, sizeof(*outb)); + err = mlx5_core_qp_query(dev->mdev, &qp->trans_qp.base.mqp, outb, + sizeof(*outb)); if (err) - goto out_free; + goto out; mlx5_state = be32_to_cpu(context->flags) >> 28; qp->state = to_ib_qp_state(mlx5_state); - qp_attr->qp_state = qp->state; qp_attr->path_mtu = context->mtu_msgmax >> 5; qp_attr->path_mig_state = to_ib_mig_state((be32_to_cpu(context->flags) >> 11) & 0x3); @@ -3087,6 +3806,43 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr qp_attr->retry_cnt = (be32_to_cpu(context->params1) >> 16) & 0x7; qp_attr->rnr_retry = (be32_to_cpu(context->params1) >> 13) & 0x7; qp_attr->alt_timeout = context->alt_path.ackto_lt >> 3; + +out: + kfree(outb); + return err; +} + +int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, + int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr) +{ + struct mlx5_ib_dev *dev = to_mdev(ibqp->device); + struct mlx5_ib_qp *qp = to_mqp(ibqp); + int err = 0; + u8 raw_packet_qp_state; + +#ifdef CONFIG_INFINIBAND_ON_DEMAND_PAGING + /* + * Wait for any outstanding page faults, in case the user frees memory + * based upon this query's result. 
+ */ + flush_workqueue(mlx5_ib_page_fault_wq); +#endif + + mutex_lock(&qp->mutex); + + if (qp->ibqp.qp_type == IB_QPT_RAW_PACKET) { + err = query_raw_packet_qp_state(dev, qp, &raw_packet_qp_state); + if (err) + goto out; + qp->state = raw_packet_qp_state; + qp_attr->port_num = 1; + } else { + err = query_qp_attr(dev, qp, qp_attr); + if (err) + goto out; + } + + qp_attr->qp_state = qp->state; qp_attr->cur_qp_state = qp_attr->qp_state; qp_attr->cap.max_recv_wr = qp->rq.wqe_cnt; qp_attr->cap.max_recv_sge = qp->rq.max_gs; @@ -3110,12 +3866,16 @@ int mlx5_ib_query_qp(struct ib_qp *ibqp, struct ib_qp_attr *qp_attr, int qp_attr if (qp->flags & MLX5_IB_QP_BLOCK_MULTICAST_LOOPBACK) qp_init_attr->create_flags |= IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK; + if (qp->flags & MLX5_IB_QP_CROSS_CHANNEL) + qp_init_attr->create_flags |= IB_QP_CREATE_CROSS_CHANNEL; + if (qp->flags & MLX5_IB_QP_MANAGED_SEND) + qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_SEND; + if (qp->flags & MLX5_IB_QP_MANAGED_RECV) + qp_init_attr->create_flags |= IB_QP_CREATE_MANAGED_RECV; + qp_init_attr->sq_sig_type = qp->sq_signal_bits & MLX5_WQE_CTRL_CQ_UPDATE ? IB_SIGNAL_ALL_WR : IB_SIGNAL_REQ_WR; -out_free: - kfree(outb); - out: mutex_unlock(&qp->mutex); return err; diff --git a/drivers/infiniband/hw/mlx5/srq.c b/drivers/infiniband/hw/mlx5/srq.c index e008505e96e9..4659256cd95e 100644 --- a/drivers/infiniband/hw/mlx5/srq.c +++ b/drivers/infiniband/hw/mlx5/srq.c @@ -78,28 +78,41 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, struct ib_udata *udata, int buf_size, int *inlen) { struct mlx5_ib_dev *dev = to_mdev(pd->device); - struct mlx5_ib_create_srq ucmd; + struct mlx5_ib_create_srq ucmd = {}; size_t ucmdlen; + void *xsrqc; int err; int npages; int page_shift; int ncont; u32 offset; + u32 uidx = MLX5_IB_DEFAULT_UIDX; + int drv_data = udata->inlen - sizeof(struct ib_uverbs_cmd_hdr); - ucmdlen = - (udata->inlen - sizeof(struct ib_uverbs_cmd_hdr) < - sizeof(ucmd)) ? (sizeof(ucmd) - - sizeof(ucmd.reserved)) : sizeof(ucmd); + if (drv_data < 0) + return -EINVAL; + + ucmdlen = (drv_data < sizeof(ucmd)) ? 
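/*
 * ucmdlen is min(provided, expected): an older userspace may send a shorter
 * command (only that much is copied), a newer one a longer command, which is
 * accepted below only if every byte past the known struct is zero
 * (ib_is_udata_cleared) - the usual uverbs forward-compatibility contract.
 */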
+ drv_data : sizeof(ucmd); if (ib_copy_from_udata(&ucmd, udata, ucmdlen)) { mlx5_ib_dbg(dev, "failed copy udata\n"); return -EFAULT; } - if (ucmdlen == sizeof(ucmd) && - ucmd.reserved != 0) + if (ucmd.reserved0 || ucmd.reserved1) return -EINVAL; + if (drv_data > sizeof(ucmd) && + !ib_is_udata_cleared(udata, sizeof(ucmd), + drv_data - sizeof(ucmd))) + return -EINVAL; + + err = get_srq_user_index(to_mucontext(pd->uobject->context), + &ucmd, udata->inlen, &uidx); + if (err) + return err; + srq->wq_sig = !!(ucmd.flags & MLX5_SRQ_FLAG_SIGNATURE); srq->umem = ib_umem_get(pd->uobject->context, ucmd.buf_addr, buf_size, @@ -138,6 +151,12 @@ static int create_srq_user(struct ib_pd *pd, struct mlx5_ib_srq *srq, (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; (*in)->ctx.pgoff_cqn = cpu_to_be32(offset << 26); + if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) { + xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in, + xrc_srq_context_entry); + MLX5_SET(xrc_srqc, xsrqc, user_index, uidx); + } + return 0; err_in: @@ -158,6 +177,7 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, struct mlx5_wqe_srq_next_seg *next; int page_shift; int npages; + void *xsrqc; err = mlx5_db_alloc(dev->mdev, &srq->db); if (err) { @@ -204,6 +224,13 @@ static int create_srq_kernel(struct mlx5_ib_dev *dev, struct mlx5_ib_srq *srq, (*in)->ctx.log_pg_sz = page_shift - MLX5_ADAPTER_PAGE_SHIFT; + if (MLX5_CAP_GEN(dev->mdev, cqe_version) == MLX5_CQE_VERSION_V1) { + xsrqc = MLX5_ADDR_OF(create_xrc_srq_in, *in, + xrc_srq_context_entry); + /* 0xffffff means we ask to work with cqe version 0 */ + MLX5_SET(xrc_srqc, xsrqc, user_index, MLX5_IB_DEFAULT_UIDX); + } + return 0; err_in: diff --git a/drivers/infiniband/hw/mlx5/user.h b/drivers/infiniband/hw/mlx5/user.h index 76fb7b927d37..b94a55404a59 100644 --- a/drivers/infiniband/hw/mlx5/user.h +++ b/drivers/infiniband/hw/mlx5/user.h @@ -35,6 +35,8 @@ #include <linux/types.h> +#include "mlx5_ib.h" + enum { MLX5_QP_FLAG_SIGNATURE = 1 << 0, MLX5_QP_FLAG_SCATTER_CQE = 1 << 1, @@ -66,7 +68,15 @@ struct mlx5_ib_alloc_ucontext_req_v2 { __u32 total_num_uuars; __u32 num_low_latency_uuars; __u32 flags; - __u32 reserved; + __u32 comp_mask; + __u8 max_cqe_version; + __u8 reserved0; + __u16 reserved1; + __u32 reserved2; +}; + +enum mlx5_ib_alloc_ucontext_resp_mask { + MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET = 1UL << 0, }; struct mlx5_ib_alloc_ucontext_resp { @@ -80,7 +90,13 @@ struct mlx5_ib_alloc_ucontext_resp { __u32 max_recv_wr; __u32 max_srq_recv_wr; __u16 num_ports; - __u16 reserved; + __u16 reserved1; + __u32 comp_mask; + __u32 response_length; + __u8 cqe_version; + __u8 reserved2; + __u16 reserved3; + __u64 hca_core_clock_offset; }; struct mlx5_ib_alloc_pd_resp { @@ -110,7 +126,9 @@ struct mlx5_ib_create_srq { __u64 buf_addr; __u64 db_addr; __u32 flags; - __u32 reserved; /* explicit padding (optional on i386) */ + __u32 reserved0; /* explicit padding (optional on i386) */ + __u32 uidx; + __u32 reserved1; }; struct mlx5_ib_create_srq_resp { @@ -125,9 +143,48 @@ struct mlx5_ib_create_qp { __u32 rq_wqe_count; __u32 rq_wqe_shift; __u32 flags; + __u32 uidx; + __u32 reserved0; + __u64 sq_buf_addr; }; struct mlx5_ib_create_qp_resp { __u32 uuar_index; }; + +static inline int get_qp_user_index(struct mlx5_ib_ucontext *ucontext, + struct mlx5_ib_create_qp *ucmd, + int inlen, + u32 *user_index) +{ + u8 cqe_version = ucontext->cqe_version; + + if (field_avail(struct mlx5_ib_create_qp, uidx, inlen) && + !cqe_version && (ucmd->uidx == 
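/*
 * Three outcomes here: uidx present but left at the default while the
 * context runs CQE version 0 - nothing to assign; uidx present exactly when
 * the context negotiated CQE version 1 - validate and assign it; any other
 * combination means kernel and userspace disagree on the ABI, hence -EINVAL.
 */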
MLX5_IB_DEFAULT_UIDX)) + return 0; + + if (!!(field_avail(struct mlx5_ib_create_qp, uidx, inlen) != + !!cqe_version)) + return -EINVAL; + + return verify_assign_uidx(cqe_version, ucmd->uidx, user_index); +} + +static inline int get_srq_user_index(struct mlx5_ib_ucontext *ucontext, + struct mlx5_ib_create_srq *ucmd, + int inlen, + u32 *user_index) +{ + u8 cqe_version = ucontext->cqe_version; + + if (field_avail(struct mlx5_ib_create_srq, uidx, inlen) && + !cqe_version && (ucmd->uidx == MLX5_IB_DEFAULT_UIDX)) + return 0; + + if (!!(field_avail(struct mlx5_ib_create_srq, uidx, inlen) != + !!cqe_version)) + return -EINVAL; + + return verify_assign_uidx(cqe_version, ucmd->uidx, user_index); +} #endif /* MLX5_IB_USER_H */ diff --git a/drivers/infiniband/hw/mthca/mthca_cq.c b/drivers/infiniband/hw/mthca/mthca_cq.c index 40ba83338155..a6531ffe29a6 100644 --- a/drivers/infiniband/hw/mthca/mthca_cq.c +++ b/drivers/infiniband/hw/mthca/mthca_cq.c @@ -608,9 +608,6 @@ static inline int mthca_poll_one(struct mthca_dev *dev, entry->opcode = IB_WC_FETCH_ADD; entry->byte_len = MTHCA_ATOMIC_BYTE_LEN; break; - case MTHCA_OPCODE_BIND_MW: - entry->opcode = IB_WC_BIND_MW; - break; default: entry->opcode = MTHCA_OPCODE_INVALID; break; diff --git a/drivers/infiniband/hw/mthca/mthca_provider.c b/drivers/infiniband/hw/mthca/mthca_provider.c index dc2d48c59e62..9866c35cc977 100644 --- a/drivers/infiniband/hw/mthca/mthca_provider.c +++ b/drivers/infiniband/hw/mthca/mthca_provider.c @@ -898,89 +898,6 @@ static struct ib_mr *mthca_get_dma_mr(struct ib_pd *pd, int acc) return &mr->ibmr; } -static struct ib_mr *mthca_reg_phys_mr(struct ib_pd *pd, - struct ib_phys_buf *buffer_list, - int num_phys_buf, - int acc, - u64 *iova_start) -{ - struct mthca_mr *mr; - u64 *page_list; - u64 total_size; - unsigned long mask; - int shift; - int npages; - int err; - int i, j, n; - - mask = buffer_list[0].addr ^ *iova_start; - total_size = 0; - for (i = 0; i < num_phys_buf; ++i) { - if (i != 0) - mask |= buffer_list[i].addr; - if (i != num_phys_buf - 1) - mask |= buffer_list[i].addr + buffer_list[i].size; - - total_size += buffer_list[i].size; - } - - if (mask & ~PAGE_MASK) - return ERR_PTR(-EINVAL); - - shift = __ffs(mask | 1 << 31); - - buffer_list[0].size += buffer_list[0].addr & ((1ULL << shift) - 1); - buffer_list[0].addr &= ~0ull << shift; - - mr = kmalloc(sizeof *mr, GFP_KERNEL); - if (!mr) - return ERR_PTR(-ENOMEM); - - npages = 0; - for (i = 0; i < num_phys_buf; ++i) - npages += (buffer_list[i].size + (1ULL << shift) - 1) >> shift; - - if (!npages) - return &mr->ibmr; - - page_list = kmalloc(npages * sizeof *page_list, GFP_KERNEL); - if (!page_list) { - kfree(mr); - return ERR_PTR(-ENOMEM); - } - - n = 0; - for (i = 0; i < num_phys_buf; ++i) - for (j = 0; - j < (buffer_list[i].size + (1ULL << shift) - 1) >> shift; - ++j) - page_list[n++] = buffer_list[i].addr + ((u64) j << shift); - - mthca_dbg(to_mdev(pd->device), "Registering memory at %llx (iova %llx) " - "in PD %x; shift %d, npages %d.\n", - (unsigned long long) buffer_list[0].addr, - (unsigned long long) *iova_start, - to_mpd(pd)->pd_num, - shift, npages); - - err = mthca_mr_alloc_phys(to_mdev(pd->device), - to_mpd(pd)->pd_num, - page_list, shift, npages, - *iova_start, total_size, - convert_access(acc), mr); - - if (err) { - kfree(page_list); - kfree(mr); - return ERR_PTR(err); - } - - kfree(page_list); - mr->umem = NULL; - - return &mr->ibmr; -} - static struct ib_mr *mthca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt, int acc, struct ib_udata *udata) { 
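The get_qp_user_index()/get_srq_user_index() helpers earlier in user.h both end in verify_assign_uidx(), which lives in mlx5_ib.h and is not shown in this diff. A minimal sketch of its contract, assuming a 24-bit user-index mask (the MLX5_USER_ASSIGNED_UIDX_MASK name is an assumption here):

	/* sketch: accept a caller-chosen index under CQE version 1,
	 * force the 0xffffff default under version 0 */
	static inline int verify_assign_uidx(u8 cqe_version, u32 cmd_uidx,
					     u32 *user_index)
	{
		if (cqe_version) {
			if (cmd_uidx == MLX5_IB_DEFAULT_UIDX ||
			    (cmd_uidx & ~MLX5_USER_ASSIGNED_UIDX_MASK))
				return -EINVAL;	/* reserved or out of range */
			*user_index = cmd_uidx;
		} else {
			*user_index = MLX5_IB_DEFAULT_UIDX;
		}
		return 0;
	}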
@@ -1346,7 +1263,6 @@ int mthca_register_device(struct mthca_dev *dev) dev->ib_dev.destroy_cq = mthca_destroy_cq; dev->ib_dev.poll_cq = mthca_poll_cq; dev->ib_dev.get_dma_mr = mthca_get_dma_mr; - dev->ib_dev.reg_phys_mr = mthca_reg_phys_mr; dev->ib_dev.reg_user_mr = mthca_reg_user_mr; dev->ib_dev.dereg_mr = mthca_dereg_mr; dev->ib_dev.get_port_immutable = mthca_port_immutable; diff --git a/drivers/infiniband/hw/mthca/mthca_qp.c b/drivers/infiniband/hw/mthca/mthca_qp.c index 35fe506e2cfa..96e5fb91fb48 100644 --- a/drivers/infiniband/hw/mthca/mthca_qp.c +++ b/drivers/infiniband/hw/mthca/mthca_qp.c @@ -1485,7 +1485,7 @@ static int build_mlx_header(struct mthca_dev *dev, struct mthca_sqp *sqp, u16 pkey; ib_ud_header_init(256, /* assume a MAD */ 1, 0, 0, - mthca_ah_grh_present(to_mah(wr->ah)), 0, + mthca_ah_grh_present(to_mah(wr->ah)), 0, 0, 0, &sqp->ud_header); err = mthca_read_ah(dev, to_mah(wr->ah), &sqp->ud_header); diff --git a/drivers/infiniband/hw/nes/nes_cm.c b/drivers/infiniband/hw/nes/nes_cm.c index 8a3ad170d790..cb9f0f27308d 100644 --- a/drivers/infiniband/hw/nes/nes_cm.c +++ b/drivers/infiniband/hw/nes/nes_cm.c @@ -134,7 +134,7 @@ static void record_ird_ord(struct nes_cm_node *, u16, u16); /* External CM API Interface */ /* instance of function pointers for client API */ /* set address of this instance to cm_core->cm_ops at cm_core alloc */ -static struct nes_cm_ops nes_cm_api = { +static const struct nes_cm_ops nes_cm_api = { mini_cm_accelerated, mini_cm_listen, mini_cm_del_listen, @@ -3232,7 +3232,6 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) int passive_state; struct nes_ib_device *nesibdev; struct ib_mr *ibmr = NULL; - struct ib_phys_buf ibphysbuf; struct nes_pd *nespd; u64 tagged_offset; u8 mpa_frame_offset = 0; @@ -3316,21 +3315,19 @@ int nes_accept(struct iw_cm_id *cm_id, struct iw_cm_conn_param *conn_param) u64temp = (unsigned long)nesqp; nesibdev = nesvnic->nesibdev; nespd = nesqp->nespd; - ibphysbuf.addr = nesqp->ietf_frame_pbase + mpa_frame_offset; - ibphysbuf.size = buff_len; tagged_offset = (u64)(unsigned long)*start_buff; - ibmr = nesibdev->ibdev.reg_phys_mr((struct ib_pd *)nespd, - &ibphysbuf, 1, - IB_ACCESS_LOCAL_WRITE, - &tagged_offset); - if (!ibmr) { + ibmr = nes_reg_phys_mr(&nespd->ibpd, + nesqp->ietf_frame_pbase + mpa_frame_offset, + buff_len, IB_ACCESS_LOCAL_WRITE, + &tagged_offset); + if (IS_ERR(ibmr)) { nes_debug(NES_DBG_CM, "Unable to register memory region" "for lSMM for cm_node = %p \n", cm_node); pci_free_consistent(nesdev->pcidev, nesqp->private_data_len + nesqp->ietf_frame_size, nesqp->ietf_frame, nesqp->ietf_frame_pbase); - return -ENOMEM; + return PTR_ERR(ibmr); } ibmr->pd = &nespd->ibpd; diff --git a/drivers/infiniband/hw/nes/nes_cm.h b/drivers/infiniband/hw/nes/nes_cm.h index 32a6420c2940..147c2c884227 100644 --- a/drivers/infiniband/hw/nes/nes_cm.h +++ b/drivers/infiniband/hw/nes/nes_cm.h @@ -423,7 +423,7 @@ struct nes_cm_core { struct timer_list tcp_timer; - struct nes_cm_ops *api; + const struct nes_cm_ops *api; int (*post_event)(struct nes_cm_event *event); atomic_t events_posted; diff --git a/drivers/infiniband/hw/nes/nes_utils.c b/drivers/infiniband/hw/nes/nes_utils.c index 2042c0f29759..6d3a169c049b 100644 --- a/drivers/infiniband/hw/nes/nes_utils.c +++ b/drivers/infiniband/hw/nes/nes_utils.c @@ -727,7 +727,7 @@ int nes_arp_table(struct nes_device *nesdev, u32 ip_addr, u8 *mac_addr, u32 acti if (action == NES_ARP_DELETE) { nes_debug(NES_DBG_NETDEV, "DELETE, arp_index=%d\n", arp_index); 
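/*
 * eth_zero_addr() below is the <linux/etherdevice.h> helper equivalent to
 * memset(mac, 0x00, ETH_ALEN); it states the intent and keeps the hard-coded
 * length out of the call site.
 */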
nesadapter->arp_table[arp_index].ip_addr = 0; - memset(nesadapter->arp_table[arp_index].mac_addr, 0x00, ETH_ALEN); + eth_zero_addr(nesadapter->arp_table[arp_index].mac_addr); nes_free_resource(nesadapter, nesadapter->allocated_arps, arp_index); return arp_index; } diff --git a/drivers/infiniband/hw/nes/nes_verbs.c b/drivers/infiniband/hw/nes/nes_verbs.c index 137880a19ebe..8c4daf7f22ec 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.c +++ b/drivers/infiniband/hw/nes/nes_verbs.c @@ -206,80 +206,6 @@ static int nes_dealloc_mw(struct ib_mw *ibmw) } -/** - * nes_bind_mw - */ -static int nes_bind_mw(struct ib_qp *ibqp, struct ib_mw *ibmw, - struct ib_mw_bind *ibmw_bind) -{ - u64 u64temp; - struct nes_vnic *nesvnic = to_nesvnic(ibqp->device); - struct nes_device *nesdev = nesvnic->nesdev; - /* struct nes_mr *nesmr = to_nesmw(ibmw); */ - struct nes_qp *nesqp = to_nesqp(ibqp); - struct nes_hw_qp_wqe *wqe; - unsigned long flags = 0; - u32 head; - u32 wqe_misc = 0; - u32 qsize; - - if (nesqp->ibqp_state > IB_QPS_RTS) - return -EINVAL; - - spin_lock_irqsave(&nesqp->lock, flags); - - head = nesqp->hwqp.sq_head; - qsize = nesqp->hwqp.sq_tail; - - /* Check for SQ overflow */ - if (((head + (2 * qsize) - nesqp->hwqp.sq_tail) % qsize) == (qsize - 1)) { - spin_unlock_irqrestore(&nesqp->lock, flags); - return -ENOMEM; - } - - wqe = &nesqp->hwqp.sq_vbase[head]; - /* nes_debug(NES_DBG_MR, "processing sq wqe at %p, head = %u.\n", wqe, head); */ - nes_fill_init_qp_wqe(wqe, nesqp, head); - u64temp = ibmw_bind->wr_id; - set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_COMP_SCRATCH_LOW_IDX, u64temp); - wqe_misc = NES_IWARP_SQ_OP_BIND; - - wqe_misc |= NES_IWARP_SQ_WQE_LOCAL_FENCE; - - if (ibmw_bind->send_flags & IB_SEND_SIGNALED) - wqe_misc |= NES_IWARP_SQ_WQE_SIGNALED_COMPL; - - if (ibmw_bind->bind_info.mw_access_flags & IB_ACCESS_REMOTE_WRITE) - wqe_misc |= NES_CQP_STAG_RIGHTS_REMOTE_WRITE; - if (ibmw_bind->bind_info.mw_access_flags & IB_ACCESS_REMOTE_READ) - wqe_misc |= NES_CQP_STAG_RIGHTS_REMOTE_READ; - - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_WQE_MISC_IDX, wqe_misc); - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_MR_IDX, - ibmw_bind->bind_info.mr->lkey); - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_MW_IDX, ibmw->rkey); - set_wqe_32bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_LENGTH_LOW_IDX, - ibmw_bind->bind_info.length); - wqe->wqe_words[NES_IWARP_SQ_BIND_WQE_LENGTH_HIGH_IDX] = 0; - u64temp = (u64)ibmw_bind->bind_info.addr; - set_wqe_64bit_value(wqe->wqe_words, NES_IWARP_SQ_BIND_WQE_VA_FBO_LOW_IDX, u64temp); - - head++; - if (head >= qsize) - head = 0; - - nesqp->hwqp.sq_head = head; - barrier(); - - nes_write32(nesdev->regs+NES_WQE_ALLOC, - (1 << 24) | 0x00800000 | nesqp->hwqp.qp_id); - - spin_unlock_irqrestore(&nesqp->lock, flags); - - return 0; -} - - /* * nes_alloc_fast_mr */ @@ -2074,9 +2000,8 @@ static int nes_reg_mr(struct nes_device *nesdev, struct nes_pd *nespd, /** * nes_reg_phys_mr */ -static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, - struct ib_phys_buf *buffer_list, int num_phys_buf, int acc, - u64 * iova_start) +struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, u64 addr, u64 size, + int acc, u64 *iova_start) { u64 region_length; struct nes_pd *nespd = to_nespd(ib_pd); @@ -2088,13 +2013,10 @@ static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, struct nes_vpbl vpbl; struct nes_root_vpbl root_vpbl; u32 stag; - u32 i; unsigned long mask; u32 stag_index = 0; u32 next_stag_index = 0; u32 driver_key = 0; - u32 root_pbl_index = 0; - u32 
cur_pbl_index = 0; int err = 0; int ret = 0; u16 pbl_count = 0; @@ -2113,11 +2035,8 @@ static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, next_stag_index >>= 8; next_stag_index %= nesadapter->max_mr; - if (num_phys_buf > (1024*512)) { - return ERR_PTR(-E2BIG); - } - if ((buffer_list[0].addr ^ *iova_start) & ~PAGE_MASK) + if ((addr ^ *iova_start) & ~PAGE_MASK) return ERR_PTR(-EINVAL); err = nes_alloc_resource(nesadapter, nesadapter->allocated_mrs, nesadapter->max_mr, @@ -2132,84 +2051,33 @@ static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, return ERR_PTR(-ENOMEM); } - for (i = 0; i < num_phys_buf; i++) { + /* Allocate a 4K buffer for the PBL */ + vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096, + &vpbl.pbl_pbase); + nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%016lX\n", + vpbl.pbl_vbase, (unsigned long)vpbl.pbl_pbase); + if (!vpbl.pbl_vbase) { + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + ibmr = ERR_PTR(-ENOMEM); + kfree(nesmr); + goto reg_phys_err; + } - if ((i & 0x01FF) == 0) { - if (root_pbl_index == 1) { - /* Allocate the root PBL */ - root_vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 8192, - &root_vpbl.pbl_pbase); - nes_debug(NES_DBG_MR, "Allocating root PBL, va = %p, pa = 0x%08X\n", - root_vpbl.pbl_vbase, (unsigned int)root_vpbl.pbl_pbase); - if (!root_vpbl.pbl_vbase) { - pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, - vpbl.pbl_pbase); - nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); - kfree(nesmr); - return ERR_PTR(-ENOMEM); - } - root_vpbl.leaf_vpbl = kzalloc(sizeof(*root_vpbl.leaf_vpbl)*1024, GFP_KERNEL); - if (!root_vpbl.leaf_vpbl) { - pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase, - root_vpbl.pbl_pbase); - pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, - vpbl.pbl_pbase); - nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); - kfree(nesmr); - return ERR_PTR(-ENOMEM); - } - root_vpbl.pbl_vbase[0].pa_low = cpu_to_le32((u32)vpbl.pbl_pbase); - root_vpbl.pbl_vbase[0].pa_high = - cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32))); - root_vpbl.leaf_vpbl[0] = vpbl; - } - /* Allocate a 4K buffer for the PBL */ - vpbl.pbl_vbase = pci_alloc_consistent(nesdev->pcidev, 4096, - &vpbl.pbl_pbase); - nes_debug(NES_DBG_MR, "Allocating leaf PBL, va = %p, pa = 0x%016lX\n", - vpbl.pbl_vbase, (unsigned long)vpbl.pbl_pbase); - if (!vpbl.pbl_vbase) { - nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); - ibmr = ERR_PTR(-ENOMEM); - kfree(nesmr); - goto reg_phys_err; - } - /* Fill in the root table */ - if (1 <= root_pbl_index) { - root_vpbl.pbl_vbase[root_pbl_index].pa_low = - cpu_to_le32((u32)vpbl.pbl_pbase); - root_vpbl.pbl_vbase[root_pbl_index].pa_high = - cpu_to_le32((u32)((((u64)vpbl.pbl_pbase) >> 32))); - root_vpbl.leaf_vpbl[root_pbl_index] = vpbl; - } - root_pbl_index++; - cur_pbl_index = 0; - } - mask = !buffer_list[i].size; - if (i != 0) - mask |= buffer_list[i].addr; - if (i != num_phys_buf - 1) - mask |= buffer_list[i].addr + buffer_list[i].size; - - if (mask & ~PAGE_MASK) { - nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); - nes_debug(NES_DBG_MR, "Invalid buffer addr or size\n"); - ibmr = ERR_PTR(-EINVAL); - kfree(nesmr); - goto reg_phys_err; - } + mask = !size; - region_length += buffer_list[i].size; - if ((i != 0) && (single_page)) { - if ((buffer_list[i-1].addr+PAGE_SIZE) != buffer_list[i].addr) - single_page = 0; - } - vpbl.pbl_vbase[cur_pbl_index].pa_low = cpu_to_le32((u32)buffer_list[i].addr 
& PAGE_MASK); - vpbl.pbl_vbase[cur_pbl_index++].pa_high = - cpu_to_le32((u32)((((u64)buffer_list[i].addr) >> 32))); + if (mask & ~PAGE_MASK) { + nes_free_resource(nesadapter, nesadapter->allocated_mrs, stag_index); + nes_debug(NES_DBG_MR, "Invalid buffer addr or size\n"); + ibmr = ERR_PTR(-EINVAL); + kfree(nesmr); + goto reg_phys_err; } + region_length += size; + vpbl.pbl_vbase[0].pa_low = cpu_to_le32((u32)addr & PAGE_MASK); + vpbl.pbl_vbase[0].pa_high = cpu_to_le32((u32)((((u64)addr) >> 32))); + stag = stag_index << 8; stag |= driver_key; stag += (u32)stag_key; @@ -2219,17 +2087,15 @@ static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, stag, (unsigned long)*iova_start, (unsigned long)region_length, stag_index); /* Make the leaf PBL the root if only one PBL */ - if (root_pbl_index == 1) { - root_vpbl.pbl_pbase = vpbl.pbl_pbase; - } + root_vpbl.pbl_pbase = vpbl.pbl_pbase; if (single_page) { pbl_count = 0; } else { - pbl_count = root_pbl_index; + pbl_count = 1; } ret = nes_reg_mr(nesdev, nespd, stag, region_length, &root_vpbl, - buffer_list[0].addr, pbl_count, (u16)cur_pbl_index, acc, iova_start, + addr, pbl_count, 1, acc, iova_start, &nesmr->pbls_used, &nesmr->pbl_4k); if (ret == 0) { @@ -2242,21 +2108,9 @@ static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, ibmr = ERR_PTR(-ENOMEM); } - reg_phys_err: - /* free the resources */ - if (root_pbl_index == 1) { - /* single PBL case */ - pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, vpbl.pbl_pbase); - } else { - for (i=0; i<root_pbl_index; i++) { - pci_free_consistent(nesdev->pcidev, 4096, root_vpbl.leaf_vpbl[i].pbl_vbase, - root_vpbl.leaf_vpbl[i].pbl_pbase); - } - kfree(root_vpbl.leaf_vpbl); - pci_free_consistent(nesdev->pcidev, 8192, root_vpbl.pbl_vbase, - root_vpbl.pbl_pbase); - } - +reg_phys_err: + /* single PBL case */ + pci_free_consistent(nesdev->pcidev, 4096, vpbl.pbl_vbase, vpbl.pbl_pbase); return ibmr; } @@ -2266,17 +2120,13 @@ static struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, */ static struct ib_mr *nes_get_dma_mr(struct ib_pd *pd, int acc) { - struct ib_phys_buf bl; u64 kva = 0; nes_debug(NES_DBG_MR, "\n"); - bl.size = (u64)0xffffffffffULL; - bl.addr = 0; - return nes_reg_phys_mr(pd, &bl, 1, acc, &kva); + return nes_reg_phys_mr(pd, 0, 0xffffffffffULL, acc, &kva); } - /** * nes_reg_user_mr */ @@ -3888,12 +3738,10 @@ struct nes_ib_device *nes_init_ofa_device(struct net_device *netdev) nesibdev->ibdev.destroy_cq = nes_destroy_cq; nesibdev->ibdev.poll_cq = nes_poll_cq; nesibdev->ibdev.get_dma_mr = nes_get_dma_mr; - nesibdev->ibdev.reg_phys_mr = nes_reg_phys_mr; nesibdev->ibdev.reg_user_mr = nes_reg_user_mr; nesibdev->ibdev.dereg_mr = nes_dereg_mr; nesibdev->ibdev.alloc_mw = nes_alloc_mw; nesibdev->ibdev.dealloc_mw = nes_dealloc_mw; - nesibdev->ibdev.bind_mw = nes_bind_mw; nesibdev->ibdev.alloc_mr = nes_alloc_mr; nesibdev->ibdev.map_mr_sg = nes_map_mr_sg; diff --git a/drivers/infiniband/hw/nes/nes_verbs.h b/drivers/infiniband/hw/nes/nes_verbs.h index a204b677af22..70290883d067 100644 --- a/drivers/infiniband/hw/nes/nes_verbs.h +++ b/drivers/infiniband/hw/nes/nes_verbs.h @@ -190,4 +190,8 @@ struct nes_qp { u8 pau_state; __u64 nesuqp_addr; }; + +struct ib_mr *nes_reg_phys_mr(struct ib_pd *ib_pd, + u64 addr, u64 size, int acc, u64 *iova_start); + #endif /* NES_VERBS_H */ diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c index 9820074be59d..3790771f2baa 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_ah.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_ah.c @@ -152,9 
+152,10 @@ struct ib_ah *ocrdma_create_ah(struct ib_pd *ibpd, struct ib_ah_attr *attr) if ((pd->uctx) && (!rdma_is_multicast_addr((struct in6_addr *)attr->grh.dgid.raw)) && (!rdma_link_local_addr((struct in6_addr *)attr->grh.dgid.raw))) { - status = rdma_addr_find_dmac_by_grh(&sgid, &attr->grh.dgid, - attr->dmac, &vlan_tag, - sgid_attr.ndev->ifindex); + status = rdma_addr_find_l2_eth_by_grh(&sgid, &attr->grh.dgid, + attr->dmac, &vlan_tag, + &sgid_attr.ndev->ifindex, + NULL); if (status) { pr_err("%s(): Failed to resolve dmac from gid." "status = %d\n", __func__, status); diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_main.c b/drivers/infiniband/hw/ocrdma/ocrdma_main.c index 3afb40b85159..573849354cb9 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_main.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_main.c @@ -175,7 +175,6 @@ static int ocrdma_register_device(struct ocrdma_dev *dev) dev->ibdev.req_notify_cq = ocrdma_arm_cq; dev->ibdev.get_dma_mr = ocrdma_get_dma_mr; - dev->ibdev.reg_phys_mr = ocrdma_reg_kernel_mr; dev->ibdev.dereg_mr = ocrdma_dereg_mr; dev->ibdev.reg_user_mr = ocrdma_reg_user_mr; diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c index 76e96f97b3f6..d4c687b548d8 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.c @@ -3066,169 +3066,6 @@ pl_err: return ERR_PTR(-ENOMEM); } -#define MAX_KERNEL_PBE_SIZE 65536 -static inline int count_kernel_pbes(struct ib_phys_buf *buf_list, - int buf_cnt, u32 *pbe_size) -{ - u64 total_size = 0; - u64 buf_size = 0; - int i; - *pbe_size = roundup(buf_list[0].size, PAGE_SIZE); - *pbe_size = roundup_pow_of_two(*pbe_size); - - /* find the smallest PBE size that we can have */ - for (i = 0; i < buf_cnt; i++) { - /* first addr may not be page aligned, so ignore checking */ - if ((i != 0) && ((buf_list[i].addr & ~PAGE_MASK) || - (buf_list[i].size & ~PAGE_MASK))) { - return 0; - } - - /* if configured PBE size is greater then the chosen one, - * reduce the PBE size. - */ - buf_size = roundup(buf_list[i].size, PAGE_SIZE); - /* pbe_size has to be even multiple of 4K 1,2,4,8...*/ - buf_size = roundup_pow_of_two(buf_size); - if (*pbe_size > buf_size) - *pbe_size = buf_size; - - total_size += buf_size; - } - *pbe_size = *pbe_size > MAX_KERNEL_PBE_SIZE ? - (MAX_KERNEL_PBE_SIZE) : (*pbe_size); - - /* num_pbes = total_size / (*pbe_size); this is implemented below. */ - - return total_size >> ilog2(*pbe_size); -} - -static void build_kernel_pbes(struct ib_phys_buf *buf_list, int ib_buf_cnt, - u32 pbe_size, struct ocrdma_pbl *pbl_tbl, - struct ocrdma_hw_mr *hwmr) -{ - int i; - int idx; - int pbes_per_buf = 0; - u64 buf_addr = 0; - int num_pbes; - struct ocrdma_pbe *pbe; - int total_num_pbes = 0; - - if (!hwmr->num_pbes) - return; - - pbe = (struct ocrdma_pbe *)pbl_tbl->va; - num_pbes = 0; - - /* go through the OS phy regions & fill hw pbe entries into pbls. */ - for (i = 0; i < ib_buf_cnt; i++) { - buf_addr = buf_list[i].addr; - pbes_per_buf = - roundup_pow_of_two(roundup(buf_list[i].size, PAGE_SIZE)) / - pbe_size; - hwmr->len += buf_list[i].size; - /* number of pbes can be more for one OS buf, when - * buffers are of different sizes. - * split the ib_buf to one or more pbes. - */ - for (idx = 0; idx < pbes_per_buf; idx++) { - /* we program always page aligned addresses, - * first unaligned address is taken care by fbo. - */ - if (i == 0) { - /* for non zero fbo, assign the - * start of the page. 
- */ - pbe->pa_lo = - cpu_to_le32((u32) (buf_addr & PAGE_MASK)); - pbe->pa_hi = - cpu_to_le32((u32) upper_32_bits(buf_addr)); - } else { - pbe->pa_lo = - cpu_to_le32((u32) (buf_addr & 0xffffffff)); - pbe->pa_hi = - cpu_to_le32((u32) upper_32_bits(buf_addr)); - } - buf_addr += pbe_size; - num_pbes += 1; - total_num_pbes += 1; - pbe++; - - if (total_num_pbes == hwmr->num_pbes) - goto mr_tbl_done; - /* if the pbl is full storing the pbes, - * move to next pbl. - */ - if (num_pbes == (hwmr->pbl_size/sizeof(u64))) { - pbl_tbl++; - pbe = (struct ocrdma_pbe *)pbl_tbl->va; - num_pbes = 0; - } - } - } -mr_tbl_done: - return; -} - -struct ib_mr *ocrdma_reg_kernel_mr(struct ib_pd *ibpd, - struct ib_phys_buf *buf_list, - int buf_cnt, int acc, u64 *iova_start) -{ - int status = -ENOMEM; - struct ocrdma_mr *mr; - struct ocrdma_pd *pd = get_ocrdma_pd(ibpd); - struct ocrdma_dev *dev = get_ocrdma_dev(ibpd->device); - u32 num_pbes; - u32 pbe_size = 0; - - if ((acc & IB_ACCESS_REMOTE_WRITE) && !(acc & IB_ACCESS_LOCAL_WRITE)) - return ERR_PTR(-EINVAL); - - mr = kzalloc(sizeof(*mr), GFP_KERNEL); - if (!mr) - return ERR_PTR(status); - - num_pbes = count_kernel_pbes(buf_list, buf_cnt, &pbe_size); - if (num_pbes == 0) { - status = -EINVAL; - goto pbl_err; - } - status = ocrdma_get_pbl_info(dev, mr, num_pbes); - if (status) - goto pbl_err; - - mr->hwmr.pbe_size = pbe_size; - mr->hwmr.fbo = *iova_start - (buf_list[0].addr & PAGE_MASK); - mr->hwmr.va = *iova_start; - mr->hwmr.local_rd = 1; - mr->hwmr.remote_wr = (acc & IB_ACCESS_REMOTE_WRITE) ? 1 : 0; - mr->hwmr.remote_rd = (acc & IB_ACCESS_REMOTE_READ) ? 1 : 0; - mr->hwmr.local_wr = (acc & IB_ACCESS_LOCAL_WRITE) ? 1 : 0; - mr->hwmr.remote_atomic = (acc & IB_ACCESS_REMOTE_ATOMIC) ? 1 : 0; - mr->hwmr.mw_bind = (acc & IB_ACCESS_MW_BIND) ? 
1 : 0; - - status = ocrdma_build_pbl_tbl(dev, &mr->hwmr); - if (status) - goto pbl_err; - build_kernel_pbes(buf_list, buf_cnt, pbe_size, mr->hwmr.pbl_table, - &mr->hwmr); - status = ocrdma_reg_mr(dev, &mr->hwmr, pd->id, acc); - if (status) - goto mbx_err; - - mr->ibmr.lkey = mr->hwmr.lkey; - if (mr->hwmr.remote_wr || mr->hwmr.remote_rd) - mr->ibmr.rkey = mr->hwmr.lkey; - return &mr->ibmr; - -mbx_err: - ocrdma_free_mr_pbl_tbl(dev, &mr->hwmr); -pbl_err: - kfree(mr); - return ERR_PTR(status); -} - static int ocrdma_set_page(struct ib_mr *ibmr, u64 addr) { struct ocrdma_mr *mr = get_ocrdma_mr(ibmr); diff --git a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h index a2f3b4dc20b0..8b517fd36779 100644 --- a/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h +++ b/drivers/infiniband/hw/ocrdma/ocrdma_verbs.h @@ -117,9 +117,6 @@ int ocrdma_post_srq_recv(struct ib_srq *, struct ib_recv_wr *, int ocrdma_dereg_mr(struct ib_mr *); struct ib_mr *ocrdma_get_dma_mr(struct ib_pd *, int acc); -struct ib_mr *ocrdma_reg_kernel_mr(struct ib_pd *, - struct ib_phys_buf *buffer_list, - int num_phys_buf, int acc, u64 *iova_start); struct ib_mr *ocrdma_reg_user_mr(struct ib_pd *, u64 start, u64 length, u64 virt, int acc, struct ib_udata *); struct ib_mr *ocrdma_alloc_mr(struct ib_pd *pd, diff --git a/drivers/infiniband/hw/qib/qib_fs.c b/drivers/infiniband/hw/qib/qib_fs.c index 13ef22bd9459..fcdf37913a26 100644 --- a/drivers/infiniband/hw/qib/qib_fs.c +++ b/drivers/infiniband/hw/qib/qib_fs.c @@ -89,14 +89,14 @@ static int create_file(const char *name, umode_t mode, { int error; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); *dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(*dentry)) error = qibfs_mknod(d_inode(parent), *dentry, mode, fops, data); else error = PTR_ERR(*dentry); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); return error; } @@ -481,7 +481,7 @@ static int remove_device_files(struct super_block *sb, int ret, i; root = dget(sb->s_root); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); snprintf(unit, sizeof(unit), "%u", dd->unit); dir = lookup_one_len(unit, root, strlen(unit)); @@ -491,7 +491,7 @@ static int remove_device_files(struct super_block *sb, goto bail; } - mutex_lock(&d_inode(dir)->i_mutex); + inode_lock(d_inode(dir)); remove_file(dir, "counters"); remove_file(dir, "counter_names"); remove_file(dir, "portcounter_names"); @@ -506,13 +506,13 @@ static int remove_device_files(struct super_block *sb, } } remove_file(dir, "flash"); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); ret = simple_rmdir(d_inode(root), dir); d_delete(dir); dput(dir); bail: - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); dput(root); return ret; } diff --git a/drivers/infiniband/hw/qib/qib_mr.c b/drivers/infiniband/hw/qib/qib_mr.c index 294f5c706be9..5f53304e8a9b 100644 --- a/drivers/infiniband/hw/qib/qib_mr.c +++ b/drivers/infiniband/hw/qib/qib_mr.c @@ -150,10 +150,7 @@ static struct qib_mr *alloc_mr(int count, struct ib_pd *pd) rval = init_qib_mregion(&mr->mr, pd, count); if (rval) goto bail; - /* - * ib_reg_phys_mr() will initialize mr->ibmr except for - * lkey and rkey. 
- */ + rval = qib_alloc_lkey(&mr->mr, 0); if (rval) goto bail_mregion; @@ -171,52 +168,6 @@ bail: } /** - * qib_reg_phys_mr - register a physical memory region - * @pd: protection domain for this memory region - * @buffer_list: pointer to the list of physical buffers to register - * @num_phys_buf: the number of physical buffers to register - * @iova_start: the starting address passed over IB which maps to this MR - * - * Returns the memory region on success, otherwise returns an errno. - */ -struct ib_mr *qib_reg_phys_mr(struct ib_pd *pd, - struct ib_phys_buf *buffer_list, - int num_phys_buf, int acc, u64 *iova_start) -{ - struct qib_mr *mr; - int n, m, i; - struct ib_mr *ret; - - mr = alloc_mr(num_phys_buf, pd); - if (IS_ERR(mr)) { - ret = (struct ib_mr *)mr; - goto bail; - } - - mr->mr.user_base = *iova_start; - mr->mr.iova = *iova_start; - mr->mr.access_flags = acc; - - m = 0; - n = 0; - for (i = 0; i < num_phys_buf; i++) { - mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr; - mr->mr.map[m]->segs[n].length = buffer_list[i].size; - mr->mr.length += buffer_list[i].size; - n++; - if (n == QIB_SEGSZ) { - m++; - n = 0; - } - } - - ret = &mr->ibmr; - -bail: - return ret; -} - -/** * qib_reg_user_mr - register a userspace memory region * @pd: protection domain for this memory region * @start: starting userspace address diff --git a/drivers/infiniband/hw/qib/qib_qp.c b/drivers/infiniband/hw/qib/qib_qp.c index 40f85bb3e0d3..3eff35c2d453 100644 --- a/drivers/infiniband/hw/qib/qib_qp.c +++ b/drivers/infiniband/hw/qib/qib_qp.c @@ -100,9 +100,10 @@ static u32 credit_table[31] = { 32768 /* 1E */ }; -static void get_map_page(struct qib_qpn_table *qpt, struct qpn_map *map) +static void get_map_page(struct qib_qpn_table *qpt, struct qpn_map *map, + gfp_t gfp) { - unsigned long page = get_zeroed_page(GFP_KERNEL); + unsigned long page = get_zeroed_page(gfp); /* * Free the page if someone raced with us installing it. @@ -121,7 +122,7 @@ static void get_map_page(struct qib_qpn_table *qpt, struct qpn_map *map) * zero/one for QP type IB_QPT_SMI/IB_QPT_GSI. */ static int alloc_qpn(struct qib_devdata *dd, struct qib_qpn_table *qpt, - enum ib_qp_type type, u8 port) + enum ib_qp_type type, u8 port, gfp_t gfp) { u32 i, offset, max_scan, qpn; struct qpn_map *map; @@ -151,7 +152,7 @@ static int alloc_qpn(struct qib_devdata *dd, struct qib_qpn_table *qpt, max_scan = qpt->nmaps - !offset; for (i = 0;;) { if (unlikely(!map->page)) { - get_map_page(qpt, map); + get_map_page(qpt, map, gfp); if (unlikely(!map->page)) break; } @@ -983,13 +984,21 @@ struct ib_qp *qib_create_qp(struct ib_pd *ibpd, size_t sz; size_t sg_list_sz; struct ib_qp *ret; + gfp_t gfp; + if (init_attr->cap.max_send_sge > ib_qib_max_sges || init_attr->cap.max_send_wr > ib_qib_max_qp_wrs || - init_attr->create_flags) { - ret = ERR_PTR(-EINVAL); - goto bail; - } + init_attr->create_flags & ~(IB_QP_CREATE_USE_GFP_NOIO)) + return ERR_PTR(-EINVAL); + + /* GFP_NOIO is applicable in RC QPs only */ + if (init_attr->create_flags & IB_QP_CREATE_USE_GFP_NOIO && + init_attr->qp_type != IB_QPT_RC) + return ERR_PTR(-EINVAL); + + gfp = init_attr->create_flags & IB_QP_CREATE_USE_GFP_NOIO ? + GFP_NOIO : GFP_KERNEL; /* Check receive queue parameters if no SRQ is specified. 
*/ if (!init_attr->srq) { @@ -1021,7 +1030,8 @@ struct ib_qp *qib_create_qp(struct ib_pd *ibpd, sz = sizeof(struct qib_sge) * init_attr->cap.max_send_sge + sizeof(struct qib_swqe); - swq = vmalloc((init_attr->cap.max_send_wr + 1) * sz); + swq = __vmalloc((init_attr->cap.max_send_wr + 1) * sz, + gfp, PAGE_KERNEL); if (swq == NULL) { ret = ERR_PTR(-ENOMEM); goto bail; @@ -1037,13 +1047,13 @@ struct ib_qp *qib_create_qp(struct ib_pd *ibpd, } else if (init_attr->cap.max_recv_sge > 1) sg_list_sz = sizeof(*qp->r_sg_list) * (init_attr->cap.max_recv_sge - 1); - qp = kzalloc(sz + sg_list_sz, GFP_KERNEL); + qp = kzalloc(sz + sg_list_sz, gfp); if (!qp) { ret = ERR_PTR(-ENOMEM); goto bail_swq; } RCU_INIT_POINTER(qp->next, NULL); - qp->s_hdr = kzalloc(sizeof(*qp->s_hdr), GFP_KERNEL); + qp->s_hdr = kzalloc(sizeof(*qp->s_hdr), gfp); if (!qp->s_hdr) { ret = ERR_PTR(-ENOMEM); goto bail_qp; @@ -1058,8 +1068,16 @@ struct ib_qp *qib_create_qp(struct ib_pd *ibpd, qp->r_rq.max_sge = init_attr->cap.max_recv_sge; sz = (sizeof(struct ib_sge) * qp->r_rq.max_sge) + sizeof(struct qib_rwqe); - qp->r_rq.wq = vmalloc_user(sizeof(struct qib_rwq) + - qp->r_rq.size * sz); + if (gfp != GFP_NOIO) + qp->r_rq.wq = vmalloc_user( + sizeof(struct qib_rwq) + + qp->r_rq.size * sz); + else + qp->r_rq.wq = __vmalloc( + sizeof(struct qib_rwq) + + qp->r_rq.size * sz, + gfp, PAGE_KERNEL); + if (!qp->r_rq.wq) { ret = ERR_PTR(-ENOMEM); goto bail_qp; @@ -1090,7 +1108,7 @@ struct ib_qp *qib_create_qp(struct ib_pd *ibpd, dev = to_idev(ibpd->device); dd = dd_from_dev(dev); err = alloc_qpn(dd, &dev->qpn_table, init_attr->qp_type, - init_attr->port_num); + init_attr->port_num, gfp); if (err < 0) { ret = ERR_PTR(err); vfree(qp->r_rq.wq); diff --git a/drivers/infiniband/hw/qib/qib_verbs.c b/drivers/infiniband/hw/qib/qib_verbs.c index de6cb6fcda8d..baf1e42b6896 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.c +++ b/drivers/infiniband/hw/qib/qib_verbs.c @@ -346,6 +346,7 @@ static int qib_post_one_send(struct qib_qp *qp, struct ib_send_wr *wr, unsigned long flags; struct qib_lkey_table *rkt; struct qib_pd *pd; + int avoid_schedule = 0; spin_lock_irqsave(&qp->s_lock, flags); @@ -438,11 +439,15 @@ static int qib_post_one_send(struct qib_qp *qp, struct ib_send_wr *wr, qp->ibqp.qp_type == IB_QPT_RC) { if (wqe->length > 0x80000000U) goto bail_inval_free; + if (wqe->length <= qp->pmtu) + avoid_schedule = 1; } else if (wqe->length > (dd_from_ibdev(qp->ibqp.device)->pport + - qp->port_num - 1)->ibmtu) + qp->port_num - 1)->ibmtu) { goto bail_inval_free; - else + } else { atomic_inc(&to_iah(ud_wr(wr)->ah)->refcount); + avoid_schedule = 1; + } wqe->ssn = qp->s_ssn++; qp->s_head = next; @@ -458,7 +463,7 @@ bail_inval_free: bail_inval: ret = -EINVAL; bail: - if (!ret && !wr->next && + if (!ret && !wr->next && !avoid_schedule && !qib_sdma_empty( dd_from_ibdev(qp->ibqp.device)->pport + qp->port_num - 1)) { qib_schedule_send(qp); @@ -2256,7 +2261,6 @@ int qib_register_ib_device(struct qib_devdata *dd) ibdev->poll_cq = qib_poll_cq; ibdev->req_notify_cq = qib_req_notify_cq; ibdev->get_dma_mr = qib_get_dma_mr; - ibdev->reg_phys_mr = qib_reg_phys_mr; ibdev->reg_user_mr = qib_reg_user_mr; ibdev->dereg_mr = qib_dereg_mr; ibdev->alloc_mr = qib_alloc_mr; diff --git a/drivers/infiniband/hw/qib/qib_verbs.h b/drivers/infiniband/hw/qib/qib_verbs.h index bc803f33d5f6..6c5e77753d85 100644 --- a/drivers/infiniband/hw/qib/qib_verbs.h +++ b/drivers/infiniband/hw/qib/qib_verbs.h @@ -1032,10 +1032,6 @@ int qib_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); struct 
ib_mr *qib_get_dma_mr(struct ib_pd *pd, int acc); -struct ib_mr *qib_reg_phys_mr(struct ib_pd *pd, - struct ib_phys_buf *buffer_list, - int num_phys_buf, int acc, u64 *iova_start); - struct ib_mr *qib_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int mr_access_flags, struct ib_udata *udata); diff --git a/drivers/infiniband/hw/qib/qib_verbs_mcast.c b/drivers/infiniband/hw/qib/qib_verbs_mcast.c index f8ea069a3eaf..b2fb5286dbd9 100644 --- a/drivers/infiniband/hw/qib/qib_verbs_mcast.c +++ b/drivers/infiniband/hw/qib/qib_verbs_mcast.c @@ -286,15 +286,13 @@ int qib_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) struct qib_ibdev *dev = to_idev(ibqp->device); struct qib_ibport *ibp = to_iport(ibqp->device, qp->port_num); struct qib_mcast *mcast = NULL; - struct qib_mcast_qp *p, *tmp; + struct qib_mcast_qp *p, *tmp, *delp = NULL; struct rb_node *n; int last = 0; int ret; - if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) { - ret = -EINVAL; - goto bail; - } + if (ibqp->qp_num <= 1 || qp->state == IB_QPS_RESET) + return -EINVAL; spin_lock_irq(&ibp->lock); @@ -303,8 +301,7 @@ int qib_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) while (1) { if (n == NULL) { spin_unlock_irq(&ibp->lock); - ret = -EINVAL; - goto bail; + return -EINVAL; } mcast = rb_entry(n, struct qib_mcast, rb_node); @@ -328,6 +325,7 @@ int qib_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) */ list_del_rcu(&p->list); mcast->n_attached--; + delp = p; /* If this was the last attached QP, remove the GID too. */ if (list_empty(&mcast->qp_list)) { @@ -338,15 +336,16 @@ int qib_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) } spin_unlock_irq(&ibp->lock); + /* QP not attached */ + if (!delp) + return -EINVAL; + /* + * Wait for any list walkers to finish before freeing the + * list element. + */ + wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1); + qib_mcast_qp_free(delp); - if (p) { - /* - * Wait for any list walkers to finish before freeing the - * list element. 
- */ - wait_event(mcast->wait, atomic_read(&mcast->refcount) <= 1); - qib_mcast_qp_free(p); - } if (last) { atomic_dec(&mcast->refcount); wait_event(mcast->wait, !atomic_read(&mcast->refcount)); @@ -355,11 +354,7 @@ int qib_multicast_detach(struct ib_qp *ibqp, union ib_gid *gid, u16 lid) dev->n_mcast_grps_allocated--; spin_unlock_irq(&dev->n_mcast_grps_lock); } - - ret = 0; - -bail: - return ret; + return 0; } int qib_mcast_tree_empty(struct qib_ibport *ibp) diff --git a/drivers/infiniband/hw/usnic/usnic_debugfs.c b/drivers/infiniband/hw/usnic/usnic_debugfs.c index 5e55b8bc6fe4..92dc66cc2d50 100644 --- a/drivers/infiniband/hw/usnic/usnic_debugfs.c +++ b/drivers/infiniband/hw/usnic/usnic_debugfs.c @@ -157,8 +157,9 @@ void usnic_debugfs_flow_add(struct usnic_ib_qp_grp_flow *qp_flow) qp_flow, &flowinfo_ops); if (IS_ERR_OR_NULL(qp_flow->dbgfs_dentry)) { - usnic_err("Failed to create dbg fs entry for flow %u\n", - qp_flow->flow->flow_id); + usnic_err("Failed to create dbg fs entry for flow %u with error %ld\n", + qp_flow->flow->flow_id, + PTR_ERR(qp_flow->dbgfs_dentry)); } } diff --git a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c index fcea3a24d3eb..5f44b66ccb86 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_qp_grp.c @@ -521,7 +521,7 @@ int usnic_ib_qp_grp_modify(struct usnic_ib_qp_grp *qp_grp, if (!status) { qp_grp->state = new_state; - usnic_info("Transistioned %u from %s to %s", + usnic_info("Transitioned %u from %s to %s", qp_grp->grp_id, usnic_ib_qp_grp_state_to_string(old_state), usnic_ib_qp_grp_state_to_string(new_state)); @@ -575,7 +575,7 @@ alloc_res_chunk_list(struct usnic_vnic *vnic, return res_chunk_list; out_free_res: - for (i--; i > 0; i--) + for (i--; i >= 0; i--) usnic_vnic_put_resources(res_chunk_list[i]); kfree(res_chunk_list); return ERR_PTR(err); diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c index f8e3211689a3..6cdb4d23f78f 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.c +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.c @@ -51,7 +51,7 @@ static void usnic_ib_fw_string_to_u64(char *fw_ver_str, u64 *fw_ver) { - *fw_ver = (u64) *fw_ver_str; + *fw_ver = *((u64 *)fw_ver_str); } static int usnic_ib_fill_create_qp_resp(struct usnic_ib_qp_grp *qp_grp, @@ -571,20 +571,20 @@ int usnic_ib_modify_qp(struct ib_qp *ibqp, struct ib_qp_attr *attr, qp_grp = to_uqp_grp(ibqp); - /* TODO: Future Support All States */ mutex_lock(&qp_grp->vf->pf->usdev_lock); - if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_INIT) { - status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_INIT, NULL); - } else if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_RTR) { - status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_RTR, NULL); - } else if ((attr_mask & IB_QP_STATE) && attr->qp_state == IB_QPS_RTS) { - status = usnic_ib_qp_grp_modify(qp_grp, IB_QPS_RTS, NULL); + if ((attr_mask & IB_QP_PORT) && attr->port_num != 1) { + /* usnic devices only have one port */ + status = -EINVAL; + goto out_unlock; + } + if (attr_mask & IB_QP_STATE) { + status = usnic_ib_qp_grp_modify(qp_grp, attr->qp_state, NULL); } else { - usnic_err("Unexpected combination mask: %u state: %u\n", - attr_mask & IB_QP_STATE, attr->qp_state); + usnic_err("Unhandled request, attr_mask=0x%x\n", attr_mask); status = -EINVAL; } +out_unlock: mutex_unlock(&qp_grp->vf->pf->usdev_lock); return status; } @@ -625,8 +625,8 @@ struct ib_mr *usnic_ib_reg_mr(struct ib_pd *pd, u64 
start, u64 length, virt_addr, length); mr = kzalloc(sizeof(*mr), GFP_KERNEL); - if (IS_ERR_OR_NULL(mr)) - return ERR_PTR(mr ? PTR_ERR(mr) : -ENOMEM); + if (!mr) + return ERR_PTR(-ENOMEM); mr->umem = usnic_uiom_reg_get(to_upd(pd)->umem_pd, start, length, access_flags, 0); diff --git a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h index 414eaa566bd9..0d9d2e6a14d5 100644 --- a/drivers/infiniband/hw/usnic/usnic_ib_verbs.h +++ b/drivers/infiniband/hw/usnic/usnic_ib_verbs.h @@ -43,8 +43,6 @@ int usnic_ib_query_device(struct ib_device *ibdev, struct ib_udata *uhw); int usnic_ib_query_port(struct ib_device *ibdev, u8 port, struct ib_port_attr *props); -enum rdma_protocol_type -usnic_ib_query_protocol(struct ib_device *device, u8 port_num); int usnic_ib_query_qp(struct ib_qp *qp, struct ib_qp_attr *qp_attr, int qp_attr_mask, struct ib_qp_init_attr *qp_init_attr); diff --git a/drivers/infiniband/hw/usnic/usnic_vnic.c b/drivers/infiniband/hw/usnic/usnic_vnic.c index 66de93fb8ea9..887510718690 100644 --- a/drivers/infiniband/hw/usnic/usnic_vnic.c +++ b/drivers/infiniband/hw/usnic/usnic_vnic.c @@ -237,7 +237,7 @@ usnic_vnic_get_resources(struct usnic_vnic *vnic, enum usnic_vnic_res_type type, struct usnic_vnic_res *res; int i; - if (usnic_vnic_res_free_cnt(vnic, type) < cnt || cnt < 1 || !owner) + if (usnic_vnic_res_free_cnt(vnic, type) < cnt || cnt < 0 || !owner) return ERR_PTR(-EINVAL); ret = kzalloc(sizeof(*ret), GFP_ATOMIC); @@ -247,26 +247,28 @@ usnic_vnic_get_resources(struct usnic_vnic *vnic, enum usnic_vnic_res_type type, return ERR_PTR(-ENOMEM); } - ret->res = kzalloc(sizeof(*(ret->res))*cnt, GFP_ATOMIC); - if (!ret->res) { - usnic_err("Failed to allocate resources for %s. Out of memory\n", - usnic_vnic_pci_name(vnic)); - kfree(ret); - return ERR_PTR(-ENOMEM); - } + if (cnt > 0) { + ret->res = kcalloc(cnt, sizeof(*(ret->res)), GFP_ATOMIC); + if (!ret->res) { + usnic_err("Failed to allocate resources for %s. 
Out of memory\n", + usnic_vnic_pci_name(vnic)); + kfree(ret); + return ERR_PTR(-ENOMEM); + } - spin_lock(&vnic->res_lock); - src = &vnic->chunks[type]; - for (i = 0; i < src->cnt && ret->cnt < cnt; i++) { - res = src->res[i]; - if (!res->owner) { - src->free_cnt--; - res->owner = owner; - ret->res[ret->cnt++] = res; + spin_lock(&vnic->res_lock); + src = &vnic->chunks[type]; + for (i = 0; i < src->cnt && ret->cnt < cnt; i++) { + res = src->res[i]; + if (!res->owner) { + src->free_cnt--; + res->owner = owner; + ret->res[ret->cnt++] = res; + } } - } - spin_unlock(&vnic->res_lock); + spin_unlock(&vnic->res_lock); + } ret->type = type; ret->vnic = vnic; WARN_ON(ret->cnt != cnt); @@ -281,14 +283,16 @@ void usnic_vnic_put_resources(struct usnic_vnic_res_chunk *chunk) int i; struct usnic_vnic *vnic = chunk->vnic; - spin_lock(&vnic->res_lock); - while ((i = --chunk->cnt) >= 0) { - res = chunk->res[i]; - chunk->res[i] = NULL; - res->owner = NULL; - vnic->chunks[res->type].free_cnt++; + if (chunk->cnt > 0) { + spin_lock(&vnic->res_lock); + while ((i = --chunk->cnt) >= 0) { + res = chunk->res[i]; + chunk->res[i] = NULL; + res->owner = NULL; + vnic->chunks[res->type].free_cnt++; + } + spin_unlock(&vnic->res_lock); } - spin_unlock(&vnic->res_lock); kfree(chunk->res); kfree(chunk); diff --git a/drivers/infiniband/ulp/ipoib/ipoib.h b/drivers/infiniband/ulp/ipoib/ipoib.h index 3ede10309754..a6f3eab0f350 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib.h +++ b/drivers/infiniband/ulp/ipoib/ipoib.h @@ -495,7 +495,6 @@ void ipoib_dev_cleanup(struct net_device *dev); void ipoib_mcast_join_task(struct work_struct *work); void ipoib_mcast_carrier_on_task(struct work_struct *work); void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb); -void ipoib_mcast_free(struct ipoib_mcast *mc); void ipoib_mcast_restart_task(struct work_struct *work); int ipoib_mcast_start_thread(struct net_device *dev); @@ -549,8 +548,9 @@ void ipoib_path_iter_read(struct ipoib_path_iter *iter, int ipoib_mcast_attach(struct net_device *dev, u16 mlid, union ib_gid *mgid, int set_qkey); -int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast); -struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid); +void ipoib_mcast_remove_list(struct list_head *remove_list); +void ipoib_check_and_add_mcast_sendonly(struct ipoib_dev_priv *priv, u8 *mgid, + struct list_head *remove_list); int ipoib_init_qp(struct net_device *dev); int ipoib_transport_dev_init(struct net_device *dev, struct ib_device *ca); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_cm.c b/drivers/infiniband/ulp/ipoib/ipoib_cm.c index 3ae9726efb98..917e46ea3bf6 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_cm.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_cm.c @@ -70,7 +70,6 @@ static struct ib_qp_attr ipoib_cm_err_attr = { #define IPOIB_CM_RX_DRAIN_WRID 0xffffffff static struct ib_send_wr ipoib_cm_rx_drain_wr = { - .wr_id = IPOIB_CM_RX_DRAIN_WRID, .opcode = IB_WR_SEND, }; @@ -223,6 +222,7 @@ static void ipoib_cm_start_rx_drain(struct ipoib_dev_priv *priv) * error" WC will be immediately generated for each WR we post. 
*/ p = list_entry(priv->cm.rx_flush_list.next, typeof(*p), list); + ipoib_cm_rx_drain_wr.wr_id = IPOIB_CM_RX_DRAIN_WRID; if (ib_post_send(p->qp, &ipoib_cm_rx_drain_wr, &bad_wr)) ipoib_warn(priv, "failed to post drain wr\n"); @@ -1522,8 +1522,7 @@ static void ipoib_cm_create_srq(struct net_device *dev, int max_sge) int ipoib_cm_dev_init(struct net_device *dev) { struct ipoib_dev_priv *priv = netdev_priv(dev); - int i, ret; - struct ib_device_attr attr; + int max_srq_sge, i; INIT_LIST_HEAD(&priv->cm.passive_ids); INIT_LIST_HEAD(&priv->cm.reap_list); @@ -1540,19 +1539,13 @@ int ipoib_cm_dev_init(struct net_device *dev) skb_queue_head_init(&priv->cm.skb_queue); - ret = ib_query_device(priv->ca, &attr); - if (ret) { - printk(KERN_WARNING "ib_query_device() failed with %d\n", ret); - return ret; - } - - ipoib_dbg(priv, "max_srq_sge=%d\n", attr.max_srq_sge); + ipoib_dbg(priv, "max_srq_sge=%d\n", priv->ca->attrs.max_srq_sge); - attr.max_srq_sge = min_t(int, IPOIB_CM_RX_SG, attr.max_srq_sge); - ipoib_cm_create_srq(dev, attr.max_srq_sge); + max_srq_sge = min_t(int, IPOIB_CM_RX_SG, priv->ca->attrs.max_srq_sge); + ipoib_cm_create_srq(dev, max_srq_sge); if (ipoib_cm_has_srq(dev)) { - priv->cm.max_cm_mtu = attr.max_srq_sge * PAGE_SIZE - 0x10; - priv->cm.num_frags = attr.max_srq_sge; + priv->cm.max_cm_mtu = max_srq_sge * PAGE_SIZE - 0x10; + priv->cm.num_frags = max_srq_sge; ipoib_dbg(priv, "max_cm_mtu = 0x%x, num_frags=%d\n", priv->cm.max_cm_mtu, priv->cm.num_frags); } else { diff --git a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c index 078cadd6c797..a53fa5fc0dec 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_ethtool.c @@ -40,15 +40,11 @@ static void ipoib_get_drvinfo(struct net_device *netdev, struct ethtool_drvinfo *drvinfo) { struct ipoib_dev_priv *priv = netdev_priv(netdev); - struct ib_device_attr *attr; - - attr = kmalloc(sizeof(*attr), GFP_KERNEL); - if (attr && !ib_query_device(priv->ca, attr)) - snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), - "%d.%d.%d", (int)(attr->fw_ver >> 32), - (int)(attr->fw_ver >> 16) & 0xffff, - (int)attr->fw_ver & 0xffff); - kfree(attr); + + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), + "%d.%d.%d", (int)(priv->ca->attrs.fw_ver >> 32), + (int)(priv->ca->attrs.fw_ver >> 16) & 0xffff, + (int)priv->ca->attrs.fw_ver & 0xffff); strlcpy(drvinfo->bus_info, dev_name(priv->ca->dma_device), sizeof(drvinfo->bus_info)); diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c b/drivers/infiniband/ulp/ipoib/ipoib_main.c index 7d3281866ffc..25509bbd4a05 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c @@ -1150,8 +1150,6 @@ static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv) unsigned long flags; int i; LIST_HEAD(remove_list); - struct ipoib_mcast *mcast, *tmcast; - struct net_device *dev = priv->dev; if (test_bit(IPOIB_STOP_NEIGH_GC, &priv->flags)) return; @@ -1179,18 +1177,8 @@ static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv) lockdep_is_held(&priv->lock))) != NULL) { /* was the neigh idle for two GC periods */ if (time_after(neigh_obsolete, neigh->alive)) { - u8 *mgid = neigh->daddr + 4; - /* Is this multicast ? 
*/ - if (*mgid == 0xff) { - mcast = __ipoib_mcast_find(dev, mgid); - - if (mcast && test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { - list_del(&mcast->list); - rb_erase(&mcast->rb_node, &priv->multicast_tree); - list_add_tail(&mcast->list, &remove_list); - } - } + ipoib_check_and_add_mcast_sendonly(priv, neigh->daddr + 4, &remove_list); rcu_assign_pointer(*np, rcu_dereference_protected(neigh->hnext, @@ -1207,10 +1195,7 @@ static void __ipoib_reap_neigh(struct ipoib_dev_priv *priv) out_unlock: spin_unlock_irqrestore(&priv->lock, flags); - list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { - ipoib_mcast_leave(dev, mcast); - ipoib_mcast_free(mcast); - } + ipoib_mcast_remove_list(&remove_list); } static void ipoib_reap_neigh(struct work_struct *work) @@ -1777,26 +1762,7 @@ int ipoib_add_pkey_attr(struct net_device *dev) int ipoib_set_dev_features(struct ipoib_dev_priv *priv, struct ib_device *hca) { - struct ib_device_attr *device_attr; - int result = -ENOMEM; - - device_attr = kmalloc(sizeof *device_attr, GFP_KERNEL); - if (!device_attr) { - printk(KERN_WARNING "%s: allocation of %zu bytes failed\n", - hca->name, sizeof *device_attr); - return result; - } - - result = ib_query_device(hca, device_attr); - if (result) { - printk(KERN_WARNING "%s: ib_query_device failed (ret = %d)\n", - hca->name, result); - kfree(device_attr); - return result; - } - priv->hca_caps = device_attr->device_cap_flags; - - kfree(device_attr); + priv->hca_caps = hca->attrs.device_cap_flags; if (priv->hca_caps & IB_DEVICE_UD_IP_CSUM) { priv->dev->hw_features = NETIF_F_SG | diff --git a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c index f357ca67a41c..050dfa175d16 100644 --- a/drivers/infiniband/ulp/ipoib/ipoib_multicast.c +++ b/drivers/infiniband/ulp/ipoib/ipoib_multicast.c @@ -106,7 +106,7 @@ static void __ipoib_mcast_schedule_join_thread(struct ipoib_dev_priv *priv, queue_delayed_work(priv->wq, &priv->mcast_task, 0); } -void ipoib_mcast_free(struct ipoib_mcast *mcast) +static void ipoib_mcast_free(struct ipoib_mcast *mcast) { struct net_device *dev = mcast->dev; int tx_dropped = 0; @@ -153,7 +153,7 @@ static struct ipoib_mcast *ipoib_mcast_alloc(struct net_device *dev, return mcast; } -struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid) +static struct ipoib_mcast *__ipoib_mcast_find(struct net_device *dev, void *mgid) { struct ipoib_dev_priv *priv = netdev_priv(dev); struct rb_node *n = priv->multicast_tree.rb_node; @@ -677,7 +677,7 @@ int ipoib_mcast_stop_thread(struct net_device *dev) return 0; } -int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast) +static int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast) { struct ipoib_dev_priv *priv = netdev_priv(dev); int ret = 0; @@ -704,6 +704,35 @@ int ipoib_mcast_leave(struct net_device *dev, struct ipoib_mcast *mcast) return 0; } +/* + * Check if the multicast group is sendonly. If so remove it from the maps + * and add to the remove list + */ +void ipoib_check_and_add_mcast_sendonly(struct ipoib_dev_priv *priv, u8 *mgid, + struct list_head *remove_list) +{ + /* Is this multicast ? 
*/ + if (*mgid == 0xff) { + struct ipoib_mcast *mcast = __ipoib_mcast_find(priv->dev, mgid); + + if (mcast && test_bit(IPOIB_MCAST_FLAG_SENDONLY, &mcast->flags)) { + list_del(&mcast->list); + rb_erase(&mcast->rb_node, &priv->multicast_tree); + list_add_tail(&mcast->list, remove_list); + } + } +} + +void ipoib_mcast_remove_list(struct list_head *remove_list) +{ + struct ipoib_mcast *mcast, *tmcast; + + list_for_each_entry_safe(mcast, tmcast, remove_list, list) { + ipoib_mcast_leave(mcast->dev, mcast); + ipoib_mcast_free(mcast); + } +} + void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb) { struct ipoib_dev_priv *priv = netdev_priv(dev); @@ -810,10 +839,7 @@ void ipoib_mcast_dev_flush(struct net_device *dev) if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) wait_for_completion(&mcast->done); - list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { - ipoib_mcast_leave(dev, mcast); - ipoib_mcast_free(mcast); - } + ipoib_mcast_remove_list(&remove_list); } static int ipoib_mcast_addr_is_valid(const u8 *addr, const u8 *broadcast) @@ -939,10 +965,7 @@ void ipoib_mcast_restart_task(struct work_struct *work) if (test_bit(IPOIB_MCAST_FLAG_BUSY, &mcast->flags)) wait_for_completion(&mcast->done); - list_for_each_entry_safe(mcast, tmcast, &remove_list, list) { - ipoib_mcast_leave(mcast->dev, mcast); - ipoib_mcast_free(mcast); - } + ipoib_mcast_remove_list(&remove_list); /* * Double check that we are still up diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.c b/drivers/infiniband/ulp/iser/iscsi_iser.c index 9080161e01af..c827c93f46c5 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.c +++ b/drivers/infiniband/ulp/iser/iscsi_iser.c @@ -644,7 +644,7 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep, ib_conn = &iser_conn->ib_conn; if (ib_conn->pi_support) { - u32 sig_caps = ib_conn->device->dev_attr.sig_prot_cap; + u32 sig_caps = ib_conn->device->ib_device->attrs.sig_prot_cap; scsi_host_set_prot(shost, iser_dif_prot_caps(sig_caps)); scsi_host_set_guard(shost, SHOST_DIX_GUARD_IP | @@ -656,7 +656,7 @@ iscsi_iser_session_create(struct iscsi_endpoint *ep, * max fastreg page list length. 
*/ shost->sg_tablesize = min_t(unsigned short, shost->sg_tablesize, - ib_conn->device->dev_attr.max_fast_reg_page_list_len); + ib_conn->device->ib_device->attrs.max_fast_reg_page_list_len); shost->max_sectors = min_t(unsigned int, 1024, (shost->sg_tablesize * PAGE_SIZE) >> 9); @@ -1059,7 +1059,8 @@ static int __init iser_init(void) release_wq = alloc_workqueue("release workqueue", 0, 0); if (!release_wq) { iser_err("failed to allocate release workqueue\n"); - return -ENOMEM; + err = -ENOMEM; + goto err_alloc_wq; } iscsi_iser_scsi_transport = iscsi_register_transport( @@ -1067,12 +1068,14 @@ static int __init iser_init(void) if (!iscsi_iser_scsi_transport) { iser_err("iscsi_register_transport failed\n"); err = -EINVAL; - goto register_transport_failure; + goto err_reg; } return 0; -register_transport_failure: +err_reg: + destroy_workqueue(release_wq); +err_alloc_wq: kmem_cache_destroy(ig.desc_cache); return err; diff --git a/drivers/infiniband/ulp/iser/iscsi_iser.h b/drivers/infiniband/ulp/iser/iscsi_iser.h index 8a5998e6a407..95f0a64e076b 100644 --- a/drivers/infiniband/ulp/iser/iscsi_iser.h +++ b/drivers/infiniband/ulp/iser/iscsi_iser.h @@ -48,6 +48,7 @@ #include <scsi/scsi_transport_iscsi.h> #include <scsi/scsi_cmnd.h> #include <scsi/scsi_device.h> +#include <scsi/iser.h> #include <linux/interrupt.h> #include <linux/wait.h> @@ -151,46 +152,10 @@ - ISER_MAX_RX_MISC_PDUS) / \ (1 + ISER_INFLIGHT_DATAOUTS)) -#define ISER_WC_BATCH_COUNT 16 #define ISER_SIGNAL_CMD_COUNT 32 -#define ISER_VER 0x10 -#define ISER_WSV 0x08 -#define ISER_RSV 0x04 - -#define ISER_FASTREG_LI_WRID 0xffffffffffffffffULL -#define ISER_BEACON_WRID 0xfffffffffffffffeULL - -/** - * struct iser_hdr - iSER header - * - * @flags: flags support (zbva, remote_inv) - * @rsvd: reserved - * @write_stag: write rkey - * @write_va: write virtual address - * @reaf_stag: read rkey - * @read_va: read virtual address - */ -struct iser_hdr { - u8 flags; - u8 rsvd[3]; - __be32 write_stag; - __be64 write_va; - __be32 read_stag; - __be64 read_va; -} __attribute__((packed)); - - -#define ISER_ZBVA_NOT_SUPPORTED 0x80 -#define ISER_SEND_W_INV_NOT_SUPPORTED 0x40 - -struct iser_cm_hdr { - u8 flags; - u8 rsvd[3]; -} __packed; - /* Constant PDU lengths calculations */ -#define ISER_HEADERS_LEN (sizeof(struct iser_hdr) + sizeof(struct iscsi_hdr)) +#define ISER_HEADERS_LEN (sizeof(struct iser_ctrl) + sizeof(struct iscsi_hdr)) #define ISER_RECV_DATA_SEG_LEN 128 #define ISER_RX_PAYLOAD_SIZE (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN) @@ -269,7 +234,7 @@ enum iser_desc_type { #define ISER_MAX_WRS 7 /** - * struct iser_tx_desc - iSER TX descriptor (for send wr_id) + * struct iser_tx_desc - iSER TX descriptor * * @iser_header: iser header * @iscsi_header: iscsi header @@ -287,12 +252,13 @@ enum iser_desc_type { * @sig_attrs: Signature attributes */ struct iser_tx_desc { - struct iser_hdr iser_header; + struct iser_ctrl iser_header; struct iscsi_hdr iscsi_header; enum iser_desc_type type; u64 dma_addr; struct ib_sge tx_sg[2]; int num_sge; + struct ib_cqe cqe; bool mapped; u8 wr_idx; union iser_wr { @@ -306,9 +272,10 @@ struct iser_tx_desc { }; #define ISER_RX_PAD_SIZE (256 - (ISER_RX_PAYLOAD_SIZE + \ - sizeof(u64) + sizeof(struct ib_sge))) + sizeof(u64) + sizeof(struct ib_sge) + \ + sizeof(struct ib_cqe))) /** - * struct iser_rx_desc - iSER RX descriptor (for recv wr_id) + * struct iser_rx_desc - iSER RX descriptor * * @iser_header: iser header * @iscsi_header: iscsi header @@ -318,12 +285,32 @@ struct iser_tx_desc { * @pad: for sense data TODO: Modify to 
maximum sense length supported */ struct iser_rx_desc { - struct iser_hdr iser_header; + struct iser_ctrl iser_header; struct iscsi_hdr iscsi_header; char data[ISER_RECV_DATA_SEG_LEN]; u64 dma_addr; struct ib_sge rx_sg; + struct ib_cqe cqe; char pad[ISER_RX_PAD_SIZE]; +} __packed; + +/** + * struct iser_login_desc - iSER login descriptor + * + * @req: pointer to login request buffer + * @resp: pointer to login response buffer + * @req_dma: DMA address of login request buffer + * @rsp_dma: DMA address of login response buffer + * @sge: IB sge for login post recv + * @cqe: completion handler + */ +struct iser_login_desc { + void *req; + void *rsp; + u64 req_dma; + u64 rsp_dma; + struct ib_sge sge; + struct ib_cqe cqe; } __attribute__((packed)); struct iser_conn; @@ -333,18 +320,12 @@ struct iscsi_iser_task; /** * struct iser_comp - iSER completion context * - * @device: pointer to device handle * @cq: completion queue - * @wcs: work completion array - * @tasklet: Tasklet handle * @active_qps: Number of active QPs attached * to completion context */ struct iser_comp { - struct iser_device *device; struct ib_cq *cq; - struct ib_wc wcs[ISER_WC_BATCH_COUNT]; - struct tasklet_struct tasklet; int active_qps; }; @@ -380,7 +361,6 @@ struct iser_reg_ops { * * @ib_device: RDMA device * @pd: Protection Domain for this device - * @dev_attr: Device attributes container * @mr: Global DMA memory region * @event_handler: IB events handle routine * @ig_list: entry in devices list @@ -389,18 +369,19 @@ struct iser_reg_ops { * cpus and device max completion vectors * @comps: Dinamically allocated array of completion handlers * @reg_ops: Registration ops + * @remote_inv_sup: Remote invalidate is supported on this device */ struct iser_device { struct ib_device *ib_device; struct ib_pd *pd; - struct ib_device_attr dev_attr; struct ib_mr *mr; struct ib_event_handler event_handler; struct list_head ig_list; int refcount; int comps_used; struct iser_comp *comps; - struct iser_reg_ops *reg_ops; + const struct iser_reg_ops *reg_ops; + bool remote_inv_sup; }; #define ISER_CHECK_GUARD 0xc0 @@ -475,10 +456,11 @@ struct iser_fr_pool { * @rx_wr: receive work request for batch posts * @device: reference to iser device * @comp: iser completion context - * @pi_support: Indicate device T10-PI support - * @beacon: beacon send wr to signal all flush errors were drained - * @flush_comp: completes when all connection completions consumed * @fr_pool: connection fast registration poool + * @pi_support: Indicate device T10-PI support + * @last: last send wr to signal all flush errors were drained + * @last_cqe: cqe handler for last wr + * @last_comp: completes when all connection completions consumed */ struct ib_conn { struct rdma_cm_id *cma_id; @@ -488,10 +470,12 @@ struct ib_conn { struct ib_recv_wr rx_wr[ISER_MIN_POSTED_RX]; struct iser_device *device; struct iser_comp *comp; - bool pi_support; - struct ib_send_wr beacon; - struct completion flush_comp; struct iser_fr_pool fr_pool; + bool pi_support; + struct ib_send_wr last; + struct ib_cqe last_cqe; + struct ib_cqe reg_cqe; + struct completion last_comp; }; /** @@ -514,11 +498,7 @@ struct ib_conn { * @up_completion: connection establishment completed * (state is ISER_CONN_UP) * @conn_list: entry in ig conn list - * @login_buf: login data buffer (stores login parameters) - * @login_req_buf: login request buffer - * @login_req_dma: login request buffer dma address - * @login_resp_buf: login response buffer - * @login_resp_dma: login response buffer dma address + * @login_desc: 
login descriptor * @rx_desc_head: head of rx_descs cyclic buffer * @rx_descs: rx buffers array (cyclic buffer) * @num_rx_descs: number of rx descriptors @@ -541,15 +521,13 @@ struct iser_conn { struct completion ib_completion; struct completion up_completion; struct list_head conn_list; - - char *login_buf; - char *login_req_buf, *login_resp_buf; - u64 login_req_dma, login_resp_dma; + struct iser_login_desc login_desc; unsigned int rx_desc_head; struct iser_rx_desc *rx_descs; u32 num_rx_descs; unsigned short scsi_sg_tablesize; unsigned int scsi_max_sectors; + bool snd_w_inv; }; /** @@ -579,9 +557,8 @@ struct iscsi_iser_task { struct iser_page_vec { u64 *pages; - int length; - int offset; - int data_size; + int npages; + struct ib_mr fake_mr; }; /** @@ -633,12 +610,14 @@ int iser_conn_terminate(struct iser_conn *iser_conn); void iser_release_work(struct work_struct *work); -void iser_rcv_completion(struct iser_rx_desc *desc, - unsigned long dto_xfer_len, - struct ib_conn *ib_conn); - -void iser_snd_completion(struct iser_tx_desc *desc, - struct ib_conn *ib_conn); +void iser_err_comp(struct ib_wc *wc, const char *type); +void iser_login_rsp(struct ib_cq *cq, struct ib_wc *wc); +void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc); +void iser_cmd_comp(struct ib_cq *cq, struct ib_wc *wc); +void iser_ctrl_comp(struct ib_cq *cq, struct ib_wc *wc); +void iser_dataout_comp(struct ib_cq *cq, struct ib_wc *wc); +void iser_reg_comp(struct ib_cq *cq, struct ib_wc *wc); +void iser_last_comp(struct ib_cq *cq, struct ib_wc *wc); void iser_task_rdma_init(struct iscsi_iser_task *task); @@ -651,7 +630,8 @@ void iser_finalize_rdma_unaligned_sg(struct iscsi_iser_task *iser_task, enum iser_data_dir cmd_dir); int iser_reg_rdma_mem(struct iscsi_iser_task *task, - enum iser_data_dir dir); + enum iser_data_dir dir, + bool all_imm); void iser_unreg_rdma_mem(struct iscsi_iser_task *task, enum iser_data_dir dir); @@ -719,4 +699,28 @@ iser_tx_next_wr(struct iser_tx_desc *tx_desc) return cur_wr; } +static inline struct iser_conn * +to_iser_conn(struct ib_conn *ib_conn) +{ + return container_of(ib_conn, struct iser_conn, ib_conn); +} + +static inline struct iser_rx_desc * +iser_rx(struct ib_cqe *cqe) +{ + return container_of(cqe, struct iser_rx_desc, cqe); +} + +static inline struct iser_tx_desc * +iser_tx(struct ib_cqe *cqe) +{ + return container_of(cqe, struct iser_tx_desc, cqe); +} + +static inline struct iser_login_desc * +iser_login(struct ib_cqe *cqe) +{ + return container_of(cqe, struct iser_login_desc, cqe); +} + #endif diff --git a/drivers/infiniband/ulp/iser/iser_initiator.c b/drivers/infiniband/ulp/iser/iser_initiator.c index ffd00c420729..ed54b388e7ad 100644 --- a/drivers/infiniband/ulp/iser/iser_initiator.c +++ b/drivers/infiniband/ulp/iser/iser_initiator.c @@ -51,7 +51,7 @@ static int iser_prepare_read_cmd(struct iscsi_task *task) struct iscsi_iser_task *iser_task = task->dd_data; struct iser_mem_reg *mem_reg; int err; - struct iser_hdr *hdr = &iser_task->desc.iser_header; + struct iser_ctrl *hdr = &iser_task->desc.iser_header; struct iser_data_buf *buf_in = &iser_task->data[ISER_DIR_IN]; err = iser_dma_map_task_data(iser_task, @@ -72,7 +72,7 @@ static int iser_prepare_read_cmd(struct iscsi_task *task) return err; } - err = iser_reg_rdma_mem(iser_task, ISER_DIR_IN); + err = iser_reg_rdma_mem(iser_task, ISER_DIR_IN, false); if (err) { iser_err("Failed to set up Data-IN RDMA\n"); return err; @@ -104,7 +104,7 @@ iser_prepare_write_cmd(struct iscsi_task *task, struct iscsi_iser_task *iser_task = 
task->dd_data; struct iser_mem_reg *mem_reg; int err; - struct iser_hdr *hdr = &iser_task->desc.iser_header; + struct iser_ctrl *hdr = &iser_task->desc.iser_header; struct iser_data_buf *buf_out = &iser_task->data[ISER_DIR_OUT]; struct ib_sge *tx_dsg = &iser_task->desc.tx_sg[1]; @@ -126,7 +126,8 @@ iser_prepare_write_cmd(struct iscsi_task *task, return err; } - err = iser_reg_rdma_mem(iser_task, ISER_DIR_OUT); + err = iser_reg_rdma_mem(iser_task, ISER_DIR_OUT, + buf_out->data_len == imm_sz); if (err != 0) { iser_err("Failed to register write cmd RDMA mem\n"); return err; @@ -166,7 +167,7 @@ static void iser_create_send_desc(struct iser_conn *iser_conn, ib_dma_sync_single_for_cpu(device->ib_device, tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE); - memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr)); + memset(&tx_desc->iser_header, 0, sizeof(struct iser_ctrl)); tx_desc->iser_header.flags = ISER_VER; tx_desc->num_sge = 1; } @@ -174,73 +175,63 @@ static void iser_create_send_desc(struct iser_conn *iser_conn, static void iser_free_login_buf(struct iser_conn *iser_conn) { struct iser_device *device = iser_conn->ib_conn.device; + struct iser_login_desc *desc = &iser_conn->login_desc; - if (!iser_conn->login_buf) + if (!desc->req) return; - if (iser_conn->login_req_dma) - ib_dma_unmap_single(device->ib_device, - iser_conn->login_req_dma, - ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE); + ib_dma_unmap_single(device->ib_device, desc->req_dma, + ISCSI_DEF_MAX_RECV_SEG_LEN, DMA_TO_DEVICE); - if (iser_conn->login_resp_dma) - ib_dma_unmap_single(device->ib_device, - iser_conn->login_resp_dma, - ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE); + ib_dma_unmap_single(device->ib_device, desc->rsp_dma, + ISER_RX_LOGIN_SIZE, DMA_FROM_DEVICE); - kfree(iser_conn->login_buf); + kfree(desc->req); + kfree(desc->rsp); /* make sure we never redo any unmapping */ - iser_conn->login_req_dma = 0; - iser_conn->login_resp_dma = 0; - iser_conn->login_buf = NULL; + desc->req = NULL; + desc->rsp = NULL; } static int iser_alloc_login_buf(struct iser_conn *iser_conn) { struct iser_device *device = iser_conn->ib_conn.device; - int req_err, resp_err; - - BUG_ON(device == NULL); - - iser_conn->login_buf = kmalloc(ISCSI_DEF_MAX_RECV_SEG_LEN + - ISER_RX_LOGIN_SIZE, GFP_KERNEL); - if (!iser_conn->login_buf) - goto out_err; - - iser_conn->login_req_buf = iser_conn->login_buf; - iser_conn->login_resp_buf = iser_conn->login_buf + - ISCSI_DEF_MAX_RECV_SEG_LEN; - - iser_conn->login_req_dma = ib_dma_map_single(device->ib_device, - iser_conn->login_req_buf, - ISCSI_DEF_MAX_RECV_SEG_LEN, - DMA_TO_DEVICE); - - iser_conn->login_resp_dma = ib_dma_map_single(device->ib_device, - iser_conn->login_resp_buf, - ISER_RX_LOGIN_SIZE, - DMA_FROM_DEVICE); - - req_err = ib_dma_mapping_error(device->ib_device, - iser_conn->login_req_dma); - resp_err = ib_dma_mapping_error(device->ib_device, - iser_conn->login_resp_dma); - - if (req_err || resp_err) { - if (req_err) - iser_conn->login_req_dma = 0; - if (resp_err) - iser_conn->login_resp_dma = 0; - goto free_login_buf; - } + struct iser_login_desc *desc = &iser_conn->login_desc; + + desc->req = kmalloc(ISCSI_DEF_MAX_RECV_SEG_LEN, GFP_KERNEL); + if (!desc->req) + return -ENOMEM; + + desc->req_dma = ib_dma_map_single(device->ib_device, desc->req, + ISCSI_DEF_MAX_RECV_SEG_LEN, + DMA_TO_DEVICE); + if (ib_dma_mapping_error(device->ib_device, + desc->req_dma)) + goto free_req; + + desc->rsp = kmalloc(ISER_RX_LOGIN_SIZE, GFP_KERNEL); + if (!desc->rsp) + goto unmap_req; + + desc->rsp_dma = 
ib_dma_map_single(device->ib_device, desc->rsp, + ISER_RX_LOGIN_SIZE, + DMA_FROM_DEVICE); + if (ib_dma_mapping_error(device->ib_device, + desc->rsp_dma)) + goto free_rsp; + return 0; -free_login_buf: - iser_free_login_buf(iser_conn); +free_rsp: + kfree(desc->rsp); +unmap_req: + ib_dma_unmap_single(device->ib_device, desc->req_dma, + ISCSI_DEF_MAX_RECV_SEG_LEN, + DMA_TO_DEVICE); +free_req: + kfree(desc->req); -out_err: - iser_err("unable to alloc or map login buf\n"); return -ENOMEM; } @@ -280,11 +271,11 @@ int iser_alloc_rx_descriptors(struct iser_conn *iser_conn, goto rx_desc_dma_map_failed; rx_desc->dma_addr = dma_addr; - + rx_desc->cqe.done = iser_task_rsp; rx_sg = &rx_desc->rx_sg; - rx_sg->addr = rx_desc->dma_addr; + rx_sg->addr = rx_desc->dma_addr; rx_sg->length = ISER_RX_PAYLOAD_SIZE; - rx_sg->lkey = device->pd->local_dma_lkey; + rx_sg->lkey = device->pd->local_dma_lkey; } iser_conn->rx_desc_head = 0; @@ -383,6 +374,7 @@ int iser_send_command(struct iscsi_conn *conn, /* build the tx desc regd header and add it to the tx desc dto */ tx_desc->type = ISCSI_TX_SCSI_COMMAND; + tx_desc->cqe.done = iser_cmd_comp; iser_create_send_desc(iser_conn, tx_desc); if (hdr->flags & ISCSI_FLAG_CMD_READ) { @@ -464,6 +456,7 @@ int iser_send_data_out(struct iscsi_conn *conn, } tx_desc->type = ISCSI_TX_DATAOUT; + tx_desc->cqe.done = iser_dataout_comp; tx_desc->iser_header.flags = ISER_VER; memcpy(&tx_desc->iscsi_header, hdr, sizeof(struct iscsi_hdr)); @@ -513,6 +506,7 @@ int iser_send_control(struct iscsi_conn *conn, /* build the tx desc regd header and add it to the tx desc dto */ mdesc->type = ISCSI_TX_CONTROL; + mdesc->cqe.done = iser_ctrl_comp; iser_create_send_desc(iser_conn, mdesc); device = iser_conn->ib_conn.device; @@ -520,25 +514,25 @@ int iser_send_control(struct iscsi_conn *conn, data_seg_len = ntoh24(task->hdr->dlength); if (data_seg_len > 0) { + struct iser_login_desc *desc = &iser_conn->login_desc; struct ib_sge *tx_dsg = &mdesc->tx_sg[1]; + if (task != conn->login_task) { iser_err("data present on non login task!!!\n"); goto send_control_error; } - ib_dma_sync_single_for_cpu(device->ib_device, - iser_conn->login_req_dma, task->data_count, - DMA_TO_DEVICE); + ib_dma_sync_single_for_cpu(device->ib_device, desc->req_dma, + task->data_count, DMA_TO_DEVICE); - memcpy(iser_conn->login_req_buf, task->data, task->data_count); + memcpy(desc->req, task->data, task->data_count); - ib_dma_sync_single_for_device(device->ib_device, - iser_conn->login_req_dma, task->data_count, - DMA_TO_DEVICE); + ib_dma_sync_single_for_device(device->ib_device, desc->req_dma, + task->data_count, DMA_TO_DEVICE); - tx_dsg->addr = iser_conn->login_req_dma; - tx_dsg->length = task->data_count; - tx_dsg->lkey = device->pd->local_dma_lkey; + tx_dsg->addr = desc->req_dma; + tx_dsg->length = task->data_count; + tx_dsg->lkey = device->pd->local_dma_lkey; mdesc->num_sge = 2; } @@ -562,41 +556,126 @@ send_control_error: return err; } -/** - * iser_rcv_dto_completion - recv DTO completion - */ -void iser_rcv_completion(struct iser_rx_desc *rx_desc, - unsigned long rx_xfer_len, - struct ib_conn *ib_conn) +void iser_login_rsp(struct ib_cq *cq, struct ib_wc *wc) { - struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn, - ib_conn); + struct ib_conn *ib_conn = wc->qp->qp_context; + struct iser_conn *iser_conn = to_iser_conn(ib_conn); + struct iser_login_desc *desc = iser_login(wc->wr_cqe); struct iscsi_hdr *hdr; - u64 rx_dma; - int rx_buflen, outstanding, count, err; + char *data; + int length; - /* differentiate between 
login to all other PDUs */ - if ((char *)rx_desc == iser_conn->login_resp_buf) { - rx_dma = iser_conn->login_resp_dma; - rx_buflen = ISER_RX_LOGIN_SIZE; - } else { - rx_dma = rx_desc->dma_addr; - rx_buflen = ISER_RX_PAYLOAD_SIZE; + if (unlikely(wc->status != IB_WC_SUCCESS)) { + iser_err_comp(wc, "login_rsp"); + return; + } + + ib_dma_sync_single_for_cpu(ib_conn->device->ib_device, + desc->rsp_dma, ISER_RX_LOGIN_SIZE, + DMA_FROM_DEVICE); + + hdr = desc->rsp + sizeof(struct iser_ctrl); + data = desc->rsp + ISER_HEADERS_LEN; + length = wc->byte_len - ISER_HEADERS_LEN; + + iser_dbg("op 0x%x itt 0x%x dlen %d\n", hdr->opcode, + hdr->itt, length); + + iscsi_iser_recv(iser_conn->iscsi_conn, hdr, data, length); + + ib_dma_sync_single_for_device(ib_conn->device->ib_device, + desc->rsp_dma, ISER_RX_LOGIN_SIZE, + DMA_FROM_DEVICE); + + ib_conn->post_recv_buf_count--; +} + +static inline void +iser_inv_desc(struct iser_fr_desc *desc, u32 rkey) +{ + if (likely(rkey == desc->rsc.mr->rkey)) + desc->rsc.mr_valid = 0; + else if (likely(rkey == desc->pi_ctx->sig_mr->rkey)) + desc->pi_ctx->sig_mr_valid = 0; +} + +static int +iser_check_remote_inv(struct iser_conn *iser_conn, + struct ib_wc *wc, + struct iscsi_hdr *hdr) +{ + if (wc->wc_flags & IB_WC_WITH_INVALIDATE) { + struct iscsi_task *task; + u32 rkey = wc->ex.invalidate_rkey; + + iser_dbg("conn %p: remote invalidation for rkey %#x\n", + iser_conn, rkey); + + if (unlikely(!iser_conn->snd_w_inv)) { + iser_err("conn %p: unexepected remote invalidation, " + "terminating connection\n", iser_conn); + return -EPROTO; + } + + task = iscsi_itt_to_ctask(iser_conn->iscsi_conn, hdr->itt); + if (likely(task)) { + struct iscsi_iser_task *iser_task = task->dd_data; + struct iser_fr_desc *desc; + + if (iser_task->dir[ISER_DIR_IN]) { + desc = iser_task->rdma_reg[ISER_DIR_IN].mem_h; + iser_inv_desc(desc, rkey); + } + + if (iser_task->dir[ISER_DIR_OUT]) { + desc = iser_task->rdma_reg[ISER_DIR_OUT].mem_h; + iser_inv_desc(desc, rkey); + } + } else { + iser_err("failed to get task for itt=%d\n", hdr->itt); + return -EINVAL; + } } - ib_dma_sync_single_for_cpu(ib_conn->device->ib_device, rx_dma, - rx_buflen, DMA_FROM_DEVICE); + return 0; +} - hdr = &rx_desc->iscsi_header; + +void iser_task_rsp(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_conn *ib_conn = wc->qp->qp_context; + struct iser_conn *iser_conn = to_iser_conn(ib_conn); + struct iser_rx_desc *desc = iser_rx(wc->wr_cqe); + struct iscsi_hdr *hdr; + int length; + int outstanding, count, err; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + iser_err_comp(wc, "task_rsp"); + return; + } + + ib_dma_sync_single_for_cpu(ib_conn->device->ib_device, + desc->dma_addr, ISER_RX_PAYLOAD_SIZE, + DMA_FROM_DEVICE); + + hdr = &desc->iscsi_header; + length = wc->byte_len - ISER_HEADERS_LEN; iser_dbg("op 0x%x itt 0x%x dlen %d\n", hdr->opcode, - hdr->itt, (int)(rx_xfer_len - ISER_HEADERS_LEN)); + hdr->itt, length); + + if (iser_check_remote_inv(iser_conn, wc, hdr)) { + iscsi_conn_failure(iser_conn->iscsi_conn, + ISCSI_ERR_CONN_FAILED); + return; + } - iscsi_iser_recv(iser_conn->iscsi_conn, hdr, rx_desc->data, - rx_xfer_len - ISER_HEADERS_LEN); + iscsi_iser_recv(iser_conn->iscsi_conn, hdr, desc->data, length); - ib_dma_sync_single_for_device(ib_conn->device->ib_device, rx_dma, - rx_buflen, DMA_FROM_DEVICE); + ib_dma_sync_single_for_device(ib_conn->device->ib_device, + desc->dma_addr, ISER_RX_PAYLOAD_SIZE, + DMA_FROM_DEVICE); /* decrementing conn->post_recv_buf_count only --after-- freeing the * * task eliminates the need to worry on 
tasks which are completed in * @@ -604,9 +683,6 @@ void iser_rcv_completion(struct iser_rx_desc *rx_desc, * for the posted rx bufs refcount to become zero handles everything */ ib_conn->post_recv_buf_count--; - if (rx_dma == iser_conn->login_resp_dma) - return; - outstanding = ib_conn->post_recv_buf_count; if (outstanding + iser_conn->min_posted_rx <= iser_conn->qp_max_recv_dtos) { count = min(iser_conn->qp_max_recv_dtos - outstanding, @@ -617,26 +693,47 @@ void iser_rcv_completion(struct iser_rx_desc *rx_desc, } } -void iser_snd_completion(struct iser_tx_desc *tx_desc, - struct ib_conn *ib_conn) +void iser_cmd_comp(struct ib_cq *cq, struct ib_wc *wc) { + if (unlikely(wc->status != IB_WC_SUCCESS)) + iser_err_comp(wc, "command"); +} + +void iser_ctrl_comp(struct ib_cq *cq, struct ib_wc *wc) +{ + struct iser_tx_desc *desc = iser_tx(wc->wr_cqe); struct iscsi_task *task; - struct iser_device *device = ib_conn->device; - if (tx_desc->type == ISCSI_TX_DATAOUT) { - ib_dma_unmap_single(device->ib_device, tx_desc->dma_addr, - ISER_HEADERS_LEN, DMA_TO_DEVICE); - kmem_cache_free(ig.desc_cache, tx_desc); - tx_desc = NULL; + if (unlikely(wc->status != IB_WC_SUCCESS)) { + iser_err_comp(wc, "control"); + return; } - if (tx_desc && tx_desc->type == ISCSI_TX_CONTROL) { - /* this arithmetic is legal by libiscsi dd_data allocation */ - task = (void *) ((long)(void *)tx_desc - - sizeof(struct iscsi_task)); - if (task->hdr->itt == RESERVED_ITT) - iscsi_put_task(task); - } + /* this arithmetic is legal by libiscsi dd_data allocation */ + task = (void *)desc - sizeof(struct iscsi_task); + if (task->hdr->itt == RESERVED_ITT) + iscsi_put_task(task); +} + +void iser_dataout_comp(struct ib_cq *cq, struct ib_wc *wc) +{ + struct iser_tx_desc *desc = iser_tx(wc->wr_cqe); + struct ib_conn *ib_conn = wc->qp->qp_context; + struct iser_device *device = ib_conn->device; + + if (unlikely(wc->status != IB_WC_SUCCESS)) + iser_err_comp(wc, "dataout"); + + ib_dma_unmap_single(device->ib_device, desc->dma_addr, + ISER_HEADERS_LEN, DMA_TO_DEVICE); + kmem_cache_free(ig.desc_cache, desc); +} + +void iser_last_comp(struct ib_cq *cq, struct ib_wc *wc) +{ + struct ib_conn *ib_conn = wc->qp->qp_context; + + complete(&ib_conn->last_comp); } void iser_task_rdma_init(struct iscsi_iser_task *iser_task) diff --git a/drivers/infiniband/ulp/iser/iser_memory.c b/drivers/infiniband/ulp/iser/iser_memory.c index ea765fb9664d..9a391cc5b9b3 100644 --- a/drivers/infiniband/ulp/iser/iser_memory.c +++ b/drivers/infiniband/ulp/iser/iser_memory.c @@ -49,7 +49,7 @@ int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, struct iser_reg_resources *rsc, struct iser_mem_reg *mem_reg); -static struct iser_reg_ops fastreg_ops = { +static const struct iser_reg_ops fastreg_ops = { .alloc_reg_res = iser_alloc_fastreg_pool, .free_reg_res = iser_free_fastreg_pool, .reg_mem = iser_fast_reg_mr, @@ -58,7 +58,7 @@ static struct iser_reg_ops fastreg_ops = { .reg_desc_put = iser_reg_desc_put_fr, }; -static struct iser_reg_ops fmr_ops = { +static const struct iser_reg_ops fmr_ops = { .alloc_reg_res = iser_alloc_fmr_pool, .free_reg_res = iser_free_fmr_pool, .reg_mem = iser_fast_reg_fmr, @@ -67,19 +67,24 @@ static struct iser_reg_ops fmr_ops = { .reg_desc_put = iser_reg_desc_put_fmr, }; +void iser_reg_comp(struct ib_cq *cq, struct ib_wc *wc) +{ + iser_err_comp(wc, "memreg"); +} + int iser_assign_reg_ops(struct iser_device *device) { - struct ib_device_attr *dev_attr = &device->dev_attr; + struct ib_device *ib_dev = device->ib_device; /* Assign function handles - based on 
FMR support */ - if (device->ib_device->alloc_fmr && device->ib_device->dealloc_fmr && - device->ib_device->map_phys_fmr && device->ib_device->unmap_fmr) { + if (ib_dev->alloc_fmr && ib_dev->dealloc_fmr && + ib_dev->map_phys_fmr && ib_dev->unmap_fmr) { iser_info("FMR supported, using FMR for registration\n"); device->reg_ops = &fmr_ops; - } else - if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { + } else if (ib_dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { iser_info("FastReg supported, using FastReg for registration\n"); device->reg_ops = &fastreg_ops; + device->remote_inv_sup = iser_always_reg; } else { iser_err("IB device does not support FMRs nor FastRegs, can't register memory\n"); return -1; @@ -131,67 +136,6 @@ iser_reg_desc_put_fmr(struct ib_conn *ib_conn, { } -#define IS_4K_ALIGNED(addr) ((((unsigned long)addr) & ~MASK_4K) == 0) - -/** - * iser_sg_to_page_vec - Translates scatterlist entries to physical addresses - * and returns the length of resulting physical address array (may be less than - * the original due to possible compaction). - * - * we build a "page vec" under the assumption that the SG meets the RDMA - * alignment requirements. Other then the first and last SG elements, all - * the "internal" elements can be compacted into a list whose elements are - * dma addresses of physical pages. The code supports also the weird case - * where --few fragments of the same page-- are present in the SG as - * consecutive elements. Also, it handles one entry SG. - */ - -static int iser_sg_to_page_vec(struct iser_data_buf *data, - struct ib_device *ibdev, u64 *pages, - int *offset, int *data_size) -{ - struct scatterlist *sg, *sgl = data->sg; - u64 start_addr, end_addr, page, chunk_start = 0; - unsigned long total_sz = 0; - unsigned int dma_len; - int i, new_chunk, cur_page, last_ent = data->dma_nents - 1; - - /* compute the offset of first element */ - *offset = (u64) sgl[0].offset & ~MASK_4K; - - new_chunk = 1; - cur_page = 0; - for_each_sg(sgl, sg, data->dma_nents, i) { - start_addr = ib_sg_dma_address(ibdev, sg); - if (new_chunk) - chunk_start = start_addr; - dma_len = ib_sg_dma_len(ibdev, sg); - end_addr = start_addr + dma_len; - total_sz += dma_len; - - /* collect page fragments until aligned or end of SG list */ - if (!IS_4K_ALIGNED(end_addr) && i < last_ent) { - new_chunk = 0; - continue; - } - new_chunk = 1; - - /* address of the first page in the contiguous chunk; - masking relevant for the very first SG entry, - which might be unaligned */ - page = chunk_start & MASK_4K; - do { - pages[cur_page++] = page; - page += SIZE_4K; - } while (page < end_addr); - } - - *data_size = total_sz; - iser_dbg("page_vec->data_size:%d cur_page %d\n", - *data_size, cur_page); - return cur_page; -} - static void iser_data_buf_dump(struct iser_data_buf *data, struct ib_device *ibdev) { @@ -210,10 +154,10 @@ static void iser_dump_page_vec(struct iser_page_vec *page_vec) { int i; - iser_err("page vec length %d data size %d\n", - page_vec->length, page_vec->data_size); - for (i = 0; i < page_vec->length; i++) - iser_err("%d %lx\n",i,(unsigned long)page_vec->pages[i]); + iser_err("page vec npages %d data length %d\n", + page_vec->npages, page_vec->fake_mr.length); + for (i = 0; i < page_vec->npages; i++) + iser_err("vec[%d]: %llx\n", i, page_vec->pages[i]); } int iser_dma_map_task_data(struct iscsi_iser_task *iser_task, @@ -251,7 +195,11 @@ iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem, struct scatterlist *sg = mem->sg; reg->sge.lkey = 
device->pd->local_dma_lkey; - reg->rkey = device->mr->rkey; + /* + * FIXME: rework the registration code path to differentiate + * rkey/lkey use cases + */ + reg->rkey = device->mr ? device->mr->rkey : 0; reg->sge.addr = ib_sg_dma_address(device->ib_device, &sg[0]); reg->sge.length = ib_sg_dma_len(device->ib_device, &sg[0]); @@ -262,11 +210,16 @@ iser_reg_dma(struct iser_device *device, struct iser_data_buf *mem, return 0; } -/** - * iser_reg_page_vec - Register physical memory - * - * returns: 0 on success, errno code on failure - */ +static int iser_set_page(struct ib_mr *mr, u64 addr) +{ + struct iser_page_vec *page_vec = + container_of(mr, struct iser_page_vec, fake_mr); + + page_vec->pages[page_vec->npages++] = addr; + + return 0; +} + static int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task, struct iser_data_buf *mem, @@ -280,22 +233,19 @@ int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task, struct ib_pool_fmr *fmr; int ret, plen; - plen = iser_sg_to_page_vec(mem, device->ib_device, - page_vec->pages, - &page_vec->offset, - &page_vec->data_size); - page_vec->length = plen; - if (plen * SIZE_4K < page_vec->data_size) { + page_vec->npages = 0; + page_vec->fake_mr.page_size = SIZE_4K; + plen = ib_sg_to_pages(&page_vec->fake_mr, mem->sg, + mem->size, iser_set_page); + if (unlikely(plen < mem->size)) { iser_err("page vec too short to hold this SG\n"); iser_data_buf_dump(mem, device->ib_device); iser_dump_page_vec(page_vec); return -EINVAL; } - fmr = ib_fmr_pool_map_phys(fmr_pool, - page_vec->pages, - page_vec->length, - page_vec->pages[0]); + fmr = ib_fmr_pool_map_phys(fmr_pool, page_vec->pages, + page_vec->npages, page_vec->pages[0]); if (IS_ERR(fmr)) { ret = PTR_ERR(fmr); iser_err("ib_fmr_pool_map_phys failed: %d\n", ret); @@ -304,8 +254,8 @@ int iser_fast_reg_fmr(struct iscsi_iser_task *iser_task, reg->sge.lkey = fmr->fmr->lkey; reg->rkey = fmr->fmr->rkey; - reg->sge.addr = page_vec->pages[0] + page_vec->offset; - reg->sge.length = page_vec->data_size; + reg->sge.addr = page_vec->fake_mr.iova; + reg->sge.length = page_vec->fake_mr.length; reg->mem_h = fmr; iser_dbg("fmr reg: lkey=0x%x, rkey=0x%x, addr=0x%llx," @@ -413,19 +363,16 @@ iser_set_prot_checks(struct scsi_cmnd *sc, u8 *mask) *mask |= ISER_CHECK_GUARD; } -static void -iser_inv_rkey(struct ib_send_wr *inv_wr, struct ib_mr *mr) +static inline void +iser_inv_rkey(struct ib_send_wr *inv_wr, + struct ib_mr *mr, + struct ib_cqe *cqe) { - u32 rkey; - inv_wr->opcode = IB_WR_LOCAL_INV; - inv_wr->wr_id = ISER_FASTREG_LI_WRID; + inv_wr->wr_cqe = cqe; inv_wr->ex.invalidate_rkey = mr->rkey; inv_wr->send_flags = 0; inv_wr->num_sge = 0; - - rkey = ib_inc_rkey(mr->rkey); - ib_update_fast_reg_key(mr, rkey); } static int @@ -437,7 +384,9 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task, { struct iser_tx_desc *tx_desc = &iser_task->desc; struct ib_sig_attrs *sig_attrs = &tx_desc->sig_attrs; + struct ib_cqe *cqe = &iser_task->iser_conn->ib_conn.reg_cqe; struct ib_sig_handover_wr *wr; + struct ib_mr *mr = pi_ctx->sig_mr; int ret; memset(sig_attrs, 0, sizeof(*sig_attrs)); @@ -447,17 +396,19 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task, iser_set_prot_checks(iser_task->sc, &sig_attrs->check_mask); - if (!pi_ctx->sig_mr_valid) - iser_inv_rkey(iser_tx_next_wr(tx_desc), pi_ctx->sig_mr); + if (pi_ctx->sig_mr_valid) + iser_inv_rkey(iser_tx_next_wr(tx_desc), mr, cqe); + + ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey)); wr = sig_handover_wr(iser_tx_next_wr(tx_desc)); wr->wr.opcode = IB_WR_REG_SIG_MR; - wr->wr.wr_id = 
ISER_FASTREG_LI_WRID; + wr->wr.wr_cqe = cqe; wr->wr.sg_list = &data_reg->sge; wr->wr.num_sge = 1; wr->wr.send_flags = 0; wr->sig_attrs = sig_attrs; - wr->sig_mr = pi_ctx->sig_mr; + wr->sig_mr = mr; if (scsi_prot_sg_count(iser_task->sc)) wr->prot = &prot_reg->sge; else @@ -465,10 +416,10 @@ iser_reg_sig_mr(struct iscsi_iser_task *iser_task, wr->access_flags = IB_ACCESS_LOCAL_WRITE | IB_ACCESS_REMOTE_READ | IB_ACCESS_REMOTE_WRITE; - pi_ctx->sig_mr_valid = 0; + pi_ctx->sig_mr_valid = 1; - sig_reg->sge.lkey = pi_ctx->sig_mr->lkey; - sig_reg->rkey = pi_ctx->sig_mr->rkey; + sig_reg->sge.lkey = mr->lkey; + sig_reg->rkey = mr->rkey; sig_reg->sge.addr = 0; sig_reg->sge.length = scsi_transfer_length(iser_task->sc); @@ -485,12 +436,15 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, struct iser_mem_reg *reg) { struct iser_tx_desc *tx_desc = &iser_task->desc; + struct ib_cqe *cqe = &iser_task->iser_conn->ib_conn.reg_cqe; struct ib_mr *mr = rsc->mr; struct ib_reg_wr *wr; int n; - if (!rsc->mr_valid) - iser_inv_rkey(iser_tx_next_wr(tx_desc), mr); + if (rsc->mr_valid) + iser_inv_rkey(iser_tx_next_wr(tx_desc), mr, cqe); + + ib_update_fast_reg_key(mr, ib_inc_rkey(mr->rkey)); n = ib_map_mr_sg(mr, mem->sg, mem->size, SIZE_4K); if (unlikely(n != mem->size)) { @@ -501,7 +455,7 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, wr = reg_wr(iser_tx_next_wr(tx_desc)); wr->wr.opcode = IB_WR_REG_MR; - wr->wr.wr_id = ISER_FASTREG_LI_WRID; + wr->wr.wr_cqe = cqe; wr->wr.send_flags = 0; wr->wr.num_sge = 0; wr->mr = mr; @@ -510,7 +464,7 @@ static int iser_fast_reg_mr(struct iscsi_iser_task *iser_task, IB_ACCESS_REMOTE_WRITE | IB_ACCESS_REMOTE_READ; - rsc->mr_valid = 0; + rsc->mr_valid = 1; reg->sge.lkey = mr->lkey; reg->rkey = mr->rkey; @@ -554,7 +508,8 @@ iser_reg_data_sg(struct iscsi_iser_task *task, } int iser_reg_rdma_mem(struct iscsi_iser_task *task, - enum iser_data_dir dir) + enum iser_data_dir dir, + bool all_imm) { struct ib_conn *ib_conn = &task->iser_conn->ib_conn; struct iser_device *device = ib_conn->device; @@ -565,8 +520,8 @@ int iser_reg_rdma_mem(struct iscsi_iser_task *task, bool use_dma_key; int err; - use_dma_key = (mem->dma_nents == 1 && !iser_always_reg && - scsi_get_prot_op(task->sc) == SCSI_PROT_NORMAL); + use_dma_key = mem->dma_nents == 1 && (all_imm || !iser_always_reg) && + scsi_get_prot_op(task->sc) == SCSI_PROT_NORMAL; if (!use_dma_key) { desc = device->reg_ops->reg_desc_get(ib_conn); diff --git a/drivers/infiniband/ulp/iser/iser_verbs.c b/drivers/infiniband/ulp/iser/iser_verbs.c index 42f4da620f2e..40c0f4978e2f 100644 --- a/drivers/infiniband/ulp/iser/iser_verbs.c +++ b/drivers/infiniband/ulp/iser/iser_verbs.c @@ -44,17 +44,6 @@ #define ISER_MAX_CQ_LEN (ISER_MAX_RX_LEN + ISER_MAX_TX_LEN + \ ISCSI_ISER_MAX_CONN) -static int iser_cq_poll_limit = 512; - -static void iser_cq_tasklet_fn(unsigned long data); -static void iser_cq_callback(struct ib_cq *cq, void *cq_context); - -static void iser_cq_event_callback(struct ib_event *cause, void *context) -{ - iser_err("cq event %s (%d)\n", - ib_event_msg(cause->event), cause->event); -} - static void iser_qp_event_callback(struct ib_event *cause, void *context) { iser_err("qp event %s (%d)\n", @@ -78,59 +67,40 @@ static void iser_event_handler(struct ib_event_handler *handler, */ static int iser_create_device_ib_res(struct iser_device *device) { - struct ib_device_attr *dev_attr = &device->dev_attr; + struct ib_device *ib_dev = device->ib_device; int ret, i, max_cqe; - ret = ib_query_device(device->ib_device, dev_attr); 
- if (ret) { - pr_warn("Query device failed for %s\n", device->ib_device->name); - return ret; - } - ret = iser_assign_reg_ops(device); if (ret) return ret; device->comps_used = min_t(int, num_online_cpus(), - device->ib_device->num_comp_vectors); + ib_dev->num_comp_vectors); device->comps = kcalloc(device->comps_used, sizeof(*device->comps), GFP_KERNEL); if (!device->comps) goto comps_err; - max_cqe = min(ISER_MAX_CQ_LEN, dev_attr->max_cqe); + max_cqe = min(ISER_MAX_CQ_LEN, ib_dev->attrs.max_cqe); iser_info("using %d CQs, device %s supports %d vectors max_cqe %d\n", - device->comps_used, device->ib_device->name, - device->ib_device->num_comp_vectors, max_cqe); + device->comps_used, ib_dev->name, + ib_dev->num_comp_vectors, max_cqe); - device->pd = ib_alloc_pd(device->ib_device); + device->pd = ib_alloc_pd(ib_dev); if (IS_ERR(device->pd)) goto pd_err; for (i = 0; i < device->comps_used; i++) { - struct ib_cq_init_attr cq_attr = {}; struct iser_comp *comp = &device->comps[i]; - comp->device = device; - cq_attr.cqe = max_cqe; - cq_attr.comp_vector = i; - comp->cq = ib_create_cq(device->ib_device, - iser_cq_callback, - iser_cq_event_callback, - (void *)comp, - &cq_attr); + comp->cq = ib_alloc_cq(ib_dev, comp, max_cqe, i, + IB_POLL_SOFTIRQ); if (IS_ERR(comp->cq)) { comp->cq = NULL; goto cq_err; } - - if (ib_req_notify_cq(comp->cq, IB_CQ_NEXT_COMP)) - goto cq_err; - - tasklet_init(&comp->tasklet, iser_cq_tasklet_fn, - (unsigned long)comp); } if (!iser_always_reg) { @@ -140,11 +110,11 @@ static int iser_create_device_ib_res(struct iser_device *device) device->mr = ib_get_dma_mr(device->pd, access); if (IS_ERR(device->mr)) - goto dma_mr_err; + goto cq_err; } - INIT_IB_EVENT_HANDLER(&device->event_handler, device->ib_device, - iser_event_handler); + INIT_IB_EVENT_HANDLER(&device->event_handler, ib_dev, + iser_event_handler); if (ib_register_event_handler(&device->event_handler)) goto handler_err; @@ -153,15 +123,12 @@ static int iser_create_device_ib_res(struct iser_device *device) handler_err: if (device->mr) ib_dereg_mr(device->mr); -dma_mr_err: - for (i = 0; i < device->comps_used; i++) - tasklet_kill(&device->comps[i].tasklet); cq_err: for (i = 0; i < device->comps_used; i++) { struct iser_comp *comp = &device->comps[i]; if (comp->cq) - ib_destroy_cq(comp->cq); + ib_free_cq(comp->cq); } ib_dealloc_pd(device->pd); pd_err: @@ -182,8 +149,7 @@ static void iser_free_device_ib_res(struct iser_device *device) for (i = 0; i < device->comps_used; i++) { struct iser_comp *comp = &device->comps[i]; - tasklet_kill(&comp->tasklet); - ib_destroy_cq(comp->cq); + ib_free_cq(comp->cq); comp->cq = NULL; } @@ -299,7 +265,7 @@ iser_alloc_reg_res(struct ib_device *ib_device, iser_err("Failed to allocate ib_fast_reg_mr err=%d\n", ret); return ret; } - res->mr_valid = 1; + res->mr_valid = 0; return 0; } @@ -336,7 +302,7 @@ iser_alloc_pi_ctx(struct ib_device *ib_device, ret = PTR_ERR(pi_ctx->sig_mr); goto sig_mr_failure; } - pi_ctx->sig_mr_valid = 1; + pi_ctx->sig_mr_valid = 0; desc->pi_ctx->sig_protected = 0; return 0; @@ -461,10 +427,9 @@ void iser_free_fastreg_pool(struct ib_conn *ib_conn) */ static int iser_create_ib_conn_res(struct ib_conn *ib_conn) { - struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn, - ib_conn); + struct iser_conn *iser_conn = to_iser_conn(ib_conn); struct iser_device *device; - struct ib_device_attr *dev_attr; + struct ib_device *ib_dev; struct ib_qp_init_attr init_attr; int ret = -ENOMEM; int index, min_index = 0; @@ -472,7 +437,7 @@ static int 
iser_create_ib_conn_res(struct ib_conn *ib_conn) BUG_ON(ib_conn->device == NULL); device = ib_conn->device; - dev_attr = &device->dev_attr; + ib_dev = device->ib_device; memset(&init_attr, 0, sizeof init_attr); @@ -503,16 +468,16 @@ static int iser_create_ib_conn_res(struct ib_conn *ib_conn) iser_conn->max_cmds = ISER_GET_MAX_XMIT_CMDS(ISER_QP_SIG_MAX_REQ_DTOS); } else { - if (dev_attr->max_qp_wr > ISER_QP_MAX_REQ_DTOS) { + if (ib_dev->attrs.max_qp_wr > ISER_QP_MAX_REQ_DTOS) { init_attr.cap.max_send_wr = ISER_QP_MAX_REQ_DTOS + 1; iser_conn->max_cmds = ISER_GET_MAX_XMIT_CMDS(ISER_QP_MAX_REQ_DTOS); } else { - init_attr.cap.max_send_wr = dev_attr->max_qp_wr; + init_attr.cap.max_send_wr = ib_dev->attrs.max_qp_wr; iser_conn->max_cmds = - ISER_GET_MAX_XMIT_CMDS(dev_attr->max_qp_wr); + ISER_GET_MAX_XMIT_CMDS(ib_dev->attrs.max_qp_wr); iser_dbg("device %s supports max_send_wr %d\n", - device->ib_device->name, dev_attr->max_qp_wr); + device->ib_device->name, ib_dev->attrs.max_qp_wr); } } @@ -724,13 +689,13 @@ int iser_conn_terminate(struct iser_conn *iser_conn) iser_conn, err); /* post an indication that all flush errors were consumed */ - err = ib_post_send(ib_conn->qp, &ib_conn->beacon, &bad_wr); + err = ib_post_send(ib_conn->qp, &ib_conn->last, &bad_wr); if (err) { - iser_err("conn %p failed to post beacon", ib_conn); + iser_err("conn %p failed to post last wr", ib_conn); return 1; } - wait_for_completion(&ib_conn->flush_comp); + wait_for_completion(&ib_conn->last_comp); } return 1; @@ -756,7 +721,7 @@ iser_calc_scsi_params(struct iser_conn *iser_conn, sg_tablesize = DIV_ROUND_UP(max_sectors * 512, SIZE_4K); sup_sg_tablesize = min_t(unsigned, ISCSI_ISER_MAX_SG_TABLESIZE, - device->dev_attr.max_fast_reg_page_list_len); + device->ib_device->attrs.max_fast_reg_page_list_len); if (sg_tablesize > sup_sg_tablesize) { sg_tablesize = sup_sg_tablesize; @@ -799,7 +764,7 @@ static void iser_addr_handler(struct rdma_cm_id *cma_id) /* connection T10-PI support */ if (iser_pi_enable) { - if (!(device->dev_attr.device_cap_flags & + if (!(device->ib_device->attrs.device_cap_flags & IB_DEVICE_SIGNATURE_HANDOVER)) { iser_warn("T10-PI requested but not supported on %s, " "continue without T10-PI\n", @@ -841,16 +806,17 @@ static void iser_route_handler(struct rdma_cm_id *cma_id) goto failure; memset(&conn_param, 0, sizeof conn_param); - conn_param.responder_resources = device->dev_attr.max_qp_rd_atom; + conn_param.responder_resources = device->ib_device->attrs.max_qp_rd_atom; conn_param.initiator_depth = 1; conn_param.retry_count = 7; conn_param.rnr_retry_count = 6; memset(&req_hdr, 0, sizeof(req_hdr)); - req_hdr.flags = (ISER_ZBVA_NOT_SUPPORTED | - ISER_SEND_W_INV_NOT_SUPPORTED); - conn_param.private_data = (void *)&req_hdr; - conn_param.private_data_len = sizeof(struct iser_cm_hdr); + req_hdr.flags = ISER_ZBVA_NOT_SUP; + if (!device->remote_inv_sup) + req_hdr.flags |= ISER_SEND_W_INV_NOT_SUP; + conn_param.private_data = (void *)&req_hdr; + conn_param.private_data_len = sizeof(struct iser_cm_hdr); ret = rdma_connect(cma_id, &conn_param); if (ret) { @@ -863,7 +829,8 @@ failure: iser_connect_error(cma_id); } -static void iser_connected_handler(struct rdma_cm_id *cma_id) +static void iser_connected_handler(struct rdma_cm_id *cma_id, + const void *private_data) { struct iser_conn *iser_conn; struct ib_qp_attr attr; @@ -877,6 +844,15 @@ static void iser_connected_handler(struct rdma_cm_id *cma_id) (void)ib_query_qp(cma_id->qp, &attr, ~0, &init_attr); iser_info("remote qpn:%x my qpn:%x\n", attr.dest_qp_num, 
cma_id->qp->qp_num); + if (private_data) { + u8 flags = *(u8 *)private_data; + + iser_conn->snd_w_inv = !(flags & ISER_SEND_W_INV_NOT_SUP); + } + + iser_info("conn %p: negotiated %s invalidation\n", + iser_conn, iser_conn->snd_w_inv ? "remote" : "local"); + iser_conn->state = ISER_CONN_UP; complete(&iser_conn->up_completion); } @@ -928,7 +904,7 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve iser_route_handler(cma_id); break; case RDMA_CM_EVENT_ESTABLISHED: - iser_connected_handler(cma_id); + iser_connected_handler(cma_id, event->param.conn.private_data); break; case RDMA_CM_EVENT_ADDR_ERROR: case RDMA_CM_EVENT_ROUTE_ERROR: @@ -967,14 +943,21 @@ static int iser_cma_handler(struct rdma_cm_id *cma_id, struct rdma_cm_event *eve void iser_conn_init(struct iser_conn *iser_conn) { + struct ib_conn *ib_conn = &iser_conn->ib_conn; + iser_conn->state = ISER_CONN_INIT; - iser_conn->ib_conn.post_recv_buf_count = 0; - init_completion(&iser_conn->ib_conn.flush_comp); init_completion(&iser_conn->stop_completion); init_completion(&iser_conn->ib_completion); init_completion(&iser_conn->up_completion); INIT_LIST_HEAD(&iser_conn->conn_list); mutex_init(&iser_conn->state_mutex); + + ib_conn->post_recv_buf_count = 0; + ib_conn->reg_cqe.done = iser_reg_comp; + ib_conn->last_cqe.done = iser_last_comp; + ib_conn->last.wr_cqe = &ib_conn->last_cqe; + ib_conn->last.opcode = IB_WR_SEND; + init_completion(&ib_conn->last_comp); } /** @@ -1000,9 +983,6 @@ int iser_connect(struct iser_conn *iser_conn, iser_conn->state = ISER_CONN_PENDING; - ib_conn->beacon.wr_id = ISER_BEACON_WRID; - ib_conn->beacon.opcode = IB_WR_SEND; - ib_conn->cma_id = rdma_create_id(&init_net, iser_cma_handler, (void *)iser_conn, RDMA_PS_TCP, IB_QPT_RC); @@ -1045,56 +1025,60 @@ connect_failure: int iser_post_recvl(struct iser_conn *iser_conn) { - struct ib_recv_wr rx_wr, *rx_wr_failed; struct ib_conn *ib_conn = &iser_conn->ib_conn; - struct ib_sge sge; + struct iser_login_desc *desc = &iser_conn->login_desc; + struct ib_recv_wr wr, *wr_failed; int ib_ret; - sge.addr = iser_conn->login_resp_dma; - sge.length = ISER_RX_LOGIN_SIZE; - sge.lkey = ib_conn->device->pd->local_dma_lkey; + desc->sge.addr = desc->rsp_dma; + desc->sge.length = ISER_RX_LOGIN_SIZE; + desc->sge.lkey = ib_conn->device->pd->local_dma_lkey; - rx_wr.wr_id = (uintptr_t)iser_conn->login_resp_buf; - rx_wr.sg_list = &sge; - rx_wr.num_sge = 1; - rx_wr.next = NULL; + desc->cqe.done = iser_login_rsp; + wr.wr_cqe = &desc->cqe; + wr.sg_list = &desc->sge; + wr.num_sge = 1; + wr.next = NULL; ib_conn->post_recv_buf_count++; - ib_ret = ib_post_recv(ib_conn->qp, &rx_wr, &rx_wr_failed); + ib_ret = ib_post_recv(ib_conn->qp, &wr, &wr_failed); if (ib_ret) { iser_err("ib_post_recv failed ret=%d\n", ib_ret); ib_conn->post_recv_buf_count--; } + return ib_ret; } int iser_post_recvm(struct iser_conn *iser_conn, int count) { - struct ib_recv_wr *rx_wr, *rx_wr_failed; - int i, ib_ret; struct ib_conn *ib_conn = &iser_conn->ib_conn; unsigned int my_rx_head = iser_conn->rx_desc_head; struct iser_rx_desc *rx_desc; + struct ib_recv_wr *wr, *wr_failed; + int i, ib_ret; - for (rx_wr = ib_conn->rx_wr, i = 0; i < count; i++, rx_wr++) { - rx_desc = &iser_conn->rx_descs[my_rx_head]; - rx_wr->wr_id = (uintptr_t)rx_desc; - rx_wr->sg_list = &rx_desc->rx_sg; - rx_wr->num_sge = 1; - rx_wr->next = rx_wr + 1; + for (wr = ib_conn->rx_wr, i = 0; i < count; i++, wr++) { + rx_desc = &iser_conn->rx_descs[my_rx_head]; + rx_desc->cqe.done = iser_task_rsp; + wr->wr_cqe = &rx_desc->cqe; + 
wr->sg_list = &rx_desc->rx_sg; + wr->num_sge = 1; + wr->next = wr + 1; my_rx_head = (my_rx_head + 1) & iser_conn->qp_max_recv_dtos_mask; } - rx_wr--; - rx_wr->next = NULL; /* mark end of work requests list */ + wr--; + wr->next = NULL; /* mark end of work requests list */ ib_conn->post_recv_buf_count += count; - ib_ret = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &rx_wr_failed); + ib_ret = ib_post_recv(ib_conn->qp, ib_conn->rx_wr, &wr_failed); if (ib_ret) { iser_err("ib_post_recv failed ret=%d\n", ib_ret); ib_conn->post_recv_buf_count -= count; } else iser_conn->rx_desc_head = my_rx_head; + return ib_ret; } @@ -1115,7 +1099,7 @@ int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc, DMA_TO_DEVICE); wr->next = NULL; - wr->wr_id = (uintptr_t)tx_desc; + wr->wr_cqe = &tx_desc->cqe; wr->sg_list = tx_desc->tx_sg; wr->num_sge = tx_desc->num_sge; wr->opcode = IB_WR_SEND; @@ -1129,149 +1113,6 @@ int iser_post_send(struct ib_conn *ib_conn, struct iser_tx_desc *tx_desc, return ib_ret; } -/** - * is_iser_tx_desc - Indicate if the completion wr_id - * is a TX descriptor or not. - * @iser_conn: iser connection - * @wr_id: completion WR identifier - * - * Since we cannot rely on wc opcode in FLUSH errors - * we must work around it by checking if the wr_id address - * falls in the iser connection rx_descs buffer. If so - * it is an RX descriptor, otherwize it is a TX. - */ -static inline bool -is_iser_tx_desc(struct iser_conn *iser_conn, void *wr_id) -{ - void *start = iser_conn->rx_descs; - int len = iser_conn->num_rx_descs * sizeof(*iser_conn->rx_descs); - - if (wr_id >= start && wr_id < start + len) - return false; - - return true; -} - -/** - * iser_handle_comp_error() - Handle error completion - * @ib_conn: connection RDMA resources - * @wc: work completion - * - * Notes: We may handle a FLUSH error completion and in this case - * we only cleanup in case TX type was DATAOUT. For non-FLUSH - * error completion we should also notify iscsi layer that - * connection is failed (in case we passed bind stage). - */ -static void -iser_handle_comp_error(struct ib_conn *ib_conn, - struct ib_wc *wc) -{ - void *wr_id = (void *)(uintptr_t)wc->wr_id; - struct iser_conn *iser_conn = container_of(ib_conn, struct iser_conn, - ib_conn); - - if (wc->status != IB_WC_WR_FLUSH_ERR) - if (iser_conn->iscsi_conn) - iscsi_conn_failure(iser_conn->iscsi_conn, - ISCSI_ERR_CONN_FAILED); - - if (wc->wr_id == ISER_FASTREG_LI_WRID) - return; - - if (is_iser_tx_desc(iser_conn, wr_id)) { - struct iser_tx_desc *desc = wr_id; - - if (desc->type == ISCSI_TX_DATAOUT) - kmem_cache_free(ig.desc_cache, desc); - } else { - ib_conn->post_recv_buf_count--; - } -} - -/** - * iser_handle_wc - handle a single work completion - * @wc: work completion - * - * Soft-IRQ context, work completion can be either - * SEND or RECV, and can turn out successful or - * with error (or flush error). 
- */ -static void iser_handle_wc(struct ib_wc *wc) -{ - struct ib_conn *ib_conn; - struct iser_tx_desc *tx_desc; - struct iser_rx_desc *rx_desc; - - ib_conn = wc->qp->qp_context; - if (likely(wc->status == IB_WC_SUCCESS)) { - if (wc->opcode == IB_WC_RECV) { - rx_desc = (struct iser_rx_desc *)(uintptr_t)wc->wr_id; - iser_rcv_completion(rx_desc, wc->byte_len, - ib_conn); - } else - if (wc->opcode == IB_WC_SEND) { - tx_desc = (struct iser_tx_desc *)(uintptr_t)wc->wr_id; - iser_snd_completion(tx_desc, ib_conn); - } else { - iser_err("Unknown wc opcode %d\n", wc->opcode); - } - } else { - if (wc->status != IB_WC_WR_FLUSH_ERR) - iser_err("%s (%d): wr id %llx vend_err %x\n", - ib_wc_status_msg(wc->status), wc->status, - wc->wr_id, wc->vendor_err); - else - iser_dbg("%s (%d): wr id %llx\n", - ib_wc_status_msg(wc->status), wc->status, - wc->wr_id); - - if (wc->wr_id == ISER_BEACON_WRID) - /* all flush errors were consumed */ - complete(&ib_conn->flush_comp); - else - iser_handle_comp_error(ib_conn, wc); - } -} - -/** - * iser_cq_tasklet_fn - iSER completion polling loop - * @data: iSER completion context - * - * Soft-IRQ context, polling connection CQ until - * either CQ was empty or we exausted polling budget - */ -static void iser_cq_tasklet_fn(unsigned long data) -{ - struct iser_comp *comp = (struct iser_comp *)data; - struct ib_cq *cq = comp->cq; - struct ib_wc *const wcs = comp->wcs; - int i, n, completed = 0; - - while ((n = ib_poll_cq(cq, ARRAY_SIZE(comp->wcs), wcs)) > 0) { - for (i = 0; i < n; i++) - iser_handle_wc(&wcs[i]); - - completed += n; - if (completed >= iser_cq_poll_limit) - break; - } - - /* - * It is assumed here that arming CQ only once its empty - * would not cause interrupts to be missed. - */ - ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); - - iser_dbg("got %d completions\n", completed); -} - -static void iser_cq_callback(struct ib_cq *cq, void *cq_context) -{ - struct iser_comp *comp = cq_context; - - tasklet_schedule(&comp->tasklet); -} - u8 iser_check_task_pi_status(struct iscsi_iser_task *iser_task, enum iser_data_dir cmd_dir, sector_t *sector) { @@ -1319,3 +1160,21 @@ err: /* Not alot we can do here, return ambiguous guard error */ return 0x1; } + +void iser_err_comp(struct ib_wc *wc, const char *type) +{ + if (wc->status != IB_WC_WR_FLUSH_ERR) { + struct iser_conn *iser_conn = to_iser_conn(wc->qp->qp_context); + + iser_err("%s failure: %s (%d) vend_err %x\n", type, + ib_wc_status_msg(wc->status), wc->status, + wc->vendor_err); + + if (iser_conn->iscsi_conn) + iscsi_conn_failure(iser_conn->iscsi_conn, + ISCSI_ERR_CONN_FAILED); + } else { + iser_dbg("%s failure: %s (%d)\n", type, + ib_wc_status_msg(wc->status), wc->status); + } +} diff --git a/drivers/infiniband/ulp/isert/ib_isert.c b/drivers/infiniband/ulp/isert/ib_isert.c index 468c5e132563..f121e6129339 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.c +++ b/drivers/infiniband/ulp/isert/ib_isert.c @@ -29,7 +29,6 @@ #include <target/iscsi/iscsi_transport.h> #include <linux/semaphore.h> -#include "isert_proto.h" #include "ib_isert.h" #define ISERT_MAX_CONN 8 @@ -95,22 +94,6 @@ isert_qp_event_callback(struct ib_event *e, void *context) } } -static int -isert_query_device(struct ib_device *ib_dev, struct ib_device_attr *devattr) -{ - int ret; - - ret = ib_query_device(ib_dev, devattr); - if (ret) { - isert_err("ib_query_device() failed: %d\n", ret); - return ret; - } - isert_dbg("devattr->max_sge: %d\n", devattr->max_sge); - isert_dbg("devattr->max_sge_rd: %d\n", devattr->max_sge_rd); - - return 0; -} - static struct 
isert_comp * isert_comp_get(struct isert_conn *isert_conn) { @@ -157,9 +140,9 @@ isert_create_qp(struct isert_conn *isert_conn, attr.recv_cq = comp->cq; attr.cap.max_send_wr = ISERT_QP_MAX_REQ_DTOS; attr.cap.max_recv_wr = ISERT_QP_MAX_RECV_DTOS + 1; - attr.cap.max_send_sge = device->dev_attr.max_sge; - isert_conn->max_sge = min(device->dev_attr.max_sge, - device->dev_attr.max_sge_rd); + attr.cap.max_send_sge = device->ib_device->attrs.max_sge; + isert_conn->max_sge = min(device->ib_device->attrs.max_sge, + device->ib_device->attrs.max_sge_rd); attr.cap.max_recv_sge = 1; attr.sq_sig_type = IB_SIGNAL_REQ_WR; attr.qp_type = IB_QPT_RC; @@ -287,8 +270,7 @@ isert_free_comps(struct isert_device *device) } static int -isert_alloc_comps(struct isert_device *device, - struct ib_device_attr *attr) +isert_alloc_comps(struct isert_device *device) { int i, max_cqe, ret = 0; @@ -308,7 +290,7 @@ isert_alloc_comps(struct isert_device *device, return -ENOMEM; } - max_cqe = min(ISER_MAX_CQ_LEN, attr->max_cqe); + max_cqe = min(ISER_MAX_CQ_LEN, device->ib_device->attrs.max_cqe); for (i = 0; i < device->comps_used; i++) { struct ib_cq_init_attr cq_attr = {}; @@ -344,17 +326,15 @@ out_cq: static int isert_create_device_ib_res(struct isert_device *device) { - struct ib_device_attr *dev_attr; + struct ib_device *ib_dev = device->ib_device; int ret; - dev_attr = &device->dev_attr; - ret = isert_query_device(device->ib_device, dev_attr); - if (ret) - goto out; + isert_dbg("devattr->max_sge: %d\n", ib_dev->attrs.max_sge); + isert_dbg("devattr->max_sge_rd: %d\n", ib_dev->attrs.max_sge_rd); /* asign function handlers */ - if (dev_attr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS && - dev_attr->device_cap_flags & IB_DEVICE_SIGNATURE_HANDOVER) { + if (ib_dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS && + ib_dev->attrs.device_cap_flags & IB_DEVICE_SIGNATURE_HANDOVER) { device->use_fastreg = 1; device->reg_rdma_mem = isert_reg_rdma; device->unreg_rdma_mem = isert_unreg_rdma; @@ -364,11 +344,11 @@ isert_create_device_ib_res(struct isert_device *device) device->unreg_rdma_mem = isert_unmap_cmd; } - ret = isert_alloc_comps(device, dev_attr); + ret = isert_alloc_comps(device); if (ret) goto out; - device->pd = ib_alloc_pd(device->ib_device); + device->pd = ib_alloc_pd(ib_dev); if (IS_ERR(device->pd)) { ret = PTR_ERR(device->pd); isert_err("failed to allocate pd, device %p, ret=%d\n", @@ -377,7 +357,7 @@ isert_create_device_ib_res(struct isert_device *device) } /* Check signature cap */ - device->pi_capable = dev_attr->device_cap_flags & + device->pi_capable = ib_dev->attrs.device_cap_flags & IB_DEVICE_SIGNATURE_HANDOVER ? 
true : false; return 0; @@ -676,6 +656,32 @@ out_login_buf: return ret; } +static void +isert_set_nego_params(struct isert_conn *isert_conn, + struct rdma_conn_param *param) +{ + struct ib_device_attr *attr = &isert_conn->device->ib_device->attrs; + + /* Set max inflight RDMA READ requests */ + isert_conn->initiator_depth = min_t(u8, param->initiator_depth, + attr->max_qp_init_rd_atom); + isert_dbg("Using initiator_depth: %u\n", isert_conn->initiator_depth); + + if (param->private_data) { + u8 flags = *(u8 *)param->private_data; + + /* + * use remote invalidation if both the initiator + * and the HCA support it + */ + isert_conn->snd_w_inv = !(flags & ISER_SEND_W_INV_NOT_SUP) && + (attr->device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS); + if (isert_conn->snd_w_inv) + isert_info("Using remote invalidation\n"); + } +} + static int isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) { @@ -714,11 +720,7 @@ isert_connect_request(struct rdma_cm_id *cma_id, struct rdma_cm_event *event) } isert_conn->device = device; - /* Set max inflight RDMA READ requests */ - isert_conn->initiator_depth = min_t(u8, - event->param.conn.initiator_depth, - device->dev_attr.max_qp_init_rd_atom); - isert_dbg("Using initiator_depth: %u\n", isert_conn->initiator_depth); + isert_set_nego_params(isert_conn, &event->param.conn); ret = isert_conn_setup_qp(isert_conn, cma_id); if (ret) @@ -1050,8 +1052,8 @@ isert_create_send_desc(struct isert_conn *isert_conn, ib_dma_sync_single_for_cpu(ib_dev, tx_desc->dma_addr, ISER_HEADERS_LEN, DMA_TO_DEVICE); - memset(&tx_desc->iser_header, 0, sizeof(struct iser_hdr)); - tx_desc->iser_header.flags = ISER_VER; + memset(&tx_desc->iser_header, 0, sizeof(struct iser_ctrl)); + tx_desc->iser_header.flags = ISCSI_CTRL; tx_desc->num_sge = 1; tx_desc->isert_cmd = isert_cmd; @@ -1097,7 +1099,14 @@ isert_init_send_wr(struct isert_conn *isert_conn, struct isert_cmd *isert_cmd, isert_cmd->rdma_wr.iser_ib_op = ISER_IB_SEND; send_wr->wr_id = (uintptr_t)&isert_cmd->tx_desc; - send_wr->opcode = IB_WR_SEND; + + if (isert_conn->snd_w_inv && isert_cmd->inv_rkey) { + send_wr->opcode = IB_WR_SEND_WITH_INV; + send_wr->ex.invalidate_rkey = isert_cmd->inv_rkey; + } else { + send_wr->opcode = IB_WR_SEND; + } + send_wr->sg_list = &tx_desc->tx_sg[0]; send_wr->num_sge = isert_cmd->tx_desc.num_sge; send_wr->send_flags = IB_SEND_SIGNALED; @@ -1486,6 +1495,7 @@ isert_rx_opcode(struct isert_conn *isert_conn, struct iser_rx_desc *rx_desc, isert_cmd->read_va = read_va; isert_cmd->write_stag = write_stag; isert_cmd->write_va = write_va; + isert_cmd->inv_rkey = read_stag ?
read_stag : write_stag; ret = isert_handle_scsi_cmd(isert_conn, isert_cmd, cmd, rx_desc, (unsigned char *)hdr); @@ -1543,21 +1553,21 @@ isert_rx_opcode(struct isert_conn *isert_conn, struct iser_rx_desc *rx_desc, static void isert_rx_do_work(struct iser_rx_desc *rx_desc, struct isert_conn *isert_conn) { - struct iser_hdr *iser_hdr = &rx_desc->iser_header; + struct iser_ctrl *iser_ctrl = &rx_desc->iser_header; uint64_t read_va = 0, write_va = 0; uint32_t read_stag = 0, write_stag = 0; - switch (iser_hdr->flags & 0xF0) { + switch (iser_ctrl->flags & 0xF0) { case ISCSI_CTRL: - if (iser_hdr->flags & ISER_RSV) { - read_stag = be32_to_cpu(iser_hdr->read_stag); - read_va = be64_to_cpu(iser_hdr->read_va); + if (iser_ctrl->flags & ISER_RSV) { + read_stag = be32_to_cpu(iser_ctrl->read_stag); + read_va = be64_to_cpu(iser_ctrl->read_va); isert_dbg("ISER_RSV: read_stag: 0x%x read_va: 0x%llx\n", read_stag, (unsigned long long)read_va); } - if (iser_hdr->flags & ISER_WSV) { - write_stag = be32_to_cpu(iser_hdr->write_stag); - write_va = be64_to_cpu(iser_hdr->write_va); + if (iser_ctrl->flags & ISER_WSV) { + write_stag = be32_to_cpu(iser_ctrl->write_stag); + write_va = be64_to_cpu(iser_ctrl->write_va); isert_dbg("ISER_WSV: write_stag: 0x%x write_va: 0x%llx\n", write_stag, (unsigned long long)write_va); } @@ -1568,7 +1578,7 @@ isert_rx_do_work(struct iser_rx_desc *rx_desc, struct isert_conn *isert_conn) isert_err("iSER Hello message\n"); break; default: - isert_warn("Unknown iSER hdr flags: 0x%02x\n", iser_hdr->flags); + isert_warn("Unknown iSER hdr flags: 0x%02x\n", iser_ctrl->flags); break; } @@ -3095,12 +3105,20 @@ isert_rdma_accept(struct isert_conn *isert_conn) struct rdma_cm_id *cm_id = isert_conn->cm_id; struct rdma_conn_param cp; int ret; + struct iser_cm_hdr rsp_hdr; memset(&cp, 0, sizeof(struct rdma_conn_param)); cp.initiator_depth = isert_conn->initiator_depth; cp.retry_count = 7; cp.rnr_retry_count = 7; + memset(&rsp_hdr, 0, sizeof(rsp_hdr)); + rsp_hdr.flags = ISERT_ZBVA_NOT_USED; + if (!isert_conn->snd_w_inv) + rsp_hdr.flags = rsp_hdr.flags | ISERT_SEND_W_INV_NOT_USED; + cp.private_data = (void *)&rsp_hdr; + cp.private_data_len = sizeof(rsp_hdr); + ret = rdma_accept(cm_id, &cp); if (ret) { isert_err("rdma_accept() failed with: %d\n", ret); diff --git a/drivers/infiniband/ulp/isert/ib_isert.h b/drivers/infiniband/ulp/isert/ib_isert.h index 3d7fbc47c343..8d50453eef66 100644 --- a/drivers/infiniband/ulp/isert/ib_isert.h +++ b/drivers/infiniband/ulp/isert/ib_isert.h @@ -3,6 +3,8 @@ #include <linux/in6.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> +#include <scsi/iser.h> + #define DRV_NAME "isert" #define PFX DRV_NAME ": " @@ -31,6 +33,38 @@ #define isert_err(fmt, arg...) 
\ pr_err(PFX "%s: " fmt, __func__ , ## arg) +/* Constant PDU lengths calculations */ +#define ISER_HEADERS_LEN (sizeof(struct iser_ctrl) + \ + sizeof(struct iscsi_hdr)) +#define ISER_RECV_DATA_SEG_LEN 8192 +#define ISER_RX_PAYLOAD_SIZE (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN) +#define ISER_RX_LOGIN_SIZE (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN) + +/* QP settings */ +/* Maximal bounds on received asynchronous PDUs */ +#define ISERT_MAX_TX_MISC_PDUS 4 /* NOOP_IN(2) , ASYNC_EVENT(2) */ + +#define ISERT_MAX_RX_MISC_PDUS 6 /* + * NOOP_OUT(2), TEXT(1), + * SCSI_TMFUNC(2), LOGOUT(1) + */ + +#define ISCSI_DEF_XMIT_CMDS_MAX 128 /* from libiscsi.h, must be power of 2 */ + +#define ISERT_QP_MAX_RECV_DTOS (ISCSI_DEF_XMIT_CMDS_MAX) + +#define ISERT_MIN_POSTED_RX (ISCSI_DEF_XMIT_CMDS_MAX >> 2) + +#define ISERT_INFLIGHT_DATAOUTS 8 + +#define ISERT_QP_MAX_REQ_DTOS (ISCSI_DEF_XMIT_CMDS_MAX * \ + (1 + ISERT_INFLIGHT_DATAOUTS) + \ + ISERT_MAX_TX_MISC_PDUS + \ + ISERT_MAX_RX_MISC_PDUS) + +#define ISER_RX_PAD_SIZE (ISER_RECV_DATA_SEG_LEN + 4096 - \ + (ISER_RX_PAYLOAD_SIZE + sizeof(u64) + sizeof(struct ib_sge))) + #define ISCSI_ISER_SG_TABLESIZE 256 #define ISER_FASTREG_LI_WRID 0xffffffffffffffffULL #define ISER_BEACON_WRID 0xfffffffffffffffeULL @@ -56,7 +90,7 @@ enum iser_conn_state { }; struct iser_rx_desc { - struct iser_hdr iser_header; + struct iser_ctrl iser_header; struct iscsi_hdr iscsi_header; char data[ISER_RECV_DATA_SEG_LEN]; u64 dma_addr; @@ -65,7 +99,7 @@ struct iser_rx_desc { } __packed; struct iser_tx_desc { - struct iser_hdr iser_header; + struct iser_ctrl iser_header; struct iscsi_hdr iscsi_header; enum isert_desc_type type; u64 dma_addr; @@ -129,6 +163,7 @@ struct isert_cmd { uint32_t write_stag; uint64_t read_va; uint64_t write_va; + uint32_t inv_rkey; u64 pdu_buf_dma; u32 pdu_buf_len; struct isert_conn *conn; @@ -176,6 +211,7 @@ struct isert_conn { struct work_struct release_work; struct ib_recv_wr beacon; bool logout_posted; + bool snd_w_inv; }; #define ISERT_MAX_CQ 64 @@ -207,7 +243,6 @@ struct isert_device { struct isert_comp *comps; int comps_used; struct list_head dev_node; - struct ib_device_attr dev_attr; int (*reg_rdma_mem)(struct iscsi_conn *conn, struct iscsi_cmd *cmd, struct isert_rdma_wr *wr); diff --git a/drivers/infiniband/ulp/isert/isert_proto.h b/drivers/infiniband/ulp/isert/isert_proto.h deleted file mode 100644 index 4dccd313b777..000000000000 --- a/drivers/infiniband/ulp/isert/isert_proto.h +++ /dev/null @@ -1,47 +0,0 @@ -/* From iscsi_iser.h */ - -struct iser_hdr { - u8 flags; - u8 rsvd[3]; - __be32 write_stag; /* write rkey */ - __be64 write_va; - __be32 read_stag; /* read rkey */ - __be64 read_va; -} __packed; - -/*Constant PDU lengths calculations */ -#define ISER_HEADERS_LEN (sizeof(struct iser_hdr) + sizeof(struct iscsi_hdr)) - -#define ISER_RECV_DATA_SEG_LEN 8192 -#define ISER_RX_PAYLOAD_SIZE (ISER_HEADERS_LEN + ISER_RECV_DATA_SEG_LEN) -#define ISER_RX_LOGIN_SIZE (ISER_HEADERS_LEN + ISCSI_DEF_MAX_RECV_SEG_LEN) - -/* QP settings */ -/* Maximal bounds on received asynchronous PDUs */ -#define ISERT_MAX_TX_MISC_PDUS 4 /* NOOP_IN(2) , ASYNC_EVENT(2) */ - -#define ISERT_MAX_RX_MISC_PDUS 6 /* NOOP_OUT(2), TEXT(1), * - * SCSI_TMFUNC(2), LOGOUT(1) */ - -#define ISCSI_DEF_XMIT_CMDS_MAX 128 /* from libiscsi.h, must be power of 2 */ - -#define ISERT_QP_MAX_RECV_DTOS (ISCSI_DEF_XMIT_CMDS_MAX) - -#define ISERT_MIN_POSTED_RX (ISCSI_DEF_XMIT_CMDS_MAX >> 2) - -#define ISERT_INFLIGHT_DATAOUTS 8 - -#define ISERT_QP_MAX_REQ_DTOS (ISCSI_DEF_XMIT_CMDS_MAX * \ - (1 + 
ISERT_INFLIGHT_DATAOUTS) + \ - ISERT_MAX_TX_MISC_PDUS + \ - ISERT_MAX_RX_MISC_PDUS) - -#define ISER_RX_PAD_SIZE (ISER_RECV_DATA_SEG_LEN + 4096 - \ - (ISER_RX_PAYLOAD_SIZE + sizeof(u64) + sizeof(struct ib_sge))) - -#define ISER_VER 0x10 -#define ISER_WSV 0x08 -#define ISER_RSV 0x04 -#define ISCSI_CTRL 0x10 -#define ISER_HELLO 0x20 -#define ISER_HELLORPLY 0x30 diff --git a/drivers/infiniband/ulp/srp/ib_srp.c b/drivers/infiniband/ulp/srp/ib_srp.c index 3db9a659719b..03022f6420d7 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.c +++ b/drivers/infiniband/ulp/srp/ib_srp.c @@ -132,8 +132,9 @@ MODULE_PARM_DESC(ch_count, static void srp_add_one(struct ib_device *device); static void srp_remove_one(struct ib_device *device, void *client_data); -static void srp_recv_completion(struct ib_cq *cq, void *ch_ptr); -static void srp_send_completion(struct ib_cq *cq, void *ch_ptr); +static void srp_recv_done(struct ib_cq *cq, struct ib_wc *wc); +static void srp_handle_qp_err(struct ib_cq *cq, struct ib_wc *wc, + const char *opname); static int srp_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event); static struct scsi_transport_template *ib_srp_transport_template; @@ -445,6 +446,17 @@ static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target) dev->max_pages_per_mr); } +static void srp_drain_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct srp_rdma_ch *ch = cq->cq_context; + + complete(&ch->done); +} + +static struct ib_cqe srp_drain_cqe = { + .done = srp_drain_done, +}; + /** * srp_destroy_qp() - destroy an RDMA queue pair * @ch: SRP RDMA channel. @@ -457,10 +469,11 @@ static struct srp_fr_pool *srp_alloc_fr_pool(struct srp_target_port *target) static void srp_destroy_qp(struct srp_rdma_ch *ch) { static struct ib_qp_attr attr = { .qp_state = IB_QPS_ERR }; - static struct ib_recv_wr wr = { .wr_id = SRP_LAST_WR_ID }; + static struct ib_recv_wr wr = { 0 }; struct ib_recv_wr *bad_wr; int ret; + wr.wr_cqe = &srp_drain_cqe; /* Destroying a QP and reusing ch->done is only safe if not connected */ WARN_ON_ONCE(ch->connected); @@ -489,34 +502,27 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch) struct ib_fmr_pool *fmr_pool = NULL; struct srp_fr_pool *fr_pool = NULL; const int m = dev->use_fast_reg ? 
3 : 1; - struct ib_cq_init_attr cq_attr = {}; int ret; init_attr = kzalloc(sizeof *init_attr, GFP_KERNEL); if (!init_attr) return -ENOMEM; - /* + 1 for SRP_LAST_WR_ID */ - cq_attr.cqe = target->queue_size + 1; - cq_attr.comp_vector = ch->comp_vector; - recv_cq = ib_create_cq(dev->dev, srp_recv_completion, NULL, ch, - &cq_attr); + /* queue_size + 1 for ib_drain_qp */ + recv_cq = ib_alloc_cq(dev->dev, ch, target->queue_size + 1, + ch->comp_vector, IB_POLL_SOFTIRQ); if (IS_ERR(recv_cq)) { ret = PTR_ERR(recv_cq); goto err; } - cq_attr.cqe = m * target->queue_size; - cq_attr.comp_vector = ch->comp_vector; - send_cq = ib_create_cq(dev->dev, srp_send_completion, NULL, ch, - &cq_attr); + send_cq = ib_alloc_cq(dev->dev, ch, m * target->queue_size, + ch->comp_vector, IB_POLL_DIRECT); if (IS_ERR(send_cq)) { ret = PTR_ERR(send_cq); goto err_recv_cq; } - ib_req_notify_cq(recv_cq, IB_CQ_NEXT_COMP); - init_attr->event_handler = srp_qp_event; init_attr->cap.max_send_wr = m * target->queue_size; init_attr->cap.max_recv_wr = target->queue_size + 1; @@ -558,9 +564,9 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch) if (ch->qp) srp_destroy_qp(ch); if (ch->recv_cq) - ib_destroy_cq(ch->recv_cq); + ib_free_cq(ch->recv_cq); if (ch->send_cq) - ib_destroy_cq(ch->send_cq); + ib_free_cq(ch->send_cq); ch->qp = qp; ch->recv_cq = recv_cq; @@ -580,13 +586,13 @@ static int srp_create_ch_ib(struct srp_rdma_ch *ch) return 0; err_qp: - ib_destroy_qp(qp); + srp_destroy_qp(ch); err_send_cq: - ib_destroy_cq(send_cq); + ib_free_cq(send_cq); err_recv_cq: - ib_destroy_cq(recv_cq); + ib_free_cq(recv_cq); err: kfree(init_attr); @@ -622,9 +628,10 @@ static void srp_free_ch_ib(struct srp_target_port *target, if (ch->fmr_pool) ib_destroy_fmr_pool(ch->fmr_pool); } + srp_destroy_qp(ch); - ib_destroy_cq(ch->send_cq); - ib_destroy_cq(ch->recv_cq); + ib_free_cq(ch->send_cq); + ib_free_cq(ch->recv_cq); /* * Avoid that the SCSI error handler tries to use this channel after @@ -1041,18 +1048,25 @@ out: return ret <= 0 ? 
ret : -ENODEV; } -static int srp_inv_rkey(struct srp_rdma_ch *ch, u32 rkey) +static void srp_inv_rkey_err_done(struct ib_cq *cq, struct ib_wc *wc) +{ + srp_handle_qp_err(cq, wc, "INV RKEY"); +} + +static int srp_inv_rkey(struct srp_request *req, struct srp_rdma_ch *ch, + u32 rkey) { struct ib_send_wr *bad_wr; struct ib_send_wr wr = { .opcode = IB_WR_LOCAL_INV, - .wr_id = LOCAL_INV_WR_ID_MASK, .next = NULL, .num_sge = 0, .send_flags = 0, .ex.invalidate_rkey = rkey, }; + wr.wr_cqe = &req->reg_cqe; + req->reg_cqe.done = srp_inv_rkey_err_done; return ib_post_send(ch->qp, &wr, &bad_wr); } @@ -1074,7 +1088,7 @@ static void srp_unmap_data(struct scsi_cmnd *scmnd, struct srp_fr_desc **pfr; for (i = req->nmdesc, pfr = req->fr_list; i > 0; i--, pfr++) { - res = srp_inv_rkey(ch, (*pfr)->mr->rkey); + res = srp_inv_rkey(req, ch, (*pfr)->mr->rkey); if (res < 0) { shost_printk(KERN_ERR, target->scsi_host, PFX "Queueing INV WR for rkey %#x failed (%d)\n", @@ -1312,7 +1326,13 @@ reset_state: return 0; } +static void srp_reg_mr_err_done(struct ib_cq *cq, struct ib_wc *wc) +{ + srp_handle_qp_err(cq, wc, "FAST REG"); +} + static int srp_map_finish_fr(struct srp_map_state *state, + struct srp_request *req, struct srp_rdma_ch *ch, int sg_nents) { struct srp_target_port *target = ch->target; @@ -1349,9 +1369,11 @@ static int srp_map_finish_fr(struct srp_map_state *state, if (unlikely(n < 0)) return n; + req->reg_cqe.done = srp_reg_mr_err_done; + wr.wr.next = NULL; wr.wr.opcode = IB_WR_REG_MR; - wr.wr.wr_id = FAST_REG_WR_ID_MASK; + wr.wr.wr_cqe = &req->reg_cqe; wr.wr.num_sge = 0; wr.wr.send_flags = 0; wr.mr = desc->mr; @@ -1455,7 +1477,7 @@ static int srp_map_sg_fr(struct srp_map_state *state, struct srp_rdma_ch *ch, while (count) { int i, n; - n = srp_map_finish_fr(state, ch, count); + n = srp_map_finish_fr(state, req, ch, count); if (unlikely(n < 0)) return n; @@ -1524,7 +1546,7 @@ static int srp_map_idb(struct srp_rdma_ch *ch, struct srp_request *req, #ifdef CONFIG_NEED_SG_DMA_LENGTH idb_sg->dma_length = idb_sg->length; /* hack^2 */ #endif - ret = srp_map_finish_fr(&state, ch, 1); + ret = srp_map_finish_fr(&state, req, ch, 1); if (ret < 0) return ret; } else if (dev->use_fmr) { @@ -1719,7 +1741,7 @@ static struct srp_iu *__srp_get_tx_iu(struct srp_rdma_ch *ch, s32 rsv = (iu_type == SRP_IU_TSK_MGMT) ? 
0 : SRP_TSK_MGMT_SQ_SIZE; struct srp_iu *iu; - srp_send_completion(ch->send_cq, ch); + ib_process_cq_direct(ch->send_cq, -1); if (list_empty(&ch->free_tx)) return NULL; @@ -1739,6 +1761,19 @@ static struct srp_iu *__srp_get_tx_iu(struct srp_rdma_ch *ch, return iu; } +static void srp_send_done(struct ib_cq *cq, struct ib_wc *wc) +{ + struct srp_iu *iu = container_of(wc->wr_cqe, struct srp_iu, cqe); + struct srp_rdma_ch *ch = cq->cq_context; + + if (unlikely(wc->status != IB_WC_SUCCESS)) { + srp_handle_qp_err(cq, wc, "SEND"); + return; + } + + list_add(&iu->list, &ch->free_tx); +} + static int srp_post_send(struct srp_rdma_ch *ch, struct srp_iu *iu, int len) { struct srp_target_port *target = ch->target; @@ -1749,8 +1784,10 @@ static int srp_post_send(struct srp_rdma_ch *ch, struct srp_iu *iu, int len) list.length = len; list.lkey = target->lkey; + iu->cqe.done = srp_send_done; + wr.next = NULL; - wr.wr_id = (uintptr_t) iu; + wr.wr_cqe = &iu->cqe; wr.sg_list = &list; wr.num_sge = 1; wr.opcode = IB_WR_SEND; @@ -1769,8 +1806,10 @@ static int srp_post_recv(struct srp_rdma_ch *ch, struct srp_iu *iu) list.length = iu->size; list.lkey = target->lkey; + iu->cqe.done = srp_recv_done; + wr.next = NULL; - wr.wr_id = (uintptr_t) iu; + wr.wr_cqe = &iu->cqe; wr.sg_list = &list; wr.num_sge = 1; @@ -1902,14 +1941,20 @@ static void srp_process_aer_req(struct srp_rdma_ch *ch, "problems processing SRP_AER_REQ\n"); } -static void srp_handle_recv(struct srp_rdma_ch *ch, struct ib_wc *wc) +static void srp_recv_done(struct ib_cq *cq, struct ib_wc *wc) { + struct srp_iu *iu = container_of(wc->wr_cqe, struct srp_iu, cqe); + struct srp_rdma_ch *ch = cq->cq_context; struct srp_target_port *target = ch->target; struct ib_device *dev = target->srp_host->srp_dev->dev; - struct srp_iu *iu = (struct srp_iu *) (uintptr_t) wc->wr_id; int res; u8 opcode; + if (unlikely(wc->status != IB_WC_SUCCESS)) { + srp_handle_qp_err(cq, wc, "RECV"); + return; + } + ib_dma_sync_single_for_cpu(dev, iu->dma, ch->max_ti_iu_len, DMA_FROM_DEVICE); @@ -1972,68 +2017,22 @@ static void srp_tl_err_work(struct work_struct *work) srp_start_tl_fail_timers(target->rport); } -static void srp_handle_qp_err(u64 wr_id, enum ib_wc_status wc_status, - bool send_err, struct srp_rdma_ch *ch) +static void srp_handle_qp_err(struct ib_cq *cq, struct ib_wc *wc, + const char *opname) { + struct srp_rdma_ch *ch = cq->cq_context; struct srp_target_port *target = ch->target; - if (wr_id == SRP_LAST_WR_ID) { - complete(&ch->done); - return; - } - if (ch->connected && !target->qp_in_error) { - if (wr_id & LOCAL_INV_WR_ID_MASK) { - shost_printk(KERN_ERR, target->scsi_host, PFX - "LOCAL_INV failed with status %s (%d)\n", - ib_wc_status_msg(wc_status), wc_status); - } else if (wr_id & FAST_REG_WR_ID_MASK) { - shost_printk(KERN_ERR, target->scsi_host, PFX - "FAST_REG_MR failed status %s (%d)\n", - ib_wc_status_msg(wc_status), wc_status); - } else { - shost_printk(KERN_ERR, target->scsi_host, - PFX "failed %s status %s (%d) for iu %p\n", - send_err ? 
"send" : "receive", - ib_wc_status_msg(wc_status), wc_status, - (void *)(uintptr_t)wr_id); - } + shost_printk(KERN_ERR, target->scsi_host, + PFX "failed %s status %s (%d) for CQE %p\n", + opname, ib_wc_status_msg(wc->status), wc->status, + wc->wr_cqe); queue_work(system_long_wq, &target->tl_err_work); } target->qp_in_error = true; } -static void srp_recv_completion(struct ib_cq *cq, void *ch_ptr) -{ - struct srp_rdma_ch *ch = ch_ptr; - struct ib_wc wc; - - ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); - while (ib_poll_cq(cq, 1, &wc) > 0) { - if (likely(wc.status == IB_WC_SUCCESS)) { - srp_handle_recv(ch, &wc); - } else { - srp_handle_qp_err(wc.wr_id, wc.status, false, ch); - } - } -} - -static void srp_send_completion(struct ib_cq *cq, void *ch_ptr) -{ - struct srp_rdma_ch *ch = ch_ptr; - struct ib_wc wc; - struct srp_iu *iu; - - while (ib_poll_cq(cq, 1, &wc) > 0) { - if (likely(wc.status == IB_WC_SUCCESS)) { - iu = (struct srp_iu *) (uintptr_t) wc.wr_id; - list_add(&iu->list, &ch->free_tx); - } else { - srp_handle_qp_err(wc.wr_id, wc.status, true, ch); - } - } -} - static int srp_queuecommand(struct Scsi_Host *shost, struct scsi_cmnd *scmnd) { struct srp_target_port *target = host_to_target(shost); @@ -3439,27 +3438,17 @@ free_host: static void srp_add_one(struct ib_device *device) { struct srp_device *srp_dev; - struct ib_device_attr *dev_attr; struct srp_host *host; int mr_page_shift, p; u64 max_pages_per_mr; - dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); - if (!dev_attr) - return; - - if (ib_query_device(device, dev_attr)) { - pr_warn("Query device failed for %s\n", device->name); - goto free_attr; - } - srp_dev = kmalloc(sizeof *srp_dev, GFP_KERNEL); if (!srp_dev) - goto free_attr; + return; srp_dev->has_fmr = (device->alloc_fmr && device->dealloc_fmr && device->map_phys_fmr && device->unmap_fmr); - srp_dev->has_fr = (dev_attr->device_cap_flags & + srp_dev->has_fr = (device->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS); if (!srp_dev->has_fmr && !srp_dev->has_fr) dev_warn(&device->dev, "neither FMR nor FR is supported\n"); @@ -3473,23 +3462,23 @@ static void srp_add_one(struct ib_device *device) * minimum of 4096 bytes. We're unlikely to build large sglists * out of smaller entries. 
*/ - mr_page_shift = max(12, ffs(dev_attr->page_size_cap) - 1); + mr_page_shift = max(12, ffs(device->attrs.page_size_cap) - 1); srp_dev->mr_page_size = 1 << mr_page_shift; srp_dev->mr_page_mask = ~((u64) srp_dev->mr_page_size - 1); - max_pages_per_mr = dev_attr->max_mr_size; + max_pages_per_mr = device->attrs.max_mr_size; do_div(max_pages_per_mr, srp_dev->mr_page_size); srp_dev->max_pages_per_mr = min_t(u64, SRP_MAX_PAGES_PER_MR, max_pages_per_mr); if (srp_dev->use_fast_reg) { srp_dev->max_pages_per_mr = min_t(u32, srp_dev->max_pages_per_mr, - dev_attr->max_fast_reg_page_list_len); + device->attrs.max_fast_reg_page_list_len); } srp_dev->mr_max_size = srp_dev->mr_page_size * srp_dev->max_pages_per_mr; - pr_debug("%s: mr_page_shift = %d, dev_attr->max_mr_size = %#llx, dev_attr->max_fast_reg_page_list_len = %u, max_pages_per_mr = %d, mr_max_size = %#x\n", - device->name, mr_page_shift, dev_attr->max_mr_size, - dev_attr->max_fast_reg_page_list_len, + pr_debug("%s: mr_page_shift = %d, device->max_mr_size = %#llx, device->max_fast_reg_page_list_len = %u, max_pages_per_mr = %d, mr_max_size = %#x\n", + device->name, mr_page_shift, device->attrs.max_mr_size, + device->attrs.max_fast_reg_page_list_len, srp_dev->max_pages_per_mr, srp_dev->mr_max_size); INIT_LIST_HEAD(&srp_dev->dev_list); @@ -3517,17 +3506,13 @@ static void srp_add_one(struct ib_device *device) } ib_set_client_data(device, &srp_client, srp_dev); - - goto free_attr; + return; err_pd: ib_dealloc_pd(srp_dev->pd); free_dev: kfree(srp_dev); - -free_attr: - kfree(dev_attr); } static void srp_remove_one(struct ib_device *device, void *client_data) @@ -3587,8 +3572,6 @@ static int __init srp_init_module(void) { int ret; - BUILD_BUG_ON(FIELD_SIZEOF(struct ib_wc, wr_id) < sizeof(void *)); - if (srp_sg_tablesize) { pr_warn("srp_sg_tablesize is deprecated, please use cmd_sg_entries\n"); if (!cmd_sg_entries) diff --git a/drivers/infiniband/ulp/srp/ib_srp.h b/drivers/infiniband/ulp/srp/ib_srp.h index f6af531f9f32..9e05ce4a04fd 100644 --- a/drivers/infiniband/ulp/srp/ib_srp.h +++ b/drivers/infiniband/ulp/srp/ib_srp.h @@ -66,11 +66,6 @@ enum { SRP_TAG_TSK_MGMT = 1U << 31, SRP_MAX_PAGES_PER_MR = 512, - - LOCAL_INV_WR_ID_MASK = 1, - FAST_REG_WR_ID_MASK = 2, - - SRP_LAST_WR_ID = 0xfffffffcU, }; enum srp_target_state { @@ -128,6 +123,7 @@ struct srp_request { struct srp_direct_buf *indirect_desc; dma_addr_t indirect_dma_addr; short nmdesc; + struct ib_cqe reg_cqe; }; /** @@ -231,6 +227,7 @@ struct srp_iu { void *buf; size_t size; enum dma_data_direction direction; + struct ib_cqe cqe; }; /** diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.c b/drivers/infiniband/ulp/srpt/ib_srpt.c index bc5470c43d26..0c37fee363b1 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.c +++ b/drivers/infiniband/ulp/srpt/ib_srpt.c @@ -93,6 +93,8 @@ MODULE_PARM_DESC(srpt_service_guid, static struct ib_client srpt_client; static void srpt_release_channel(struct srpt_rdma_ch *ch); static int srpt_queue_status(struct se_cmd *cmd); +static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc); +static void srpt_send_done(struct ib_cq *cq, struct ib_wc *wc); /** * opposite_dma_dir() - Swap DMA_TO_DEVICE and DMA_FROM_DEVICE. 
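/*
 * Editor's note: the ib_srpt hunks below apply the same completion-queue
 * conversion already seen in iser and srp above. Instead of encoding an
 * opcode/index pair into wr_id and decoding it in a hand-rolled poll loop,
 * each per-request context embeds a struct ib_cqe; the poll loop behind
 * ib_alloc_cq() dispatches straight to the ->done() callback, which
 * recovers the context with container_of(). A hedged sketch of the recipe
 * (the type and function names here are illustrative, not from the patch):
 *
 *	struct my_req {
 *		struct ib_cqe cqe;
 *		// ... request state ...
 *	};
 *
 *	static void my_send_done(struct ib_cq *cq, struct ib_wc *wc)
 *	{
 *		struct my_req *req =
 *			container_of(wc->wr_cqe, struct my_req, cqe);
 *
 *		if (unlikely(wc->status != IB_WC_SUCCESS))
 *			return;	// flush/error handling goes here
 *		// ... complete req ...
 *	}
 *
 *	struct ib_send_wr wr = {}, *bad_wr;
 *
 *	req->cqe.done = my_send_done;
 *	wr.wr_cqe = &req->cqe;	// replaces wr.wr_id = (uintptr_t)req
 *	ret = ib_post_send(qp, &wr, &bad_wr);
 */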
@@ -341,10 +343,10 @@ static void srpt_get_ioc(struct srpt_port *sport, u32 slot, memset(iocp, 0, sizeof *iocp); strcpy(iocp->id_string, SRPT_ID_STRING); iocp->guid = cpu_to_be64(srpt_service_guid); - iocp->vendor_id = cpu_to_be32(sdev->dev_attr.vendor_id); - iocp->device_id = cpu_to_be32(sdev->dev_attr.vendor_part_id); - iocp->device_version = cpu_to_be16(sdev->dev_attr.hw_ver); - iocp->subsys_vendor_id = cpu_to_be32(sdev->dev_attr.vendor_id); + iocp->vendor_id = cpu_to_be32(sdev->device->attrs.vendor_id); + iocp->device_id = cpu_to_be32(sdev->device->attrs.vendor_part_id); + iocp->device_version = cpu_to_be16(sdev->device->attrs.hw_ver); + iocp->subsys_vendor_id = cpu_to_be32(sdev->device->attrs.vendor_id); iocp->subsys_device_id = 0x0; iocp->io_class = cpu_to_be16(SRP_REV16A_IB_IO_CLASS); iocp->io_subclass = cpu_to_be16(SRP_IO_SUBCLASS); @@ -453,6 +455,7 @@ static void srpt_mad_send_handler(struct ib_mad_agent *mad_agent, * srpt_mad_recv_handler() - MAD reception callback function. */ static void srpt_mad_recv_handler(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, struct ib_mad_recv_wc *mad_wc) { struct srpt_port *sport = (struct srpt_port *)mad_agent->context; @@ -778,12 +781,12 @@ static int srpt_post_recv(struct srpt_device *sdev, struct ib_recv_wr wr, *bad_wr; BUG_ON(!sdev); - wr.wr_id = encode_wr_id(SRPT_RECV, ioctx->ioctx.index); - list.addr = ioctx->ioctx.dma; list.length = srp_max_req_size; list.lkey = sdev->pd->local_dma_lkey; + ioctx->ioctx.cqe.done = srpt_recv_done; + wr.wr_cqe = &ioctx->ioctx.cqe; wr.next = NULL; wr.sg_list = &list; wr.num_sge = 1; @@ -819,8 +822,9 @@ static int srpt_post_send(struct srpt_rdma_ch *ch, list.length = len; list.lkey = sdev->pd->local_dma_lkey; + ioctx->ioctx.cqe.done = srpt_send_done; wr.next = NULL; - wr.wr_id = encode_wr_id(SRPT_SEND, ioctx->ioctx.index); + wr.wr_cqe = &ioctx->ioctx.cqe; wr.sg_list = &list; wr.num_sge = 1; wr.opcode = IB_WR_SEND; @@ -1052,13 +1056,13 @@ static void srpt_unmap_sg_to_ib_sge(struct srpt_rdma_ch *ch, BUG_ON(!ch); BUG_ON(!ioctx); - BUG_ON(ioctx->n_rdma && !ioctx->rdma_ius); + BUG_ON(ioctx->n_rdma && !ioctx->rdma_wrs); while (ioctx->n_rdma) - kfree(ioctx->rdma_ius[--ioctx->n_rdma].sge); + kfree(ioctx->rdma_wrs[--ioctx->n_rdma].wr.sg_list); - kfree(ioctx->rdma_ius); - ioctx->rdma_ius = NULL; + kfree(ioctx->rdma_wrs); + ioctx->rdma_wrs = NULL; if (ioctx->mapped_sg_count) { sg = ioctx->sg; @@ -1082,7 +1086,7 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch, struct scatterlist *sg, *sg_orig; int sg_cnt; enum dma_data_direction dir; - struct rdma_iu *riu; + struct ib_rdma_wr *riu; struct srp_direct_buf *db; dma_addr_t dma_addr; struct ib_sge *sge; @@ -1109,23 +1113,24 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch, ioctx->mapped_sg_count = count; - if (ioctx->rdma_ius && ioctx->n_rdma_ius) - nrdma = ioctx->n_rdma_ius; + if (ioctx->rdma_wrs && ioctx->n_rdma_wrs) + nrdma = ioctx->n_rdma_wrs; else { nrdma = (count + SRPT_DEF_SG_PER_WQE - 1) / SRPT_DEF_SG_PER_WQE + ioctx->n_rbuf; - ioctx->rdma_ius = kzalloc(nrdma * sizeof *riu, GFP_KERNEL); - if (!ioctx->rdma_ius) + ioctx->rdma_wrs = kcalloc(nrdma, sizeof(*ioctx->rdma_wrs), + GFP_KERNEL); + if (!ioctx->rdma_wrs) goto free_mem; - ioctx->n_rdma_ius = nrdma; + ioctx->n_rdma_wrs = nrdma; } db = ioctx->rbufs; tsize = cmd->data_length; dma_len = ib_sg_dma_len(dev, &sg[0]); - riu = ioctx->rdma_ius; + riu = ioctx->rdma_wrs; /* * For each remote desc - calculate the #ib_sge. 
@@ -1139,9 +1144,9 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch, j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) { rsize = be32_to_cpu(db->len); raddr = be64_to_cpu(db->va); - riu->raddr = raddr; + riu->remote_addr = raddr; riu->rkey = be32_to_cpu(db->key); - riu->sge_cnt = 0; + riu->wr.num_sge = 0; /* calculate how many sge required for this remote_buf */ while (rsize > 0 && tsize > 0) { @@ -1165,33 +1170,35 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch, rsize = 0; } - ++riu->sge_cnt; + ++riu->wr.num_sge; - if (rsize > 0 && riu->sge_cnt == SRPT_DEF_SG_PER_WQE) { + if (rsize > 0 && + riu->wr.num_sge == SRPT_DEF_SG_PER_WQE) { ++ioctx->n_rdma; - riu->sge = - kmalloc(riu->sge_cnt * sizeof *riu->sge, - GFP_KERNEL); - if (!riu->sge) + riu->wr.sg_list = kmalloc_array(riu->wr.num_sge, + sizeof(*riu->wr.sg_list), + GFP_KERNEL); + if (!riu->wr.sg_list) goto free_mem; ++riu; - riu->sge_cnt = 0; - riu->raddr = raddr; + riu->wr.num_sge = 0; + riu->remote_addr = raddr; riu->rkey = be32_to_cpu(db->key); } } ++ioctx->n_rdma; - riu->sge = kmalloc(riu->sge_cnt * sizeof *riu->sge, - GFP_KERNEL); - if (!riu->sge) + riu->wr.sg_list = kmalloc_array(riu->wr.num_sge, + sizeof(*riu->wr.sg_list), + GFP_KERNEL); + if (!riu->wr.sg_list) goto free_mem; } db = ioctx->rbufs; tsize = cmd->data_length; - riu = ioctx->rdma_ius; + riu = ioctx->rdma_wrs; sg = sg_orig; dma_len = ib_sg_dma_len(dev, &sg[0]); dma_addr = ib_sg_dma_address(dev, &sg[0]); @@ -1200,7 +1207,7 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch, for (i = 0, j = 0; j < count && i < ioctx->n_rbuf && tsize > 0; ++i, ++riu, ++db) { rsize = be32_to_cpu(db->len); - sge = riu->sge; + sge = riu->wr.sg_list; k = 0; while (rsize > 0 && tsize > 0) { @@ -1232,9 +1239,9 @@ static int srpt_map_sg_to_ib_sge(struct srpt_rdma_ch *ch, } ++k; - if (k == riu->sge_cnt && rsize > 0 && tsize > 0) { + if (k == riu->wr.num_sge && rsize > 0 && tsize > 0) { ++riu; - sge = riu->sge; + sge = riu->wr.sg_list; k = 0; } else if (rsize > 0 && tsize > 0) ++sge; @@ -1277,8 +1284,8 @@ static struct srpt_send_ioctx *srpt_get_send_ioctx(struct srpt_rdma_ch *ch) ioctx->n_rbuf = 0; ioctx->rbufs = NULL; ioctx->n_rdma = 0; - ioctx->n_rdma_ius = 0; - ioctx->rdma_ius = NULL; + ioctx->n_rdma_wrs = 0; + ioctx->rdma_wrs = NULL; ioctx->mapped_sg_count = 0; init_completion(&ioctx->tx_done); ioctx->queue_status_only = false; @@ -1380,118 +1387,44 @@ out: } /** - * srpt_handle_send_err_comp() - Process an IB_WC_SEND error completion. - */ -static void srpt_handle_send_err_comp(struct srpt_rdma_ch *ch, u64 wr_id) -{ - struct srpt_send_ioctx *ioctx; - enum srpt_command_state state; - u32 index; - - atomic_inc(&ch->sq_wr_avail); - - index = idx_from_wr_id(wr_id); - ioctx = ch->ioctx_ring[index]; - state = srpt_get_cmd_state(ioctx); - - WARN_ON(state != SRPT_STATE_CMD_RSP_SENT - && state != SRPT_STATE_MGMT_RSP_SENT - && state != SRPT_STATE_NEED_DATA - && state != SRPT_STATE_DONE); - - /* If SRP_RSP sending failed, undo the ch->req_lim change. */ - if (state == SRPT_STATE_CMD_RSP_SENT - || state == SRPT_STATE_MGMT_RSP_SENT) - atomic_dec(&ch->req_lim); - - srpt_abort_cmd(ioctx); -} - -/** - * srpt_handle_send_comp() - Process an IB send completion notification. 
- */ -static void srpt_handle_send_comp(struct srpt_rdma_ch *ch, - struct srpt_send_ioctx *ioctx) -{ - enum srpt_command_state state; - - atomic_inc(&ch->sq_wr_avail); - - state = srpt_set_cmd_state(ioctx, SRPT_STATE_DONE); - - if (WARN_ON(state != SRPT_STATE_CMD_RSP_SENT - && state != SRPT_STATE_MGMT_RSP_SENT - && state != SRPT_STATE_DONE)) - pr_debug("state = %d\n", state); - - if (state != SRPT_STATE_DONE) { - srpt_unmap_sg_to_ib_sge(ch, ioctx); - transport_generic_free_cmd(&ioctx->cmd, 0); - } else { - pr_err("IB completion has been received too late for" - " wr_id = %u.\n", ioctx->ioctx.index); - } -} - -/** - * srpt_handle_rdma_comp() - Process an IB RDMA completion notification. - * * XXX: what is now target_execute_cmd used to be asynchronous, and unmapping * the data that has been transferred via IB RDMA had to be postponed until the * check_stop_free() callback. None of this is necessary anymore and needs to * be cleaned up. */ -static void srpt_handle_rdma_comp(struct srpt_rdma_ch *ch, - struct srpt_send_ioctx *ioctx, - enum srpt_opcode opcode) +static void srpt_rdma_read_done(struct ib_cq *cq, struct ib_wc *wc) { + struct srpt_rdma_ch *ch = cq->cq_context; + struct srpt_send_ioctx *ioctx = + container_of(wc->wr_cqe, struct srpt_send_ioctx, rdma_cqe); + WARN_ON(ioctx->n_rdma <= 0); atomic_add(ioctx->n_rdma, &ch->sq_wr_avail); - if (opcode == SRPT_RDMA_READ_LAST) { - if (srpt_test_and_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA, - SRPT_STATE_DATA_IN)) - target_execute_cmd(&ioctx->cmd); - else - pr_err("%s[%d]: wrong state = %d\n", __func__, - __LINE__, srpt_get_cmd_state(ioctx)); - } else if (opcode == SRPT_RDMA_ABORT) { - ioctx->rdma_aborted = true; - } else { - WARN(true, "unexpected opcode %d\n", opcode); + if (unlikely(wc->status != IB_WC_SUCCESS)) { + pr_info("RDMA_READ for ioctx 0x%p failed with status %d\n", + ioctx, wc->status); + srpt_abort_cmd(ioctx); + return; } + + if (srpt_test_and_set_cmd_state(ioctx, SRPT_STATE_NEED_DATA, + SRPT_STATE_DATA_IN)) + target_execute_cmd(&ioctx->cmd); + else + pr_err("%s[%d]: wrong state = %d\n", __func__, + __LINE__, srpt_get_cmd_state(ioctx)); } -/** - * srpt_handle_rdma_err_comp() - Process an IB RDMA error completion. 
- */ -static void srpt_handle_rdma_err_comp(struct srpt_rdma_ch *ch, - struct srpt_send_ioctx *ioctx, - enum srpt_opcode opcode) +static void srpt_rdma_write_done(struct ib_cq *cq, struct ib_wc *wc) { - enum srpt_command_state state; + struct srpt_send_ioctx *ioctx = + container_of(wc->wr_cqe, struct srpt_send_ioctx, rdma_cqe); - state = srpt_get_cmd_state(ioctx); - switch (opcode) { - case SRPT_RDMA_READ_LAST: - if (ioctx->n_rdma <= 0) { - pr_err("Received invalid RDMA read" - " error completion with idx %d\n", - ioctx->ioctx.index); - break; - } - atomic_add(ioctx->n_rdma, &ch->sq_wr_avail); - if (state == SRPT_STATE_NEED_DATA) - srpt_abort_cmd(ioctx); - else - pr_err("%s[%d]: wrong state = %d\n", - __func__, __LINE__, state); - break; - case SRPT_RDMA_WRITE_LAST: - break; - default: - pr_err("%s[%d]: opcode = %u\n", __func__, __LINE__, opcode); - break; + if (unlikely(wc->status != IB_WC_SUCCESS)) { + pr_info("RDMA_WRITE for ioctx 0x%p failed with status %d\n", + ioctx, wc->status); + srpt_abort_cmd(ioctx); } } @@ -1926,32 +1859,26 @@ out: return; } -static void srpt_process_rcv_completion(struct ib_cq *cq, - struct srpt_rdma_ch *ch, - struct ib_wc *wc) +static void srpt_recv_done(struct ib_cq *cq, struct ib_wc *wc) { - struct srpt_device *sdev = ch->sport->sdev; - struct srpt_recv_ioctx *ioctx; - u32 index; + struct srpt_rdma_ch *ch = cq->cq_context; + struct srpt_recv_ioctx *ioctx = + container_of(wc->wr_cqe, struct srpt_recv_ioctx, ioctx.cqe); - index = idx_from_wr_id(wc->wr_id); if (wc->status == IB_WC_SUCCESS) { int req_lim; req_lim = atomic_dec_return(&ch->req_lim); if (unlikely(req_lim < 0)) pr_err("req_lim = %d < 0\n", req_lim); - ioctx = sdev->ioctx_ring[index]; srpt_handle_new_iu(ch, ioctx, NULL); } else { - pr_info("receiving failed for idx %u with status %d\n", - index, wc->status); + pr_info("receiving failed for ioctx %p with status %d\n", + ioctx, wc->status); } } /** - * srpt_process_send_completion() - Process an IB send completion. - * * Note: Although this has not yet been observed during tests, at least in * theory it is possible that the srpt_get_send_ioctx() call invoked by * srpt_handle_new_iu() fails. This is possible because the req_lim_delta @@ -1964,109 +1891,52 @@ static void srpt_process_rcv_completion(struct ib_cq *cq, * are queued on cmd_wait_list. The code below processes these delayed * requests one at a time. 
*/ -static void srpt_process_send_completion(struct ib_cq *cq, - struct srpt_rdma_ch *ch, - struct ib_wc *wc) +static void srpt_send_done(struct ib_cq *cq, struct ib_wc *wc) { - struct srpt_send_ioctx *send_ioctx; - uint32_t index; - enum srpt_opcode opcode; + struct srpt_rdma_ch *ch = cq->cq_context; + struct srpt_send_ioctx *ioctx = + container_of(wc->wr_cqe, struct srpt_send_ioctx, ioctx.cqe); + enum srpt_command_state state; - index = idx_from_wr_id(wc->wr_id); - opcode = opcode_from_wr_id(wc->wr_id); - send_ioctx = ch->ioctx_ring[index]; - if (wc->status == IB_WC_SUCCESS) { - if (opcode == SRPT_SEND) - srpt_handle_send_comp(ch, send_ioctx); - else { - WARN_ON(opcode != SRPT_RDMA_ABORT && - wc->opcode != IB_WC_RDMA_READ); - srpt_handle_rdma_comp(ch, send_ioctx, opcode); - } + state = srpt_set_cmd_state(ioctx, SRPT_STATE_DONE); + + WARN_ON(state != SRPT_STATE_CMD_RSP_SENT && + state != SRPT_STATE_MGMT_RSP_SENT); + + atomic_inc(&ch->sq_wr_avail); + + if (wc->status != IB_WC_SUCCESS) { + pr_info("sending response for ioctx 0x%p failed" + " with status %d\n", ioctx, wc->status); + + atomic_dec(&ch->req_lim); + srpt_abort_cmd(ioctx); + goto out; + } + + if (state != SRPT_STATE_DONE) { + srpt_unmap_sg_to_ib_sge(ch, ioctx); + transport_generic_free_cmd(&ioctx->cmd, 0); } else { - if (opcode == SRPT_SEND) { - pr_info("sending response for idx %u failed" - " with status %d\n", index, wc->status); - srpt_handle_send_err_comp(ch, wc->wr_id); - } else if (opcode != SRPT_RDMA_MID) { - pr_info("RDMA t %d for idx %u failed with" - " status %d\n", opcode, index, wc->status); - srpt_handle_rdma_err_comp(ch, send_ioctx, opcode); - } + pr_err("IB completion has been received too late for" + " wr_id = %u.\n", ioctx->ioctx.index); } - while (unlikely(opcode == SRPT_SEND - && !list_empty(&ch->cmd_wait_list) - && srpt_get_ch_state(ch) == CH_LIVE - && (send_ioctx = srpt_get_send_ioctx(ch)) != NULL)) { +out: + while (!list_empty(&ch->cmd_wait_list) && + srpt_get_ch_state(ch) == CH_LIVE && + (ioctx = srpt_get_send_ioctx(ch)) != NULL) { struct srpt_recv_ioctx *recv_ioctx; recv_ioctx = list_first_entry(&ch->cmd_wait_list, struct srpt_recv_ioctx, wait_list); list_del(&recv_ioctx->wait_list); - srpt_handle_new_iu(ch, recv_ioctx, send_ioctx); - } -} - -static void srpt_process_completion(struct ib_cq *cq, struct srpt_rdma_ch *ch) -{ - struct ib_wc *const wc = ch->wc; - int i, n; - - WARN_ON(cq != ch->cq); - - ib_req_notify_cq(cq, IB_CQ_NEXT_COMP); - while ((n = ib_poll_cq(cq, ARRAY_SIZE(ch->wc), wc)) > 0) { - for (i = 0; i < n; i++) { - if (opcode_from_wr_id(wc[i].wr_id) == SRPT_RECV) - srpt_process_rcv_completion(cq, ch, &wc[i]); - else - srpt_process_send_completion(cq, ch, &wc[i]); - } + srpt_handle_new_iu(ch, recv_ioctx, ioctx); } } /** - * srpt_completion() - IB completion queue callback function. - * - * Notes: - * - It is guaranteed that a completion handler will never be invoked - * concurrently on two different CPUs for the same completion queue. See also - * Documentation/infiniband/core_locking.txt and the implementation of - * handle_edge_irq() in kernel/irq/chip.c. - * - When threaded IRQs are enabled, completion handlers are invoked in thread - * context instead of interrupt context. - */ -static void srpt_completion(struct ib_cq *cq, void *ctx) -{ - struct srpt_rdma_ch *ch = ctx; - - wake_up_interruptible(&ch->wait_queue); -} - -static int srpt_compl_thread(void *arg) -{ - struct srpt_rdma_ch *ch; - - /* Hibernation / freezing of the SRPT kernel thread is not supported. 
*/ - current->flags |= PF_NOFREEZE; - - ch = arg; - BUG_ON(!ch); - pr_info("Session %s: kernel thread %s (PID %d) started\n", - ch->sess_name, ch->thread->comm, current->pid); - while (!kthread_should_stop()) { - wait_event_interruptible(ch->wait_queue, - (srpt_process_completion(ch->cq, ch), - kthread_should_stop())); - } - pr_info("Session %s: kernel thread %s (PID %d) stopped\n", - ch->sess_name, ch->thread->comm, current->pid); - return 0; -} - -/** * srpt_create_ch_ib() - Create receive and send completion queues. */ static int srpt_create_ch_ib(struct srpt_rdma_ch *ch) @@ -2075,7 +1945,6 @@ static int srpt_create_ch_ib(struct srpt_rdma_ch *ch) struct srpt_port *sport = ch->sport; struct srpt_device *sdev = sport->sdev; u32 srp_sq_size = sport->port_attrib.srp_sq_size; - struct ib_cq_init_attr cq_attr = {}; int ret; WARN_ON(ch->rq_size < 1); @@ -2086,9 +1955,8 @@ static int srpt_create_ch_ib(struct srpt_rdma_ch *ch) goto out; retry: - cq_attr.cqe = ch->rq_size + srp_sq_size; - ch->cq = ib_create_cq(sdev->device, srpt_completion, NULL, ch, - &cq_attr); + ch->cq = ib_alloc_cq(sdev->device, ch, ch->rq_size + srp_sq_size, + 0 /* XXX: spread CQs */, IB_POLL_WORKQUEUE); if (IS_ERR(ch->cq)) { ret = PTR_ERR(ch->cq); pr_err("failed to create CQ cqe= %d ret= %d\n", @@ -2131,18 +1999,6 @@ retry: if (ret) goto err_destroy_qp; - init_waitqueue_head(&ch->wait_queue); - - pr_debug("creating thread for session %s\n", ch->sess_name); - - ch->thread = kthread_run(srpt_compl_thread, ch, "ib_srpt_compl"); - if (IS_ERR(ch->thread)) { - pr_err("failed to create kernel thread %ld\n", - PTR_ERR(ch->thread)); - ch->thread = NULL; - goto err_destroy_qp; - } - out: kfree(qp_init); return ret; @@ -2150,17 +2006,14 @@ out: err_destroy_qp: ib_destroy_qp(ch->qp); err_destroy_cq: - ib_destroy_cq(ch->cq); + ib_free_cq(ch->cq); goto out; } static void srpt_destroy_ch_ib(struct srpt_rdma_ch *ch) { - if (ch->thread) - kthread_stop(ch->thread); - ib_destroy_qp(ch->qp); - ib_destroy_cq(ch->cq); + ib_free_cq(ch->cq); } /** @@ -2808,12 +2661,8 @@ static int srpt_cm_handler(struct ib_cm_id *cm_id, struct ib_cm_event *event) static int srpt_perform_rdmas(struct srpt_rdma_ch *ch, struct srpt_send_ioctx *ioctx) { - struct ib_rdma_wr wr; struct ib_send_wr *bad_wr; - struct rdma_iu *riu; - int i; - int ret; - int sq_wr_avail; + int sq_wr_avail, ret, i; enum dma_data_direction dir; const int n_rdma = ioctx->n_rdma; @@ -2829,59 +2678,32 @@ static int srpt_perform_rdmas(struct srpt_rdma_ch *ch, } } - ioctx->rdma_aborted = false; - ret = 0; - riu = ioctx->rdma_ius; - memset(&wr, 0, sizeof wr); - - for (i = 0; i < n_rdma; ++i, ++riu) { - if (dir == DMA_FROM_DEVICE) { - wr.wr.opcode = IB_WR_RDMA_WRITE; - wr.wr.wr_id = encode_wr_id(i == n_rdma - 1 ? - SRPT_RDMA_WRITE_LAST : - SRPT_RDMA_MID, - ioctx->ioctx.index); - } else { - wr.wr.opcode = IB_WR_RDMA_READ; - wr.wr.wr_id = encode_wr_id(i == n_rdma - 1 ? - SRPT_RDMA_READ_LAST : - SRPT_RDMA_MID, - ioctx->ioctx.index); - } - wr.wr.next = NULL; - wr.remote_addr = riu->raddr; - wr.rkey = riu->rkey; - wr.wr.num_sge = riu->sge_cnt; - wr.wr.sg_list = riu->sge; + for (i = 0; i < n_rdma; i++) { + struct ib_send_wr *wr = &ioctx->rdma_wrs[i].wr; - /* only get completion event for the last rdma write */ - if (i == (n_rdma - 1) && dir == DMA_TO_DEVICE) - wr.wr.send_flags = IB_SEND_SIGNALED; + wr->opcode = (dir == DMA_FROM_DEVICE) ? 
+ IB_WR_RDMA_WRITE : IB_WR_RDMA_READ; - ret = ib_post_send(ch->qp, &wr.wr, &bad_wr); - if (ret) - break; + if (i == n_rdma - 1) { + /* only get completion event for the last rdma read */ + if (dir == DMA_TO_DEVICE) { + wr->send_flags = IB_SEND_SIGNALED; + ioctx->rdma_cqe.done = srpt_rdma_read_done; + } else { + ioctx->rdma_cqe.done = srpt_rdma_write_done; + } + wr->wr_cqe = &ioctx->rdma_cqe; + wr->next = NULL; + } else { + wr->wr_cqe = NULL; + wr->next = &ioctx->rdma_wrs[i + 1].wr; + } } + ret = ib_post_send(ch->qp, &ioctx->rdma_wrs->wr, &bad_wr); if (ret) pr_err("%s[%d]: ib_post_send() returned %d for %d/%d\n", __func__, __LINE__, ret, i, n_rdma); - if (ret && i > 0) { - wr.wr.num_sge = 0; - wr.wr.wr_id = encode_wr_id(SRPT_RDMA_ABORT, ioctx->ioctx.index); - wr.wr.send_flags = IB_SEND_SIGNALED; - while (ch->state == CH_LIVE && - ib_post_send(ch->qp, &wr.wr, &bad_wr) != 0) { - pr_info("Trying to abort failed RDMA transfer [%d]\n", - ioctx->ioctx.index); - msleep(1000); - } - while (ch->state != CH_RELEASING && !ioctx->rdma_aborted) { - pr_info("Waiting until RDMA abort finished [%d]\n", - ioctx->ioctx.index); - msleep(1000); - } - } out: if (unlikely(dir == DMA_TO_DEVICE && ret < 0)) atomic_add(n_rdma, &ch->sq_wr_avail); @@ -3190,14 +3012,11 @@ static void srpt_add_one(struct ib_device *device) init_waitqueue_head(&sdev->ch_releaseQ); spin_lock_init(&sdev->spinlock); - if (ib_query_device(device, &sdev->dev_attr)) - goto free_dev; - sdev->pd = ib_alloc_pd(device); if (IS_ERR(sdev->pd)) goto free_dev; - sdev->srq_size = min(srpt_srq_size, sdev->dev_attr.max_srq_wr); + sdev->srq_size = min(srpt_srq_size, sdev->device->attrs.max_srq_wr); srq_attr.event_handler = srpt_srq_event; srq_attr.srq_context = (void *)sdev; @@ -3211,7 +3030,7 @@ static void srpt_add_one(struct ib_device *device) goto err_pd; pr_debug("%s: create SRQ #wr= %d max_allow=%d dev= %s\n", - __func__, sdev->srq_size, sdev->dev_attr.max_srq_wr, + __func__, sdev->srq_size, sdev->device->attrs.max_srq_wr, device->name); if (!srpt_service_guid) diff --git a/drivers/infiniband/ulp/srpt/ib_srpt.h b/drivers/infiniband/ulp/srpt/ib_srpt.h index 5366e0a9fd6d..09037f2b0b51 100644 --- a/drivers/infiniband/ulp/srpt/ib_srpt.h +++ b/drivers/infiniband/ulp/srpt/ib_srpt.h @@ -128,36 +128,6 @@ enum { DEFAULT_MAX_RDMA_SIZE = 65536, }; -enum srpt_opcode { - SRPT_RECV, - SRPT_SEND, - SRPT_RDMA_MID, - SRPT_RDMA_ABORT, - SRPT_RDMA_READ_LAST, - SRPT_RDMA_WRITE_LAST, -}; - -static inline u64 encode_wr_id(u8 opcode, u32 idx) -{ - return ((u64)opcode << 32) | idx; -} -static inline enum srpt_opcode opcode_from_wr_id(u64 wr_id) -{ - return wr_id >> 32; -} -static inline u32 idx_from_wr_id(u64 wr_id) -{ - return (u32)wr_id; -} - -struct rdma_iu { - u64 raddr; - u32 rkey; - struct ib_sge *sge; - u32 sge_cnt; - int mem_id; -}; - /** * enum srpt_command_state - SCSI command state managed by SRPT. * @SRPT_STATE_NEW: New command arrived and is being processed. @@ -189,6 +159,7 @@ enum srpt_command_state { * @index: Index of the I/O context in its ioctx_ring array. */ struct srpt_ioctx { + struct ib_cqe cqe; void *buf; dma_addr_t dma; uint32_t index; @@ -215,32 +186,30 @@ struct srpt_recv_ioctx { * @sg: Pointer to sg-list associated with this I/O context. * @sg_cnt: SG-list size. * @mapped_sg_count: ib_dma_map_sg() return value. - * @n_rdma_ius: Number of elements in the rdma_ius array. - * @rdma_ius: Array with information about the RDMA mapping. + * @n_rdma_wrs: Number of elements in the rdma_wrs array. 
+ * @rdma_wrs: Array with information about the RDMA mapping. * @tag: Tag of the received SRP information unit. * @spinlock: Protects 'state'. * @state: I/O context state. - * @rdma_aborted: If initiating a multipart RDMA transfer failed, whether - * the already initiated transfers have finished. * @cmd: Target core command data structure. * @sense_data: SCSI sense data. */ struct srpt_send_ioctx { struct srpt_ioctx ioctx; struct srpt_rdma_ch *ch; - struct rdma_iu *rdma_ius; + struct ib_rdma_wr *rdma_wrs; + struct ib_cqe rdma_cqe; struct srp_direct_buf *rbufs; struct srp_direct_buf single_rbuf; struct scatterlist *sg; struct list_head free_list; spinlock_t spinlock; enum srpt_command_state state; - bool rdma_aborted; struct se_cmd cmd; struct completion tx_done; int sg_cnt; int mapped_sg_count; - u16 n_rdma_ius; + u16 n_rdma_wrs; u8 n_rdma; u8 n_rbuf; bool queue_status_only; @@ -267,9 +236,6 @@ enum rdma_ch_state { /** * struct srpt_rdma_ch - RDMA channel. - * @wait_queue: Allows the kernel thread to wait for more work. - * @thread: Kernel thread that processes the IB queues associated with - * the channel. * @cm_id: IB CM ID associated with the channel. * @qp: IB queue pair used for communicating over this channel. * @cq: IB completion queue for this channel. @@ -288,7 +254,6 @@ enum rdma_ch_state { * @free_list: Head of list with free send I/O contexts. * @state: channel state. See also enum rdma_ch_state. * @ioctx_ring: Send ring. - * @wc: IB work completion array for srpt_process_completion(). * @list: Node for insertion in the srpt_device.rch_list list. * @cmd_wait_list: List of SCSI commands that arrived before the RTU event. This * list contains struct srpt_ioctx elements and is protected @@ -299,8 +264,6 @@ enum rdma_ch_state { * @release_done: Enables waiting for srpt_release_channel() completion. */ struct srpt_rdma_ch { - wait_queue_head_t wait_queue; - struct task_struct *thread; struct ib_cm_id *cm_id; struct ib_qp *qp; struct ib_cq *cq; @@ -317,7 +280,6 @@ struct srpt_rdma_ch { struct list_head free_list; enum rdma_ch_state state; struct srpt_send_ioctx **ioctx_ring; - struct ib_wc wc[16]; struct list_head list; struct list_head cmd_wait_list; struct se_session *sess; @@ -377,8 +339,6 @@ struct srpt_port { * @mr: L_Key (local key) with write access to all local memory. * @srq: Per-HCA SRQ (shared receive queue). * @cm_id: Connection identifier. - * @dev_attr: Attributes of the InfiniBand device as obtained during the - * ib_client.add() callback. * @srq_size: SRQ size. * @ioctx_ring: Per-HCA SRQ. * @rch_list: Per-device channel list -- see also srpt_rdma_ch.list. 
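The ib_srpt changes above are a conversion to the ib_alloc_cq()/ib_cqe completion model: instead of encoding an opcode and ring index into the 64-bit wr_id, a struct ib_cqe is embedded in the per-request context, each posted work request points at it through wr_cqe, and the core invokes its .done callback when the completion arrives. A minimal sketch of that pattern against the 4.5-era verbs API follows; my_request, my_send_done and post_my_send are illustrative names, not part of the patch.

#include <rdma/ib_verbs.h>

struct my_request {
	struct ib_cqe cqe;	/* replaces the old encode_wr_id() cookie */
	/* per-request state goes here */
};

static void my_send_done(struct ib_cq *cq, struct ib_wc *wc)
{
	/* recover the request that owns this completion */
	struct my_request *req =
		container_of(wc->wr_cqe, struct my_request, cqe);

	if (wc->status != IB_WC_SUCCESS)
		pr_info("send for req %p failed with status %d\n",
			req, wc->status);
	/* ...complete or abort the request... */
}

static int post_my_send(struct ib_qp *qp, struct my_request *req,
			struct ib_sge *sge)
{
	struct ib_send_wr wr = {}, *bad_wr;

	req->cqe.done = my_send_done;	/* run from the CQ's poll context */
	wr.wr_cqe = &req->cqe;
	wr.opcode = IB_WR_SEND;
	wr.send_flags = IB_SEND_SIGNALED;
	wr.sg_list = sge;
	wr.num_sge = 1;

	return ib_post_send(qp, &wr, &bad_wr);
}

Because the patch allocates the CQ with IB_POLL_WORKQUEUE, the .done callbacks run from workqueue context, which is what lets it delete the per-channel completion kthread and its wait queue.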
@@ -393,7 +353,6 @@ struct srpt_device { struct ib_pd *pd; struct ib_srq *srq; struct ib_cm_id *cm_id; - struct ib_device_attr dev_attr; int srq_size; struct srpt_recv_ioctx **ioctx_ring; struct list_head rch_list; diff --git a/drivers/input/joystick/xpad.c b/drivers/input/joystick/xpad.c index fd4100d56d8c..6727954ab74b 100644 --- a/drivers/input/joystick/xpad.c +++ b/drivers/input/joystick/xpad.c @@ -76,10 +76,13 @@ */ #include <linux/kernel.h> +#include <linux/input.h> +#include <linux/rcupdate.h> #include <linux/slab.h> #include <linux/stat.h> #include <linux/module.h> #include <linux/usb/input.h> +#include <linux/usb/quirks.h> #define DRIVER_AUTHOR "Marko Friedemann <mfr@bmx-chemnitz.de>" #define DRIVER_DESC "X-Box pad driver" @@ -125,7 +128,7 @@ static const struct xpad_device { { 0x045e, 0x0289, "Microsoft X-Box pad v2 (US)", 0, XTYPE_XBOX }, { 0x045e, 0x028e, "Microsoft X-Box 360 pad", 0, XTYPE_XBOX360 }, { 0x045e, 0x02d1, "Microsoft X-Box One pad", 0, XTYPE_XBOXONE }, - { 0x045e, 0x02dd, "Microsoft X-Box One pad (Covert Forces)", 0, XTYPE_XBOXONE }, + { 0x045e, 0x02dd, "Microsoft X-Box One pad (Firmware 2015)", 0, XTYPE_XBOXONE }, { 0x045e, 0x0291, "Xbox 360 Wireless Receiver (XBOX)", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX360W }, { 0x045e, 0x0719, "Xbox 360 Wireless Receiver", MAP_DPAD_TO_BUTTONS, XTYPE_XBOX360W }, { 0x044f, 0x0f07, "Thrustmaster, Inc. Controller", 0, XTYPE_XBOX }, @@ -317,21 +320,42 @@ static struct usb_device_id xpad_table[] = { MODULE_DEVICE_TABLE(usb, xpad_table); +struct xpad_output_packet { + u8 data[XPAD_PKT_LEN]; + u8 len; + bool pending; +}; + +#define XPAD_OUT_CMD_IDX 0 +#define XPAD_OUT_FF_IDX 1 +#define XPAD_OUT_LED_IDX (1 + IS_ENABLED(CONFIG_JOYSTICK_XPAD_FF)) +#define XPAD_NUM_OUT_PACKETS (1 + \ + IS_ENABLED(CONFIG_JOYSTICK_XPAD_FF) + \ + IS_ENABLED(CONFIG_JOYSTICK_XPAD_LEDS)) + struct usb_xpad { struct input_dev *dev; /* input device interface */ + struct input_dev __rcu *x360w_dev; struct usb_device *udev; /* usb device */ struct usb_interface *intf; /* usb interface */ - int pad_present; + bool pad_present; + bool input_created; struct urb *irq_in; /* urb for interrupt in report */ unsigned char *idata; /* input data */ dma_addr_t idata_dma; struct urb *irq_out; /* urb for interrupt out report */ + struct usb_anchor irq_out_anchor; + bool irq_out_active; /* we must not use an active URB */ + u8 odata_serial; /* serial number for xbox one protocol */ unsigned char *odata; /* output data */ dma_addr_t odata_dma; - struct mutex odata_mutex; + spinlock_t odata_lock; + + struct xpad_output_packet out_packets[XPAD_NUM_OUT_PACKETS]; + int last_out_packet; #if defined(CONFIG_JOYSTICK_XPAD_LEDS) struct xpad_led *led; @@ -343,8 +367,12 @@ struct usb_xpad { int xtype; /* type of xbox device */ int pad_nr; /* the order x360 pads were attached */ const char *name; /* name of the device */ + struct work_struct work; /* init/remove device from callback */ }; +static int xpad_init_input(struct usb_xpad *xpad); +static void xpad_deinit_input(struct usb_xpad *xpad); + /* * xpad_process_packet * @@ -424,11 +452,9 @@ static void xpad_process_packet(struct usb_xpad *xpad, u16 cmd, unsigned char *d * http://www.free60.org/wiki/Gamepad */ -static void xpad360_process_packet(struct usb_xpad *xpad, +static void xpad360_process_packet(struct usb_xpad *xpad, struct input_dev *dev, u16 cmd, unsigned char *data) { - struct input_dev *dev = xpad->dev; - /* digital pad */ if (xpad->mapping & MAP_DPAD_TO_BUTTONS) { /* dpad as buttons (left, right, up, down) */ @@ -495,7 +521,30 @@ 
static void xpad360_process_packet(struct usb_xpad *xpad, input_sync(dev); } -static void xpad_identify_controller(struct usb_xpad *xpad); +static void xpad_presence_work(struct work_struct *work) +{ + struct usb_xpad *xpad = container_of(work, struct usb_xpad, work); + int error; + + if (xpad->pad_present) { + error = xpad_init_input(xpad); + if (error) { + /* complain only, not much else we can do here */ + dev_err(&xpad->dev->dev, + "unable to init device: %d\n", error); + } else { + rcu_assign_pointer(xpad->x360w_dev, xpad->dev); + } + } else { + RCU_INIT_POINTER(xpad->x360w_dev, NULL); + synchronize_rcu(); + /* + * Now that we are sure xpad360w_process_packet is not + * using input device we can get rid of it. + */ + xpad_deinit_input(xpad); + } +} /* * xpad360w_process_packet @@ -513,24 +562,28 @@ static void xpad_identify_controller(struct usb_xpad *xpad); */ static void xpad360w_process_packet(struct usb_xpad *xpad, u16 cmd, unsigned char *data) { + struct input_dev *dev; + bool present; + /* Presence change */ if (data[0] & 0x08) { - if (data[1] & 0x80) { - xpad->pad_present = 1; - /* - * Light up the segment corresponding to - * controller number. - */ - xpad_identify_controller(xpad); - } else - xpad->pad_present = 0; + present = (data[1] & 0x80) != 0; + + if (xpad->pad_present != present) { + xpad->pad_present = present; + schedule_work(&xpad->work); + } } /* Valid pad data */ - if (!(data[1] & 0x1)) + if (data[1] != 0x1) return; - xpad360_process_packet(xpad, cmd, &data[4]); + rcu_read_lock(); + dev = rcu_dereference(xpad->x360w_dev); + if (dev) + xpad360_process_packet(xpad, dev, cmd, &data[4]); + rcu_read_unlock(); } /* @@ -659,7 +712,7 @@ static void xpad_irq_in(struct urb *urb) switch (xpad->xtype) { case XTYPE_XBOX360: - xpad360_process_packet(xpad, 0, xpad->idata); + xpad360_process_packet(xpad, xpad->dev, 0, xpad->idata); break; case XTYPE_XBOX360W: xpad360w_process_packet(xpad, 0, xpad->idata); @@ -678,18 +731,73 @@ exit: __func__, retval); } +/* Callers must hold xpad->odata_lock spinlock */ +static bool xpad_prepare_next_out_packet(struct usb_xpad *xpad) +{ + struct xpad_output_packet *pkt, *packet = NULL; + int i; + + for (i = 0; i < XPAD_NUM_OUT_PACKETS; i++) { + if (++xpad->last_out_packet >= XPAD_NUM_OUT_PACKETS) + xpad->last_out_packet = 0; + + pkt = &xpad->out_packets[xpad->last_out_packet]; + if (pkt->pending) { + dev_dbg(&xpad->intf->dev, + "%s - found pending output packet %d\n", + __func__, xpad->last_out_packet); + packet = pkt; + break; + } + } + + if (packet) { + memcpy(xpad->odata, packet->data, packet->len); + xpad->irq_out->transfer_buffer_length = packet->len; + return true; + } + + return false; +} + +/* Callers must hold xpad->odata_lock spinlock */ +static int xpad_try_sending_next_out_packet(struct usb_xpad *xpad) +{ + int error; + + if (!xpad->irq_out_active && xpad_prepare_next_out_packet(xpad)) { + usb_anchor_urb(xpad->irq_out, &xpad->irq_out_anchor); + error = usb_submit_urb(xpad->irq_out, GFP_ATOMIC); + if (error) { + dev_err(&xpad->intf->dev, + "%s - usb_submit_urb failed with result %d\n", + __func__, error); + usb_unanchor_urb(xpad->irq_out); + return -EIO; + } + + xpad->irq_out_active = true; + } + + return 0; +} + static void xpad_irq_out(struct urb *urb) { struct usb_xpad *xpad = urb->context; struct device *dev = &xpad->intf->dev; - int retval, status; + int status = urb->status; + int error; + unsigned long flags; - status = urb->status; + spin_lock_irqsave(&xpad->odata_lock, flags); switch (status) { case 0: /* success */ - return; 
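+ /* the just-transmitted slot is free again; stage the next pending packet */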
+ xpad->out_packets[xpad->last_out_packet].pending = false; + xpad->irq_out_active = xpad_prepare_next_out_packet(xpad); + break; case -ECONNRESET: case -ENOENT: @@ -697,19 +805,28 @@ static void xpad_irq_out(struct urb *urb) /* this urb is terminated, clean up */ dev_dbg(dev, "%s - urb shutting down with status: %d\n", __func__, status); - return; + xpad->irq_out_active = false; + break; default: dev_dbg(dev, "%s - nonzero urb status received: %d\n", __func__, status); - goto exit; + break; } -exit: - retval = usb_submit_urb(urb, GFP_ATOMIC); - if (retval) - dev_err(dev, "%s - usb_submit_urb failed with result %d\n", - __func__, retval); + if (xpad->irq_out_active) { + usb_anchor_urb(urb, &xpad->irq_out_anchor); + error = usb_submit_urb(urb, GFP_ATOMIC); + if (error) { + dev_err(dev, + "%s - usb_submit_urb failed with result %d\n", + __func__, error); + usb_unanchor_urb(urb); + xpad->irq_out_active = false; + } + } + + spin_unlock_irqrestore(&xpad->odata_lock, flags); } static int xpad_init_output(struct usb_interface *intf, struct usb_xpad *xpad) @@ -721,6 +838,8 @@ static int xpad_init_output(struct usb_interface *intf, struct usb_xpad *xpad) if (xpad->xtype == XTYPE_UNKNOWN) return 0; + init_usb_anchor(&xpad->irq_out_anchor); + xpad->odata = usb_alloc_coherent(xpad->udev, XPAD_PKT_LEN, GFP_KERNEL, &xpad->odata_dma); if (!xpad->odata) { @@ -728,7 +847,7 @@ static int xpad_init_output(struct usb_interface *intf, struct usb_xpad *xpad) goto fail1; } - mutex_init(&xpad->odata_mutex); + spin_lock_init(&xpad->odata_lock); xpad->irq_out = usb_alloc_urb(0, GFP_KERNEL); if (!xpad->irq_out) { @@ -755,8 +874,14 @@ static int xpad_init_output(struct usb_interface *intf, struct usb_xpad *xpad) static void xpad_stop_output(struct usb_xpad *xpad) { - if (xpad->xtype != XTYPE_UNKNOWN) - usb_kill_urb(xpad->irq_out); + if (xpad->xtype != XTYPE_UNKNOWN) { + if (!usb_wait_anchor_empty_timeout(&xpad->irq_out_anchor, + 5000)) { + dev_warn(&xpad->intf->dev, + "timed out waiting for output URB to complete, killing\n"); + usb_kill_anchored_urbs(&xpad->irq_out_anchor); + } + } } static void xpad_deinit_output(struct usb_xpad *xpad) @@ -770,27 +895,60 @@ static void xpad_deinit_output(struct usb_xpad *xpad) static int xpad_inquiry_pad_presence(struct usb_xpad *xpad) { + struct xpad_output_packet *packet = + &xpad->out_packets[XPAD_OUT_CMD_IDX]; + unsigned long flags; int retval; - mutex_lock(&xpad->odata_mutex); + spin_lock_irqsave(&xpad->odata_lock, flags); + + packet->data[0] = 0x08; + packet->data[1] = 0x00; + packet->data[2] = 0x0F; + packet->data[3] = 0xC0; + packet->data[4] = 0x00; + packet->data[5] = 0x00; + packet->data[6] = 0x00; + packet->data[7] = 0x00; + packet->data[8] = 0x00; + packet->data[9] = 0x00; + packet->data[10] = 0x00; + packet->data[11] = 0x00; + packet->len = 12; + packet->pending = true; + + /* Reset the sequence so we send out presence first */ + xpad->last_out_packet = -1; + retval = xpad_try_sending_next_out_packet(xpad); + + spin_unlock_irqrestore(&xpad->odata_lock, flags); - xpad->odata[0] = 0x08; - xpad->odata[1] = 0x00; - xpad->odata[2] = 0x0F; - xpad->odata[3] = 0xC0; - xpad->odata[4] = 0x00; - xpad->odata[5] = 0x00; - xpad->odata[6] = 0x00; - xpad->odata[7] = 0x00; - xpad->odata[8] = 0x00; - xpad->odata[9] = 0x00; - xpad->odata[10] = 0x00; - xpad->odata[11] = 0x00; - xpad->irq_out->transfer_buffer_length = 12; + return retval; +} + +static int xpad_start_xbox_one(struct usb_xpad *xpad) +{ + struct xpad_output_packet *packet = + &xpad->out_packets[XPAD_OUT_CMD_IDX]; + unsigned 
long flags; + int retval; - retval = usb_submit_urb(xpad->irq_out, GFP_KERNEL); + spin_lock_irqsave(&xpad->odata_lock, flags); - mutex_unlock(&xpad->odata_mutex); + /* Xbox one controller needs to be initialized. */ + packet->data[0] = 0x05; + packet->data[1] = 0x20; + packet->data[2] = xpad->odata_serial++; /* packet serial */ + packet->data[3] = 0x01; /* rumble bit enable? */ + packet->data[4] = 0x00; + packet->len = 5; + packet->pending = true; + + /* Reset the sequence so we send out start packet first */ + xpad->last_out_packet = -1; + retval = xpad_try_sending_next_out_packet(xpad); + + spin_unlock_irqrestore(&xpad->odata_lock, flags); return retval; } @@ -799,8 +957,11 @@ static int xpad_inquiry_pad_presence(struct usb_xpad *xpad) static int xpad_play_effect(struct input_dev *dev, void *data, struct ff_effect *effect) { struct usb_xpad *xpad = input_get_drvdata(dev); + struct xpad_output_packet *packet = &xpad->out_packets[XPAD_OUT_FF_IDX]; __u16 strong; __u16 weak; + int retval; + unsigned long flags; if (effect->type != FF_RUMBLE) return 0; @@ -808,69 +969,81 @@ static int xpad_play_effect(struct input_dev *dev, void *data, struct ff_effect strong = effect->u.rumble.strong_magnitude; weak = effect->u.rumble.weak_magnitude; + spin_lock_irqsave(&xpad->odata_lock, flags); + switch (xpad->xtype) { case XTYPE_XBOX: - xpad->odata[0] = 0x00; - xpad->odata[1] = 0x06; - xpad->odata[2] = 0x00; - xpad->odata[3] = strong / 256; /* left actuator */ - xpad->odata[4] = 0x00; - xpad->odata[5] = weak / 256; /* right actuator */ - xpad->irq_out->transfer_buffer_length = 6; + packet->data[0] = 0x00; + packet->data[1] = 0x06; + packet->data[2] = 0x00; + packet->data[3] = strong / 256; /* left actuator */ + packet->data[4] = 0x00; + packet->data[5] = weak / 256; /* right actuator */ + packet->len = 6; + packet->pending = true; break; case XTYPE_XBOX360: - xpad->odata[0] = 0x00; - xpad->odata[1] = 0x08; - xpad->odata[2] = 0x00; - xpad->odata[3] = strong / 256; /* left actuator? */ - xpad->odata[4] = weak / 256; /* right actuator? */ - xpad->odata[5] = 0x00; - xpad->odata[6] = 0x00; - xpad->odata[7] = 0x00; - xpad->irq_out->transfer_buffer_length = 8; + packet->data[0] = 0x00; + packet->data[1] = 0x08; + packet->data[2] = 0x00; + packet->data[3] = strong / 256; /* left actuator? */ + packet->data[4] = weak / 256; /* right actuator? 
*/ + packet->data[5] = 0x00; + packet->data[6] = 0x00; + packet->data[7] = 0x00; + packet->len = 8; + packet->pending = true; break; case XTYPE_XBOX360W: - xpad->odata[0] = 0x00; - xpad->odata[1] = 0x01; - xpad->odata[2] = 0x0F; - xpad->odata[3] = 0xC0; - xpad->odata[4] = 0x00; - xpad->odata[5] = strong / 256; - xpad->odata[6] = weak / 256; - xpad->odata[7] = 0x00; - xpad->odata[8] = 0x00; - xpad->odata[9] = 0x00; - xpad->odata[10] = 0x00; - xpad->odata[11] = 0x00; - xpad->irq_out->transfer_buffer_length = 12; + packet->data[0] = 0x00; + packet->data[1] = 0x01; + packet->data[2] = 0x0F; + packet->data[3] = 0xC0; + packet->data[4] = 0x00; + packet->data[5] = strong / 256; + packet->data[6] = weak / 256; + packet->data[7] = 0x00; + packet->data[8] = 0x00; + packet->data[9] = 0x00; + packet->data[10] = 0x00; + packet->data[11] = 0x00; + packet->len = 12; + packet->pending = true; break; case XTYPE_XBOXONE: - xpad->odata[0] = 0x09; /* activate rumble */ - xpad->odata[1] = 0x08; - xpad->odata[2] = 0x00; - xpad->odata[3] = 0x08; /* continuous effect */ - xpad->odata[4] = 0x00; /* simple rumble mode */ - xpad->odata[5] = 0x03; /* L and R actuator only */ - xpad->odata[6] = 0x00; /* TODO: LT actuator */ - xpad->odata[7] = 0x00; /* TODO: RT actuator */ - xpad->odata[8] = strong / 256; /* left actuator */ - xpad->odata[9] = weak / 256; /* right actuator */ - xpad->odata[10] = 0x80; /* length of pulse */ - xpad->odata[11] = 0x00; /* stop period of pulse */ - xpad->irq_out->transfer_buffer_length = 12; + packet->data[0] = 0x09; /* activate rumble */ + packet->data[1] = 0x08; + packet->data[2] = xpad->odata_serial++; + packet->data[3] = 0x08; /* continuous effect */ + packet->data[4] = 0x00; /* simple rumble mode */ + packet->data[5] = 0x03; /* L and R actuator only */ + packet->data[6] = 0x00; /* TODO: LT actuator */ + packet->data[7] = 0x00; /* TODO: RT actuator */ + packet->data[8] = strong / 512; /* left actuator */ + packet->data[9] = weak / 512; /* right actuator */ + packet->data[10] = 0x80; /* length of pulse */ + packet->data[11] = 0x00; /* stop period of pulse */ + packet->data[12] = 0x00; + packet->len = 13; + packet->pending = true; break; default: dev_dbg(&xpad->dev->dev, "%s - rumble command sent to unsupported xpad type: %d\n", __func__, xpad->xtype); - return -EINVAL; + retval = -EINVAL; + goto out; } - return usb_submit_urb(xpad->irq_out, GFP_ATOMIC); + retval = xpad_try_sending_next_out_packet(xpad); + +out: + spin_unlock_irqrestore(&xpad->odata_lock, flags); + return retval; } static int xpad_init_ff(struct usb_xpad *xpad) @@ -921,36 +1094,44 @@ struct xpad_led { */ static void xpad_send_led_command(struct usb_xpad *xpad, int command) { + struct xpad_output_packet *packet = + &xpad->out_packets[XPAD_OUT_LED_IDX]; + unsigned long flags; + command %= 16; - mutex_lock(&xpad->odata_mutex); + spin_lock_irqsave(&xpad->odata_lock, flags); switch (xpad->xtype) { case XTYPE_XBOX360: - xpad->odata[0] = 0x01; - xpad->odata[1] = 0x03; - xpad->odata[2] = command; - xpad->irq_out->transfer_buffer_length = 3; + packet->data[0] = 0x01; + packet->data[1] = 0x03; + packet->data[2] = command; + packet->len = 3; + packet->pending = true; break; + case XTYPE_XBOX360W: - xpad->odata[0] = 0x00; - xpad->odata[1] = 0x00; - xpad->odata[2] = 0x08; - xpad->odata[3] = 0x40 + command; - xpad->odata[4] = 0x00; - xpad->odata[5] = 0x00; - xpad->odata[6] = 0x00; - xpad->odata[7] = 0x00; - xpad->odata[8] = 0x00; - xpad->odata[9] = 0x00; - xpad->odata[10] = 0x00; - xpad->odata[11] = 0x00; - 
xpad->irq_out->transfer_buffer_length = 12; + packet->data[0] = 0x00; + packet->data[1] = 0x00; + packet->data[2] = 0x08; + packet->data[3] = 0x40 + command; + packet->data[4] = 0x00; + packet->data[5] = 0x00; + packet->data[6] = 0x00; + packet->data[7] = 0x00; + packet->data[8] = 0x00; + packet->data[9] = 0x00; + packet->data[10] = 0x00; + packet->data[11] = 0x00; + packet->len = 12; + packet->pending = true; break; } - usb_submit_urb(xpad->irq_out, GFP_KERNEL); - mutex_unlock(&xpad->odata_mutex); + xpad_try_sending_next_out_packet(xpad); + + spin_unlock_irqrestore(&xpad->odata_lock, flags); } /* @@ -959,7 +1140,7 @@ static void xpad_send_led_command(struct usb_xpad *xpad, int command) */ static void xpad_identify_controller(struct usb_xpad *xpad) { - xpad_send_led_command(xpad, (xpad->pad_nr % 4) + 2); + led_set_brightness(&xpad->led->led_cdev, (xpad->pad_nr % 4) + 2); } static void xpad_led_set(struct led_classdev *led_cdev, @@ -1001,14 +1182,7 @@ static int xpad_led_probe(struct usb_xpad *xpad) if (error) goto err_free_id; - if (xpad->xtype == XTYPE_XBOX360) { - /* - * Light up the segment corresponding to controller - * number on wired devices. On wireless we'll do that - * when they respond to "presence" packet. - */ - xpad_identify_controller(xpad); - } + xpad_identify_controller(xpad); return 0; @@ -1036,37 +1210,73 @@ static void xpad_led_disconnect(struct usb_xpad *xpad) { } static void xpad_identify_controller(struct usb_xpad *xpad) { } #endif -static int xpad_open(struct input_dev *dev) +static int xpad_start_input(struct usb_xpad *xpad) { - struct usb_xpad *xpad = input_get_drvdata(dev); - - /* URB was submitted in probe */ - if (xpad->xtype == XTYPE_XBOX360W) - return 0; + int error; - xpad->irq_in->dev = xpad->udev; if (usb_submit_urb(xpad->irq_in, GFP_KERNEL)) return -EIO; if (xpad->xtype == XTYPE_XBOXONE) { - /* Xbox one controller needs to be initialized. */ - xpad->odata[0] = 0x05; - xpad->odata[1] = 0x20; - xpad->irq_out->transfer_buffer_length = 2; - return usb_submit_urb(xpad->irq_out, GFP_KERNEL); + error = xpad_start_xbox_one(xpad); + if (error) { + usb_kill_urb(xpad->irq_in); + return error; + } } return 0; } -static void xpad_close(struct input_dev *dev) +static void xpad_stop_input(struct usb_xpad *xpad) { - struct usb_xpad *xpad = input_get_drvdata(dev); + usb_kill_urb(xpad->irq_in); +} + +static int xpad360w_start_input(struct usb_xpad *xpad) +{ + int error; - if (xpad->xtype != XTYPE_XBOX360W) + error = usb_submit_urb(xpad->irq_in, GFP_KERNEL); + if (error) + return -EIO; + + /* + * Send presence packet. + * This will force the controller to resend connection packets. + * This is useful in the case we activate the module after the + * adapter has been plugged in, as it won't automatically + * send us info about the controllers. 
+ */ + error = xpad_inquiry_pad_presence(xpad); + if (error) { usb_kill_urb(xpad->irq_in); + return error; + } - xpad_stop_output(xpad); + return 0; +} + +static void xpad360w_stop_input(struct usb_xpad *xpad) +{ + usb_kill_urb(xpad->irq_in); + + /* Make sure we are done with presence work if it was scheduled */ + flush_work(&xpad->work); +} + +static int xpad_open(struct input_dev *dev) +{ + struct usb_xpad *xpad = input_get_drvdata(dev); + + return xpad_start_input(xpad); +} + +static void xpad_close(struct input_dev *dev) +{ + struct usb_xpad *xpad = input_get_drvdata(dev); + + xpad_stop_input(xpad); } static void xpad_set_up_abs(struct input_dev *input_dev, signed short abs) @@ -1097,8 +1307,11 @@ static void xpad_set_up_abs(struct input_dev *input_dev, signed short abs) static void xpad_deinit_input(struct usb_xpad *xpad) { - xpad_led_disconnect(xpad); - input_unregister_device(xpad->dev); + if (xpad->input_created) { + xpad->input_created = false; + xpad_led_disconnect(xpad); + input_unregister_device(xpad->dev); + } } static int xpad_init_input(struct usb_xpad *xpad) @@ -1118,8 +1331,10 @@ static int xpad_init_input(struct usb_xpad *xpad) input_set_drvdata(input_dev, xpad); - input_dev->open = xpad_open; - input_dev->close = xpad_close; + if (xpad->xtype != XTYPE_XBOX360W) { + input_dev->open = xpad_open; + input_dev->close = xpad_close; + } __set_bit(EV_KEY, input_dev->evbit); @@ -1181,6 +1396,7 @@ static int xpad_init_input(struct usb_xpad *xpad) if (error) goto err_disconnect_led; + xpad->input_created = true; return 0; err_disconnect_led: @@ -1241,6 +1457,7 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id xpad->mapping = xpad_device[i].mapping; xpad->xtype = xpad_device[i].xtype; xpad->name = xpad_device[i].name; + INIT_WORK(&xpad->work, xpad_presence_work); if (xpad->xtype == XTYPE_UNKNOWN) { if (intf->cur_altsetting->desc.bInterfaceClass == USB_CLASS_VENDOR_SPEC) { @@ -1277,10 +1494,6 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id usb_set_intfdata(intf, xpad); - error = xpad_init_input(xpad); - if (error) - goto err_deinit_output; - if (xpad->xtype == XTYPE_XBOX360W) { /* * Submit the int URB immediately rather than waiting for open @@ -1289,28 +1502,24 @@ static int xpad_probe(struct usb_interface *intf, const struct usb_device_id *id * exactly the message that a controller has arrived that * we're waiting for. */ - xpad->irq_in->dev = xpad->udev; - error = usb_submit_urb(xpad->irq_in, GFP_KERNEL); + error = xpad360w_start_input(xpad); if (error) - goto err_deinit_input; - + goto err_deinit_output; /* - * Send presence packet. - * This will force the controller to resend connection packets. - * This is useful in the case we activate the module after the - * adapter has been plugged in, as it won't automatically - * send us info about the controllers. + * Wireless controllers require RESET_RESUME to work properly + * after suspend. Ideally this quirk should be in usb core + * quirk list, but we have too many vendors producing these + * controllers and we'd need to maintain 2 identical lists + * here in this driver and in usb core. 
*/ - error = xpad_inquiry_pad_presence(xpad); + udev->quirks |= USB_QUIRK_RESET_RESUME; + } else { + error = xpad_init_input(xpad); if (error) - goto err_kill_in_urb; + goto err_deinit_output; } return 0; -err_kill_in_urb: - usb_kill_urb(xpad->irq_in); -err_deinit_input: - xpad_deinit_input(xpad); err_deinit_output: xpad_deinit_output(xpad); err_free_in_urb: @@ -1320,19 +1529,24 @@ err_free_idata: err_free_mem: kfree(xpad); return error; - } static void xpad_disconnect(struct usb_interface *intf) { - struct usb_xpad *xpad = usb_get_intfdata (intf); + struct usb_xpad *xpad = usb_get_intfdata(intf); + + if (xpad->xtype == XTYPE_XBOX360W) + xpad360w_stop_input(xpad); xpad_deinit_input(xpad); - xpad_deinit_output(xpad); - if (xpad->xtype == XTYPE_XBOX360W) { - usb_kill_urb(xpad->irq_in); - } + /* + * Now that both input device and LED device are gone we can + * stop output URB. + */ + xpad_stop_output(xpad); + + xpad_deinit_output(xpad); usb_free_urb(xpad->irq_in); usb_free_coherent(xpad->udev, XPAD_PKT_LEN, @@ -1343,10 +1557,55 @@ static void xpad_disconnect(struct usb_interface *intf) usb_set_intfdata(intf, NULL); } +static int xpad_suspend(struct usb_interface *intf, pm_message_t message) +{ + struct usb_xpad *xpad = usb_get_intfdata(intf); + struct input_dev *input = xpad->dev; + + if (xpad->xtype == XTYPE_XBOX360W) { + /* + * Wireless controllers always listen to input so + * they are notified when controller shows up + * or goes away. + */ + xpad360w_stop_input(xpad); + } else { + mutex_lock(&input->mutex); + if (input->users) + xpad_stop_input(xpad); + mutex_unlock(&input->mutex); + } + + xpad_stop_output(xpad); + + return 0; +} + +static int xpad_resume(struct usb_interface *intf) +{ + struct usb_xpad *xpad = usb_get_intfdata(intf); + struct input_dev *input = xpad->dev; + int retval = 0; + + if (xpad->xtype == XTYPE_XBOX360W) { + retval = xpad360w_start_input(xpad); + } else { + mutex_lock(&input->mutex); + if (input->users) + retval = xpad_start_input(xpad); + mutex_unlock(&input->mutex); + } + + return retval; +} + static struct usb_driver xpad_driver = { .name = "xpad", .probe = xpad_probe, .disconnect = xpad_disconnect, + .suspend = xpad_suspend, + .resume = xpad_resume, + .reset_resume = xpad_resume, .id_table = xpad_table, }; diff --git a/drivers/input/keyboard/gpio_keys.c b/drivers/input/keyboard/gpio_keys.c index b9f01bd1b7ef..29093657f2ef 100644 --- a/drivers/input/keyboard/gpio_keys.c +++ b/drivers/input/keyboard/gpio_keys.c @@ -630,7 +630,7 @@ gpio_keys_get_devtree_pdata(struct device *dev) if (!node) return ERR_PTR(-ENODEV); - nbuttons = of_get_child_count(node); + nbuttons = of_get_available_child_count(node); if (nbuttons == 0) return ERR_PTR(-ENODEV); @@ -645,8 +645,10 @@ gpio_keys_get_devtree_pdata(struct device *dev) pdata->rep = !!of_get_property(node, "autorepeat", NULL); + of_property_read_string(node, "label", &pdata->name); + i = 0; - for_each_child_of_node(node, pp) { + for_each_available_child_of_node(node, pp) { enum of_gpio_flags flags; button = &pdata->buttons[i++]; diff --git a/drivers/input/touchscreen/atmel_mxt_ts.c b/drivers/input/touchscreen/atmel_mxt_ts.c index 2d5794ec338b..2160512e861a 100644 --- a/drivers/input/touchscreen/atmel_mxt_ts.c +++ b/drivers/input/touchscreen/atmel_mxt_ts.c @@ -113,8 +113,8 @@ struct t7_config { #define MXT_T9_DETECT (1 << 7) struct t9_range { - u16 x; - u16 y; + __le16 x; + __le16 y; } __packed; /* MXT_TOUCH_MULTI_T9 orient */ @@ -216,6 +216,7 @@ struct mxt_data { unsigned int irq; unsigned int max_x; unsigned int max_y; 
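+ /* T9/T100 orientation switch bit; when set, max_x and max_y are swapped at input-device init */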
+ bool xy_switch; bool in_bootloader; u16 mem_size; u8 t100_aux_ampl; @@ -1665,8 +1666,8 @@ static int mxt_read_t9_resolution(struct mxt_data *data) if (error) return error; - le16_to_cpus(&range.x); - le16_to_cpus(&range.y); + data->max_x = get_unaligned_le16(&range.x); + data->max_y = get_unaligned_le16(&range.y); error = __mxt_read_reg(client, object->start_address + MXT_T9_ORIENT, @@ -1674,23 +1675,7 @@ static int mxt_read_t9_resolution(struct mxt_data *data) if (error) return error; - /* Handle default values */ - if (range.x == 0) - range.x = 1023; - - if (range.y == 0) - range.y = 1023; - - if (orient & MXT_T9_ORIENT_SWITCH) { - data->max_x = range.y; - data->max_y = range.x; - } else { - data->max_x = range.x; - data->max_y = range.y; - } - - dev_dbg(&client->dev, - "Touchscreen size X%uY%u\n", data->max_x, data->max_y); + data->xy_switch = orient & MXT_T9_ORIENT_SWITCH; return 0; } @@ -1708,13 +1693,14 @@ static int mxt_read_t100_config(struct mxt_data *data) if (!object) return -EINVAL; + /* read touchscreen dimensions */ error = __mxt_read_reg(client, object->start_address + MXT_T100_XRANGE, sizeof(range_x), &range_x); if (error) return error; - le16_to_cpus(&range_x); + data->max_x = get_unaligned_le16(&range_x); error = __mxt_read_reg(client, object->start_address + MXT_T100_YRANGE, @@ -1722,36 +1708,24 @@ static int mxt_read_t100_config(struct mxt_data *data) if (error) return error; - le16_to_cpus(&range_y); + data->max_y = get_unaligned_le16(&range_y); + /* read orientation config */ error = __mxt_read_reg(client, object->start_address + MXT_T100_CFG1, 1, &cfg); if (error) return error; + data->xy_switch = cfg & MXT_T100_CFG_SWITCHXY; + + /* allocate aux bytes */ error = __mxt_read_reg(client, object->start_address + MXT_T100_TCHAUX, 1, &tchaux); if (error) return error; - /* Handle default values */ - if (range_x == 0) - range_x = 1023; - - if (range_y == 0) - range_y = 1023; - - if (cfg & MXT_T100_CFG_SWITCHXY) { - data->max_x = range_y; - data->max_y = range_x; - } else { - data->max_x = range_x; - data->max_y = range_y; - } - - /* allocate aux bytes */ aux = 6; if (tchaux & MXT_T100_TCHAUX_VECT) @@ -1767,9 +1741,6 @@ static int mxt_read_t100_config(struct mxt_data *data) "T100 aux mappings vect:%u ampl:%u area:%u\n", data->t100_aux_vect, data->t100_aux_ampl, data->t100_aux_area); - dev_info(&client->dev, - "T100 Touchscreen size X%uY%u\n", data->max_x, data->max_y); - return 0; } @@ -1828,6 +1799,19 @@ static int mxt_initialize_input_device(struct mxt_data *data) return -EINVAL; } + /* Handle default values and orientation switch */ + if (data->max_x == 0) + data->max_x = 1023; + + if (data->max_y == 0) + data->max_y = 1023; + + if (data->xy_switch) + swap(data->max_x, data->max_y); + + dev_info(dev, "Touchscreen size X%uY%u\n", data->max_x, data->max_y); + + /* Register input device */ input_dev = input_allocate_device(); if (!input_dev) { dev_err(dev, "Failed to allocate memory\n"); diff --git a/drivers/lightnvm/Makefile b/drivers/lightnvm/Makefile index 7e0f42acb737..a7a0a22cf1a5 100644 --- a/drivers/lightnvm/Makefile +++ b/drivers/lightnvm/Makefile @@ -2,6 +2,6 @@ # Makefile for Open-Channel SSDs. 
# -obj-$(CONFIG_NVM) := core.o +obj-$(CONFIG_NVM) := core.o sysblk.o obj-$(CONFIG_NVM_GENNVM) += gennvm.o obj-$(CONFIG_NVM_RRPC) += rrpc.o diff --git a/drivers/lightnvm/core.c b/drivers/lightnvm/core.c index 8f41b245cd55..33224cb91c5b 100644 --- a/drivers/lightnvm/core.c +++ b/drivers/lightnvm/core.c @@ -27,6 +27,7 @@ #include <linux/module.h> #include <linux/miscdevice.h> #include <linux/lightnvm.h> +#include <linux/sched/sysctl.h> #include <uapi/linux/lightnvm.h> static LIST_HEAD(nvm_targets); @@ -105,6 +106,9 @@ struct nvmm_type *nvm_init_mgr(struct nvm_dev *dev) lockdep_assert_held(&nvm_lock); list_for_each_entry(mt, &nvm_mgrs, list) { + if (strncmp(dev->sb.mmtype, mt->name, NVM_MMTYPE_LEN)) + continue; + ret = mt->register_mgr(dev); if (ret < 0) { pr_err("nvm: media mgr failed to init (%d) on dev %s\n", @@ -166,6 +170,20 @@ static struct nvm_dev *nvm_find_nvm_dev(const char *name) return NULL; } +struct nvm_block *nvm_get_blk_unlocked(struct nvm_dev *dev, struct nvm_lun *lun, + unsigned long flags) +{ + return dev->mt->get_blk_unlocked(dev, lun, flags); +} +EXPORT_SYMBOL(nvm_get_blk_unlocked); + +/* Assumes that all valid pages have already been moved on release to bm */ +void nvm_put_blk_unlocked(struct nvm_dev *dev, struct nvm_block *blk) +{ + return dev->mt->put_blk_unlocked(dev, blk); +} +EXPORT_SYMBOL(nvm_put_blk_unlocked); + struct nvm_block *nvm_get_blk(struct nvm_dev *dev, struct nvm_lun *lun, unsigned long flags) { @@ -192,6 +210,206 @@ int nvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk) } EXPORT_SYMBOL(nvm_erase_blk); +void nvm_addr_to_generic_mode(struct nvm_dev *dev, struct nvm_rq *rqd) +{ + int i; + + if (rqd->nr_pages > 1) { + for (i = 0; i < rqd->nr_pages; i++) + rqd->ppa_list[i] = dev_to_generic_addr(dev, + rqd->ppa_list[i]); + } else { + rqd->ppa_addr = dev_to_generic_addr(dev, rqd->ppa_addr); + } +} +EXPORT_SYMBOL(nvm_addr_to_generic_mode); + +void nvm_generic_to_addr_mode(struct nvm_dev *dev, struct nvm_rq *rqd) +{ + int i; + + if (rqd->nr_pages > 1) { + for (i = 0; i < rqd->nr_pages; i++) + rqd->ppa_list[i] = generic_to_dev_addr(dev, + rqd->ppa_list[i]); + } else { + rqd->ppa_addr = generic_to_dev_addr(dev, rqd->ppa_addr); + } +} +EXPORT_SYMBOL(nvm_generic_to_addr_mode); + +int nvm_set_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd, + struct ppa_addr *ppas, int nr_ppas) +{ + int i, plane_cnt, pl_idx; + + if (dev->plane_mode == NVM_PLANE_SINGLE && nr_ppas == 1) { + rqd->nr_pages = 1; + rqd->ppa_addr = ppas[0]; + + return 0; + } + + plane_cnt = (1 << dev->plane_mode); + rqd->nr_pages = plane_cnt * nr_ppas; + + if (dev->ops->max_phys_sect < rqd->nr_pages) + return -EINVAL; + + rqd->ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, &rqd->dma_ppa_list); + if (!rqd->ppa_list) { + pr_err("nvm: failed to allocate dma memory\n"); + return -ENOMEM; + } + + for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { + for (i = 0; i < nr_ppas; i++) { + ppas[i].g.pl = pl_idx; + rqd->ppa_list[(pl_idx * nr_ppas) + i] = ppas[i]; + } + } + + return 0; +} +EXPORT_SYMBOL(nvm_set_rqd_ppalist); + +void nvm_free_rqd_ppalist(struct nvm_dev *dev, struct nvm_rq *rqd) +{ + if (!rqd->ppa_list) + return; + + nvm_dev_dma_free(dev, rqd->ppa_list, rqd->dma_ppa_list); +} +EXPORT_SYMBOL(nvm_free_rqd_ppalist); + +int nvm_erase_ppa(struct nvm_dev *dev, struct ppa_addr *ppas, int nr_ppas) +{ + struct nvm_rq rqd; + int ret; + + if (!dev->ops->erase_block) + return 0; + + memset(&rqd, 0, sizeof(struct nvm_rq)); + + ret = nvm_set_rqd_ppalist(dev, &rqd, ppas, nr_ppas); + if (ret) + return ret; + + 
nvm_generic_to_addr_mode(dev, &rqd); + + ret = dev->ops->erase_block(dev, &rqd); + + nvm_free_rqd_ppalist(dev, &rqd); + + return ret; +} +EXPORT_SYMBOL(nvm_erase_ppa); + +void nvm_end_io(struct nvm_rq *rqd, int error) +{ + rqd->error = error; + rqd->end_io(rqd); +} +EXPORT_SYMBOL(nvm_end_io); + +static void nvm_end_io_sync(struct nvm_rq *rqd) +{ + struct completion *waiting = rqd->wait; + + rqd->wait = NULL; + + complete(waiting); +} + +int nvm_submit_ppa(struct nvm_dev *dev, struct ppa_addr *ppa, int nr_ppas, + int opcode, int flags, void *buf, int len) +{ + DECLARE_COMPLETION_ONSTACK(wait); + struct nvm_rq rqd; + struct bio *bio; + int ret; + unsigned long hang_check; + + bio = bio_map_kern(dev->q, buf, len, GFP_KERNEL); + if (IS_ERR_OR_NULL(bio)) + return -ENOMEM; + + memset(&rqd, 0, sizeof(struct nvm_rq)); + ret = nvm_set_rqd_ppalist(dev, &rqd, ppa, nr_ppas); + if (ret) { + bio_put(bio); + return ret; + } + + rqd.opcode = opcode; + rqd.bio = bio; + rqd.wait = &wait; + rqd.dev = dev; + rqd.end_io = nvm_end_io_sync; + rqd.flags = flags; + nvm_generic_to_addr_mode(dev, &rqd); + + ret = dev->ops->submit_io(dev, &rqd); + + /* Prevent hang_check timer from firing at us during very long I/O */ + hang_check = sysctl_hung_task_timeout_secs; + if (hang_check) + while (!wait_for_completion_io_timeout(&wait, hang_check * (HZ/2))); + else + wait_for_completion_io(&wait); + + nvm_free_rqd_ppalist(dev, &rqd); + + return rqd.error; +} +EXPORT_SYMBOL(nvm_submit_ppa); + +static int nvm_init_slc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp) +{ + int i; + + dev->lps_per_blk = dev->pgs_per_blk; + dev->lptbl = kcalloc(dev->lps_per_blk, sizeof(int), GFP_KERNEL); + if (!dev->lptbl) + return -ENOMEM; + + /* Just a linear array */ + for (i = 0; i < dev->lps_per_blk; i++) + dev->lptbl[i] = i; + + return 0; +} + +static int nvm_init_mlc_tbl(struct nvm_dev *dev, struct nvm_id_group *grp) +{ + int i, p; + struct nvm_id_lp_mlc *mlc = &grp->lptbl.mlc; + + if (!mlc->num_pairs) + return 0; + + dev->lps_per_blk = mlc->num_pairs; + dev->lptbl = kcalloc(dev->lps_per_blk, sizeof(int), GFP_KERNEL); + if (!dev->lptbl) + return -ENOMEM; + + /* The lower page table encoding consists of a list of bytes, where each + * has a lower and an upper half. 
The first half byte maintains the + * increment value and every value after is an offset added to the + * previous incrementation value */ + dev->lptbl[0] = mlc->pairs[0] & 0xF; + for (i = 1; i < dev->lps_per_blk; i++) { + p = mlc->pairs[i >> 1]; + if (i & 0x1) /* upper */ + dev->lptbl[i] = dev->lptbl[i - 1] + ((p & 0xF0) >> 4); + else /* lower */ + dev->lptbl[i] = dev->lptbl[i - 1] + (p & 0xF); + } + + return 0; +} + static int nvm_core_init(struct nvm_dev *dev) { struct nvm_id *id = &dev->identity; @@ -206,6 +424,7 @@ static int nvm_core_init(struct nvm_dev *dev) dev->sec_size = grp->csecs; dev->oob_size = grp->sos; dev->sec_per_pg = grp->fpg_sz / grp->csecs; + dev->mccap = grp->mccap; memcpy(&dev->ppaf, &id->ppaf, sizeof(struct nvm_addr_format)); dev->plane_mode = NVM_PLANE_SINGLE; @@ -216,11 +435,23 @@ static int nvm_core_init(struct nvm_dev *dev) return -EINVAL; } - if (grp->fmtype != 0 && grp->fmtype != 1) { + switch (grp->fmtype) { + case NVM_ID_FMTYPE_SLC: + if (nvm_init_slc_tbl(dev, grp)) + return -ENOMEM; + break; + case NVM_ID_FMTYPE_MLC: + if (nvm_init_mlc_tbl(dev, grp)) + return -ENOMEM; + break; + default: pr_err("nvm: flash type not supported\n"); return -EINVAL; } + if (!dev->lps_per_blk) + pr_info("nvm: lower page programming table missing\n"); + if (grp->mpos & 0x020202) dev->plane_mode = NVM_PLANE_DOUBLE; if (grp->mpos & 0x040404) @@ -238,6 +469,7 @@ static int nvm_core_init(struct nvm_dev *dev) dev->nr_chnls; dev->total_pages = dev->total_blocks * dev->pgs_per_blk; INIT_LIST_HEAD(&dev->online_targets); + mutex_init(&dev->mlock); return 0; } @@ -249,6 +481,8 @@ static void nvm_free(struct nvm_dev *dev) if (dev->mt) dev->mt->unregister_mgr(dev); + + kfree(dev->lptbl); } static int nvm_init(struct nvm_dev *dev) @@ -338,9 +572,16 @@ int nvm_register(struct request_queue *q, char *disk_name, } } + ret = nvm_get_sysblock(dev, &dev->sb); + if (!ret) + pr_err("nvm: device not initialized.\n"); + else if (ret < 0) + pr_err("nvm: err (%d) on device initialization\n", ret); + /* register device with a supported media manager */ down_write(&nvm_lock); - dev->mt = nvm_init_mgr(dev); + if (ret > 0) + dev->mt = nvm_init_mgr(dev); list_add(&dev->devices, &nvm_devices); up_write(&nvm_lock); @@ -788,6 +1029,97 @@ static long nvm_ioctl_dev_remove(struct file *file, void __user *arg) return __nvm_configure_remove(&remove); } +static void nvm_setup_nvm_sb_info(struct nvm_sb_info *info) +{ + info->seqnr = 1; + info->erase_cnt = 0; + info->version = 1; +} + +static long __nvm_ioctl_dev_init(struct nvm_ioctl_dev_init *init) +{ + struct nvm_dev *dev; + struct nvm_sb_info info; + int ret; + + down_write(&nvm_lock); + dev = nvm_find_nvm_dev(init->dev); + up_write(&nvm_lock); + if (!dev) { + pr_err("nvm: device not found\n"); + return -EINVAL; + } + + nvm_setup_nvm_sb_info(&info); + + strncpy(info.mmtype, init->mmtype, NVM_MMTYPE_LEN); + info.fs_ppa.ppa = -1; + + ret = nvm_init_sysblock(dev, &info); + if (ret) + return ret; + + memcpy(&dev->sb, &info, sizeof(struct nvm_sb_info)); + + down_write(&nvm_lock); + dev->mt = nvm_init_mgr(dev); + up_write(&nvm_lock); + + return 0; +} + +static long nvm_ioctl_dev_init(struct file *file, void __user *arg) +{ + struct nvm_ioctl_dev_init init; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&init, arg, sizeof(struct nvm_ioctl_dev_init))) + return -EFAULT; + + if (init.flags != 0) { + pr_err("nvm: no flags supported\n"); + return -EINVAL; + } + + init.dev[DISK_NAME_LEN - 1] = '\0'; + + return __nvm_ioctl_dev_init(&init); +} + +static 
long nvm_ioctl_dev_factory(struct file *file, void __user *arg) +{ + struct nvm_ioctl_dev_factory fact; + struct nvm_dev *dev; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (copy_from_user(&fact, arg, sizeof(struct nvm_ioctl_dev_factory))) + return -EFAULT; + + fact.dev[DISK_NAME_LEN - 1] = '\0'; + + if (fact.flags & ~(NVM_FACTORY_NR_BITS - 1)) + return -EINVAL; + + down_write(&nvm_lock); + dev = nvm_find_nvm_dev(fact.dev); + up_write(&nvm_lock); + if (!dev) { + pr_err("nvm: device not found\n"); + return -EINVAL; + } + + if (dev->mt) { + dev->mt->unregister_mgr(dev); + dev->mt = NULL; + } + + return nvm_dev_factory(dev, fact.flags); +} + static long nvm_ctl_ioctl(struct file *file, uint cmd, unsigned long arg) { void __user *argp = (void __user *)arg; @@ -801,6 +1133,10 @@ static long nvm_ctl_ioctl(struct file *file, uint cmd, unsigned long arg) return nvm_ioctl_dev_create(file, argp); case NVM_DEV_REMOVE: return nvm_ioctl_dev_remove(file, argp); + case NVM_DEV_INIT: + return nvm_ioctl_dev_init(file, argp); + case NVM_DEV_FACTORY: + return nvm_ioctl_dev_factory(file, argp); } return 0; } diff --git a/drivers/lightnvm/gennvm.c b/drivers/lightnvm/gennvm.c index a54b339951a3..7fb725b16148 100644 --- a/drivers/lightnvm/gennvm.c +++ b/drivers/lightnvm/gennvm.c @@ -60,7 +60,8 @@ static int gennvm_luns_init(struct nvm_dev *dev, struct gen_nvm *gn) lun->vlun.lun_id = i % dev->luns_per_chnl; lun->vlun.chnl_id = i / dev->luns_per_chnl; lun->vlun.nr_free_blocks = dev->blks_per_lun; - lun->vlun.nr_inuse_blocks = 0; + lun->vlun.nr_open_blocks = 0; + lun->vlun.nr_closed_blocks = 0; lun->vlun.nr_bad_blocks = 0; } return 0; @@ -89,6 +90,7 @@ static int gennvm_block_bb(struct ppa_addr ppa, int nr_blocks, u8 *blks, list_move_tail(&blk->list, &lun->bb_list); lun->vlun.nr_bad_blocks++; + lun->vlun.nr_free_blocks--; } return 0; @@ -133,15 +135,15 @@ static int gennvm_block_map(u64 slba, u32 nlb, __le64 *entries, void *private) pba = pba - (dev->sec_per_lun * lun_id); blk = &lun->vlun.blocks[div_u64(pba, dev->sec_per_blk)]; - if (!blk->type) { + if (!blk->state) { /* at this point, we don't know anything about the * block. It's up to the FTL on top to re-establish the - * block state + * block state. The block is assumed to be open. 
*/ list_move_tail(&blk->list, &lun->used_list); - blk->type = 1; + blk->state = NVM_BLK_ST_OPEN; lun->vlun.nr_free_blocks--; - lun->vlun.nr_inuse_blocks++; + lun->vlun.nr_open_blocks++; } } @@ -255,14 +257,14 @@ static void gennvm_unregister(struct nvm_dev *dev) module_put(THIS_MODULE); } -static struct nvm_block *gennvm_get_blk(struct nvm_dev *dev, +static struct nvm_block *gennvm_get_blk_unlocked(struct nvm_dev *dev, struct nvm_lun *vlun, unsigned long flags) { struct gen_lun *lun = container_of(vlun, struct gen_lun, vlun); struct nvm_block *blk = NULL; int is_gc = flags & NVM_IOTYPE_GC; - spin_lock(&vlun->lock); + assert_spin_locked(&vlun->lock); if (list_empty(&lun->free_list)) { pr_err_ratelimited("gennvm: lun %u have no free pages available", @@ -275,83 +277,64 @@ static struct nvm_block *gennvm_get_blk(struct nvm_dev *dev, blk = list_first_entry(&lun->free_list, struct nvm_block, list); list_move_tail(&blk->list, &lun->used_list); - blk->type = 1; + blk->state = NVM_BLK_ST_OPEN; lun->vlun.nr_free_blocks--; - lun->vlun.nr_inuse_blocks++; + lun->vlun.nr_open_blocks++; out: + return blk; +} + +static struct nvm_block *gennvm_get_blk(struct nvm_dev *dev, + struct nvm_lun *vlun, unsigned long flags) +{ + struct nvm_block *blk; + + spin_lock(&vlun->lock); + blk = gennvm_get_blk_unlocked(dev, vlun, flags); spin_unlock(&vlun->lock); return blk; } -static void gennvm_put_blk(struct nvm_dev *dev, struct nvm_block *blk) +static void gennvm_put_blk_unlocked(struct nvm_dev *dev, struct nvm_block *blk) { struct nvm_lun *vlun = blk->lun; struct gen_lun *lun = container_of(vlun, struct gen_lun, vlun); - spin_lock(&vlun->lock); + assert_spin_locked(&vlun->lock); - switch (blk->type) { - case 1: + if (blk->state & NVM_BLK_ST_OPEN) { list_move_tail(&blk->list, &lun->free_list); + lun->vlun.nr_open_blocks--; lun->vlun.nr_free_blocks++; - lun->vlun.nr_inuse_blocks--; - blk->type = 0; - break; - case 2: + blk->state = NVM_BLK_ST_FREE; + } else if (blk->state & NVM_BLK_ST_CLOSED) { + list_move_tail(&blk->list, &lun->free_list); + lun->vlun.nr_closed_blocks--; + lun->vlun.nr_free_blocks++; + blk->state = NVM_BLK_ST_FREE; + } else if (blk->state & NVM_BLK_ST_BAD) { list_move_tail(&blk->list, &lun->bb_list); lun->vlun.nr_bad_blocks++; - lun->vlun.nr_inuse_blocks--; - break; - default: + blk->state = NVM_BLK_ST_BAD; + } else { WARN_ON_ONCE(1); pr_err("gennvm: erroneous block type (%lu -> %u)\n", - blk->id, blk->type); + blk->id, blk->state); list_move_tail(&blk->list, &lun->bb_list); lun->vlun.nr_bad_blocks++; - lun->vlun.nr_inuse_blocks--; - } - - spin_unlock(&vlun->lock); -} - -static void gennvm_addr_to_generic_mode(struct nvm_dev *dev, struct nvm_rq *rqd) -{ - int i; - - if (rqd->nr_pages > 1) { - for (i = 0; i < rqd->nr_pages; i++) - rqd->ppa_list[i] = dev_to_generic_addr(dev, - rqd->ppa_list[i]); - } else { - rqd->ppa_addr = dev_to_generic_addr(dev, rqd->ppa_addr); + blk->state = NVM_BLK_ST_BAD; } } -static void gennvm_generic_to_addr_mode(struct nvm_dev *dev, struct nvm_rq *rqd) -{ - int i; - - if (rqd->nr_pages > 1) { - for (i = 0; i < rqd->nr_pages; i++) - rqd->ppa_list[i] = generic_to_dev_addr(dev, - rqd->ppa_list[i]); - } else { - rqd->ppa_addr = generic_to_dev_addr(dev, rqd->ppa_addr); - } -} - -static int gennvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) +static void gennvm_put_blk(struct nvm_dev *dev, struct nvm_block *blk) { - if (!dev->ops->submit_io) - return 0; - - /* Convert address space */ - gennvm_generic_to_addr_mode(dev, rqd); + struct nvm_lun *vlun = blk->lun; - rqd->dev = 
dev; - return dev->ops->submit_io(dev, rqd); + spin_lock(&vlun->lock); + gennvm_put_blk_unlocked(dev, blk); + spin_unlock(&vlun->lock); } static void gennvm_blk_set_type(struct nvm_dev *dev, struct ppa_addr *ppa, @@ -376,7 +359,7 @@ static void gennvm_blk_set_type(struct nvm_dev *dev, struct ppa_addr *ppa, blk = &lun->vlun.blocks[ppa->g.blk]; /* will be moved to bb list on put_blk from target */ - blk->type = type; + blk->state = type; } /* mark block bad. It is expected the target recover from the error. */ @@ -390,77 +373,51 @@ static void gennvm_mark_blk_bad(struct nvm_dev *dev, struct nvm_rq *rqd) if (dev->ops->set_bb_tbl(dev, rqd, 1)) return; - gennvm_addr_to_generic_mode(dev, rqd); + nvm_addr_to_generic_mode(dev, rqd); /* look up blocks and mark them as bad */ if (rqd->nr_pages > 1) for (i = 0; i < rqd->nr_pages; i++) - gennvm_blk_set_type(dev, &rqd->ppa_list[i], 2); + gennvm_blk_set_type(dev, &rqd->ppa_list[i], + NVM_BLK_ST_BAD); else - gennvm_blk_set_type(dev, &rqd->ppa_addr, 2); + gennvm_blk_set_type(dev, &rqd->ppa_addr, NVM_BLK_ST_BAD); } -static int gennvm_end_io(struct nvm_rq *rqd, int error) +static void gennvm_end_io(struct nvm_rq *rqd) { struct nvm_tgt_instance *ins = rqd->ins; - int ret = 0; - switch (error) { + switch (rqd->error) { case NVM_RSP_SUCCESS: - break; case NVM_RSP_ERR_EMPTYPAGE: break; case NVM_RSP_ERR_FAILWRITE: gennvm_mark_blk_bad(rqd->dev, rqd); - default: - ret++; } - ret += ins->tt->end_io(rqd, error); - - return ret; + ins->tt->end_io(rqd); } -static int gennvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk, - unsigned long flags) +static int gennvm_submit_io(struct nvm_dev *dev, struct nvm_rq *rqd) { - int plane_cnt = 0, pl_idx, ret; - struct ppa_addr addr; - struct nvm_rq rqd; - - if (!dev->ops->erase_block) - return 0; - - addr = block_to_ppa(dev, blk); - - if (dev->plane_mode == NVM_PLANE_SINGLE) { - rqd.nr_pages = 1; - rqd.ppa_addr = addr; - } else { - plane_cnt = (1 << dev->plane_mode); - rqd.nr_pages = plane_cnt; - - rqd.ppa_list = nvm_dev_dma_alloc(dev, GFP_KERNEL, - &rqd.dma_ppa_list); - if (!rqd.ppa_list) { - pr_err("gennvm: failed to allocate dma memory\n"); - return -ENOMEM; - } - - for (pl_idx = 0; pl_idx < plane_cnt; pl_idx++) { - addr.g.pl = pl_idx; - rqd.ppa_list[pl_idx] = addr; - } - } + if (!dev->ops->submit_io) + return -ENODEV; - gennvm_generic_to_addr_mode(dev, &rqd); + /* Convert address space */ + nvm_generic_to_addr_mode(dev, rqd); - ret = dev->ops->erase_block(dev, &rqd); + rqd->dev = dev; + rqd->end_io = gennvm_end_io; + return dev->ops->submit_io(dev, rqd); +} - if (plane_cnt) - nvm_dev_dma_free(dev, rqd.ppa_list, rqd.dma_ppa_list); +static int gennvm_erase_blk(struct nvm_dev *dev, struct nvm_block *blk, + unsigned long flags) +{ + struct ppa_addr addr = block_to_ppa(dev, blk); - return ret; + return nvm_erase_ppa(dev, &addr, 1); } static struct nvm_lun *gennvm_get_lun(struct nvm_dev *dev, int lunid) @@ -480,10 +437,11 @@ static void gennvm_lun_info_print(struct nvm_dev *dev) gennvm_for_each_lun(gn, lun, i) { spin_lock(&lun->vlun.lock); - pr_info("%s: lun%8u\t%u\t%u\t%u\n", + pr_info("%s: lun%8u\t%u\t%u\t%u\t%u\n", dev->name, i, lun->vlun.nr_free_blocks, - lun->vlun.nr_inuse_blocks, + lun->vlun.nr_open_blocks, + lun->vlun.nr_closed_blocks, lun->vlun.nr_bad_blocks); spin_unlock(&lun->vlun.lock); @@ -491,21 +449,23 @@ static void gennvm_lun_info_print(struct nvm_dev *dev) } static struct nvmm_type gennvm = { - .name = "gennvm", - .version = {0, 1, 0}, + .name = "gennvm", + .version = {0, 1, 0}, + + .register_mgr = 
gennvm_register, + .unregister_mgr = gennvm_unregister, - .register_mgr = gennvm_register, - .unregister_mgr = gennvm_unregister, + .get_blk_unlocked = gennvm_get_blk_unlocked, + .put_blk_unlocked = gennvm_put_blk_unlocked, - .get_blk = gennvm_get_blk, - .put_blk = gennvm_put_blk, + .get_blk = gennvm_get_blk, + .put_blk = gennvm_put_blk, - .submit_io = gennvm_submit_io, - .end_io = gennvm_end_io, - .erase_blk = gennvm_erase_blk, + .submit_io = gennvm_submit_io, + .erase_blk = gennvm_erase_blk, - .get_lun = gennvm_get_lun, - .lun_info_print = gennvm_lun_info_print, + .get_lun = gennvm_get_lun, + .lun_info_print = gennvm_lun_info_print, }; static int __init gennvm_module_init(void) diff --git a/drivers/lightnvm/rrpc.c b/drivers/lightnvm/rrpc.c index 134e4faba482..d8c75958ced3 100644 --- a/drivers/lightnvm/rrpc.c +++ b/drivers/lightnvm/rrpc.c @@ -179,16 +179,23 @@ static void rrpc_set_lun_cur(struct rrpc_lun *rlun, struct rrpc_block *rblk) static struct rrpc_block *rrpc_get_blk(struct rrpc *rrpc, struct rrpc_lun *rlun, unsigned long flags) { + struct nvm_lun *lun = rlun->parent; struct nvm_block *blk; struct rrpc_block *rblk; - blk = nvm_get_blk(rrpc->dev, rlun->parent, flags); - if (!blk) + spin_lock(&lun->lock); + blk = nvm_get_blk_unlocked(rrpc->dev, rlun->parent, flags); + if (!blk) { + pr_err("nvm: rrpc: cannot get new block from media manager\n"); + spin_unlock(&lun->lock); return NULL; + } rblk = &rlun->blocks[blk->id]; - blk->priv = rblk; + list_add_tail(&rblk->list, &rlun->open_list); + spin_unlock(&lun->lock); + blk->priv = rblk; bitmap_zero(rblk->invalid_pages, rrpc->dev->pgs_per_blk); rblk->next_page = 0; rblk->nr_invalid_pages = 0; @@ -199,7 +206,13 @@ static struct rrpc_block *rrpc_get_blk(struct rrpc *rrpc, struct rrpc_lun *rlun, static void rrpc_put_blk(struct rrpc *rrpc, struct rrpc_block *rblk) { - nvm_put_blk(rrpc->dev, rblk->parent); + struct rrpc_lun *rlun = rblk->rlun; + struct nvm_lun *lun = rlun->parent; + + spin_lock(&lun->lock); + nvm_put_blk_unlocked(rrpc->dev, rblk->parent); + list_del(&rblk->list); + spin_unlock(&lun->lock); } static void rrpc_put_blks(struct rrpc *rrpc) @@ -287,6 +300,8 @@ static int rrpc_move_valid_pages(struct rrpc *rrpc, struct rrpc_block *rblk) } page = mempool_alloc(rrpc->page_pool, GFP_NOIO); + if (!page) + return -ENOMEM; while ((slot = find_first_zero_bit(rblk->invalid_pages, nr_pgs_per_blk)) < nr_pgs_per_blk) { @@ -328,6 +343,10 @@ try: goto finished; } wait_for_completion_io(&wait); + if (bio->bi_error) { + rrpc_inflight_laddr_release(rrpc, rqd); + goto finished; + } bio_reset(bio); reinit_completion(&wait); @@ -350,6 +369,8 @@ try: wait_for_completion_io(&wait); rrpc_inflight_laddr_release(rrpc, rqd); + if (bio->bi_error) + goto finished; bio_reset(bio); } @@ -373,16 +394,26 @@ static void rrpc_block_gc(struct work_struct *work) struct rrpc *rrpc = gcb->rrpc; struct rrpc_block *rblk = gcb->rblk; struct nvm_dev *dev = rrpc->dev; + struct nvm_lun *lun = rblk->parent->lun; + struct rrpc_lun *rlun = &rrpc->luns[lun->id - rrpc->lun_offset]; + mempool_free(gcb, rrpc->gcb_pool); pr_debug("nvm: block '%lu' being reclaimed\n", rblk->parent->id); if (rrpc_move_valid_pages(rrpc, rblk)) - goto done; + goto put_back; + + if (nvm_erase_blk(dev, rblk->parent)) + goto put_back; - nvm_erase_blk(dev, rblk->parent); rrpc_put_blk(rrpc, rblk); -done: - mempool_free(gcb, rrpc->gcb_pool); + + return; + +put_back: + spin_lock(&rlun->lock); + list_add_tail(&rblk->prio, &rlun->prio_list); + spin_unlock(&rlun->lock); } /* the block with highest number of invalid 
pages, will be in the beginning @@ -427,7 +458,7 @@ static void rrpc_lun_gc(struct work_struct *work) if (nr_blocks_need < rrpc->nr_luns) nr_blocks_need = rrpc->nr_luns; - spin_lock(&lun->lock); + spin_lock(&rlun->lock); while (nr_blocks_need > lun->nr_free_blocks && !list_empty(&rlun->prio_list)) { struct rrpc_block *rblock = block_prio_find_max(rlun); @@ -436,16 +467,16 @@ static void rrpc_lun_gc(struct work_struct *work) if (!rblock->nr_invalid_pages) break; + gcb = mempool_alloc(rrpc->gcb_pool, GFP_ATOMIC); + if (!gcb) + break; + list_del_init(&rblock->prio); BUG_ON(!block_is_full(rrpc, rblock)); pr_debug("rrpc: selected block '%lu' for GC\n", block->id); - gcb = mempool_alloc(rrpc->gcb_pool, GFP_ATOMIC); - if (!gcb) - break; - gcb->rrpc = rrpc; gcb->rblk = rblock; INIT_WORK(&gcb->ws_gc, rrpc_block_gc); @@ -454,7 +485,7 @@ static void rrpc_lun_gc(struct work_struct *work) nr_blocks_need--; } - spin_unlock(&lun->lock); + spin_unlock(&rlun->lock); /* TODO: Hint that request queue can be started again */ } @@ -635,12 +666,24 @@ static void rrpc_end_io_write(struct rrpc *rrpc, struct rrpc_rq *rrqd, lun = rblk->parent->lun; cmnt_size = atomic_inc_return(&rblk->data_cmnt_size); - if (unlikely(cmnt_size == rrpc->dev->pgs_per_blk)) + if (unlikely(cmnt_size == rrpc->dev->pgs_per_blk)) { + struct nvm_block *blk = rblk->parent; + struct rrpc_lun *rlun = rblk->rlun; + + spin_lock(&lun->lock); + lun->nr_open_blocks--; + lun->nr_closed_blocks++; + blk->state &= ~NVM_BLK_ST_OPEN; + blk->state |= NVM_BLK_ST_CLOSED; + list_move_tail(&rblk->list, &rlun->closed_list); + spin_unlock(&lun->lock); + rrpc_run_gc(rrpc, rblk); + } } } -static int rrpc_end_io(struct nvm_rq *rqd, int error) +static void rrpc_end_io(struct nvm_rq *rqd) { struct rrpc *rrpc = container_of(rqd->ins, struct rrpc, instance); struct rrpc_rq *rrqd = nvm_rq_to_pdu(rqd); @@ -650,11 +693,12 @@ static int rrpc_end_io(struct nvm_rq *rqd, int error) if (bio_data_dir(rqd->bio) == WRITE) rrpc_end_io_write(rrpc, rrqd, laddr, npages); + bio_put(rqd->bio); + if (rrqd->flags & NVM_IOTYPE_GC) - return 0; + return; rrpc_unlock_rq(rrpc, rqd); - bio_put(rqd->bio); if (npages > 1) nvm_dev_dma_free(rrpc->dev, rqd->ppa_list, rqd->dma_ppa_list); @@ -662,8 +706,6 @@ static int rrpc_end_io(struct nvm_rq *rqd, int error) nvm_dev_dma_free(rrpc->dev, rqd->metadata, rqd->dma_metadata); mempool_free(rqd, rrpc->rq_pool); - - return 0; } static int rrpc_read_ppalist_rq(struct rrpc *rrpc, struct bio *bio, @@ -841,6 +883,13 @@ static int rrpc_submit_io(struct rrpc *rrpc, struct bio *bio, err = nvm_submit_io(rrpc->dev, rqd); if (err) { pr_err("rrpc: I/O submission failed: %d\n", err); + bio_put(bio); + if (!(flags & NVM_IOTYPE_GC)) { + rrpc_unlock_rq(rrpc, rqd); + if (rqd->nr_pages > 1) + nvm_dev_dma_free(rrpc->dev, + rqd->ppa_list, rqd->dma_ppa_list); + } return NVM_IO_ERR; } @@ -1090,6 +1139,11 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end) struct rrpc_lun *rlun; int i, j; + if (dev->pgs_per_blk > MAX_INVALID_PAGES_STORAGE * BITS_PER_LONG) { + pr_err("rrpc: number of pages per block too high."); + return -EINVAL; + } + spin_lock_init(&rrpc->rev_lock); rrpc->luns = kcalloc(rrpc->nr_luns, sizeof(struct rrpc_lun), @@ -1101,16 +1155,13 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end) for (i = 0; i < rrpc->nr_luns; i++) { struct nvm_lun *lun = dev->mt->get_lun(dev, lun_begin + i); - if (dev->pgs_per_blk > - MAX_INVALID_PAGES_STORAGE * BITS_PER_LONG) { - pr_err("rrpc: number of pages per block too high."); - goto 
err; - } - rlun = &rrpc->luns[i]; rlun->rrpc = rrpc; rlun->parent = lun; INIT_LIST_HEAD(&rlun->prio_list); + INIT_LIST_HEAD(&rlun->open_list); + INIT_LIST_HEAD(&rlun->closed_list); + INIT_WORK(&rlun->ws_gc, rrpc_lun_gc); spin_lock_init(&rlun->lock); @@ -1127,6 +1178,7 @@ static int rrpc_luns_init(struct rrpc *rrpc, int lun_begin, int lun_end) struct nvm_block *blk = &lun->blocks[j]; rblk->parent = blk; + rblk->rlun = rlun; INIT_LIST_HEAD(&rblk->prio); spin_lock_init(&rblk->lock); } diff --git a/drivers/lightnvm/rrpc.h b/drivers/lightnvm/rrpc.h index a9696a06c38c..ef13ac7700c8 100644 --- a/drivers/lightnvm/rrpc.h +++ b/drivers/lightnvm/rrpc.h @@ -54,7 +54,9 @@ struct rrpc_rq { struct rrpc_block { struct nvm_block *parent; + struct rrpc_lun *rlun; struct list_head prio; + struct list_head list; #define MAX_INVALID_PAGES_STORAGE 8 /* Bitmap for invalid page entries */ @@ -73,7 +75,16 @@ struct rrpc_lun { struct nvm_lun *parent; struct rrpc_block *cur, *gc_cur; struct rrpc_block *blocks; /* Reference to block allocation */ - struct list_head prio_list; /* Blocks that may be GC'ed */ + + struct list_head prio_list; /* Blocks that may be GC'ed */ + struct list_head open_list; /* In-use open blocks. These are blocks + * that can be both written to and read + * from + */ + struct list_head closed_list; /* In-use closed blocks. These are + * blocks that can _only_ be read from + */ + struct work_struct ws_gc; spinlock_t lock; diff --git a/drivers/lightnvm/sysblk.c b/drivers/lightnvm/sysblk.c new file mode 100644 index 000000000000..321de1f154c5 --- /dev/null +++ b/drivers/lightnvm/sysblk.c @@ -0,0 +1,741 @@ +/* + * Copyright (C) 2015 Matias Bjorling. All rights reserved. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License version + * 2 as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; see the file COPYING. If not, write to + * the Free Software Foundation, 675 Mass Ave, Cambridge, MA 02139, + * USA. + * + */ + +#include <linux/lightnvm.h> + +#define MAX_SYSBLKS 3 /* remember to update mapping scheme on change */ +#define MAX_BLKS_PR_SYSBLK 2 /* 2 blks with 256 pages and 3000 erases + * enables ~1.5M updates per sysblk unit + */ + +struct sysblk_scan { + /* A row is a collection of flash blocks for a system block.
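The ppas[] array below is a flat MAX_SYSBLKS x MAX_BLKS_PR_SYSBLK matrix addressed through scan_ppa_idx(). A standalone sketch of the indexing, printing the layout for the defaults of three rows and two blocks per row:

#include <stdio.h>

#define MAX_SYSBLKS 3
#define MAX_BLKS_PR_SYSBLK 2

static int scan_ppa_idx(int row, int blkid)
{
	return (row * MAX_BLKS_PR_SYSBLK) + blkid;
}

int main(void)
{
	int row, blkid;

	/* prints ppas[0]..ppas[5]: each row is a contiguous run of two slots */
	for (row = 0; row < MAX_SYSBLKS; row++)
		for (blkid = 0; blkid < MAX_BLKS_PR_SYSBLK; blkid++)
			printf("row %d, blk %d -> ppas[%d]\n",
			       row, blkid, scan_ppa_idx(row, blkid));
	return 0;
}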
*/ + int nr_rows; + int row; + int act_blk[MAX_SYSBLKS]; + + int nr_ppas; + struct ppa_addr ppas[MAX_SYSBLKS * MAX_BLKS_PR_SYSBLK];/* all sysblks */ +}; + +static inline int scan_ppa_idx(int row, int blkid) +{ + return (row * MAX_BLKS_PR_SYSBLK) + blkid; +} + +void nvm_sysblk_to_cpu(struct nvm_sb_info *info, struct nvm_system_block *sb) +{ + info->seqnr = be32_to_cpu(sb->seqnr); + info->erase_cnt = be32_to_cpu(sb->erase_cnt); + info->version = be16_to_cpu(sb->version); + strncpy(info->mmtype, sb->mmtype, NVM_MMTYPE_LEN); + info->fs_ppa.ppa = be64_to_cpu(sb->fs_ppa); +} + +void nvm_cpu_to_sysblk(struct nvm_system_block *sb, struct nvm_sb_info *info) +{ + sb->magic = cpu_to_be32(NVM_SYSBLK_MAGIC); + sb->seqnr = cpu_to_be32(info->seqnr); + sb->erase_cnt = cpu_to_be32(info->erase_cnt); + sb->version = cpu_to_be16(info->version); + strncpy(sb->mmtype, info->mmtype, NVM_MMTYPE_LEN); + sb->fs_ppa = cpu_to_be64(info->fs_ppa.ppa); +} + +static int nvm_setup_sysblks(struct nvm_dev *dev, struct ppa_addr *sysblk_ppas) +{ + int nr_rows = min_t(int, MAX_SYSBLKS, dev->nr_chnls); + int i; + + for (i = 0; i < nr_rows; i++) + sysblk_ppas[i].ppa = 0; + + /* if possible, place sysblk at first channel, middle channel and last + * channel of the device. If not, create only one or two sys blocks + */ + switch (dev->nr_chnls) { + case 2: + sysblk_ppas[1].g.ch = 1; + /* fall-through */ + case 1: + sysblk_ppas[0].g.ch = 0; + break; + default: + sysblk_ppas[0].g.ch = 0; + sysblk_ppas[1].g.ch = dev->nr_chnls / 2; + sysblk_ppas[2].g.ch = dev->nr_chnls - 1; + break; + } + + return nr_rows; +} + +void nvm_setup_sysblk_scan(struct nvm_dev *dev, struct sysblk_scan *s, + struct ppa_addr *sysblk_ppas) +{ + memset(s, 0, sizeof(struct sysblk_scan)); + s->nr_rows = nvm_setup_sysblks(dev, sysblk_ppas); +} + +static int sysblk_get_host_blks(struct ppa_addr ppa, int nr_blks, u8 *blks, + void *private) +{ + struct sysblk_scan *s = private; + int i, nr_sysblk = 0; + + for (i = 0; i < nr_blks; i++) { + if (blks[i] != NVM_BLK_T_HOST) + continue; + + if (s->nr_ppas == MAX_BLKS_PR_SYSBLK * MAX_SYSBLKS) { + pr_err("nvm: too many host blks\n"); + return -EINVAL; + } + + ppa.g.blk = i; + + s->ppas[scan_ppa_idx(s->row, nr_sysblk)] = ppa; + s->nr_ppas++; + nr_sysblk++; + } + + return 0; +} + +static int nvm_get_all_sysblks(struct nvm_dev *dev, struct sysblk_scan *s, + struct ppa_addr *ppas, nvm_bb_update_fn *fn) +{ + struct ppa_addr dppa; + int i, ret; + + s->nr_ppas = 0; + + for (i = 0; i < s->nr_rows; i++) { + dppa = generic_to_dev_addr(dev, ppas[i]); + s->row = i; + + ret = dev->ops->get_bb_tbl(dev, dppa, dev->blks_per_lun, fn, s); + if (ret) { + pr_err("nvm: failed bb tbl for ppa (%u %u)\n", + ppas[i].g.ch, + ppas[i].g.blk); + return ret; + } + } + + return ret; +} + +/* + * scans a block for latest sysblk. + * Returns: + * 0 - newer sysblk not found. PPA is updated to latest page. + * 1 - newer sysblk found and stored in *cur. PPA is updated to + * next valid page. + * <0- error. + */ +static int nvm_scan_block(struct nvm_dev *dev, struct ppa_addr *ppa, + struct nvm_system_block *sblk) +{ + struct nvm_system_block *cur; + int pg, cursz, ret, found = 0; + + /* the full buffer for a flash page is allocated. 
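The scan loop that follows implements a "latest entry wins" rule: pages are read in program order, the walk stops at the first empty page or missing magic, and the entry with the highest sequence number survives. A simplified userspace model, where struct entry and MAGIC stand in for struct nvm_system_block and NVM_SYSBLK_MAGIC:

#include <stdint.h>
#include <stdio.h>

#define MAGIC 0x4e564d53u	/* stand-in for NVM_SYSBLK_MAGIC */

struct entry {
	uint32_t magic;
	uint32_t seqnr;
};

/* Walk pages in write order; stop at the first invalid page and keep
 * the entry with the highest seqnr. Returns 1 if anything was found. */
static int scan_latest(const struct entry *pages, int nr_pages,
		       struct entry *latest)
{
	int pg, found = 0;

	for (pg = 0; pg < nr_pages; pg++) {
		if (pages[pg].magic != MAGIC)
			break;	/* ran past the last written page */
		if (found && pages[pg].seqnr < latest->seqnr)
			continue;
		*latest = pages[pg];
		found = 1;
	}
	return found;
}

int main(void)
{
	struct entry blk[4] = { {MAGIC, 1}, {MAGIC, 2}, {MAGIC, 3}, {0, 0} };
	struct entry latest;

	if (scan_latest(blk, 4, &latest))
		printf("latest seqnr = %u\n", latest.seqnr);	/* 3 */
	return 0;
}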
Only the first of it + * contains the system block information + */ + cursz = dev->sec_size * dev->sec_per_pg * dev->nr_planes; + cur = kmalloc(cursz, GFP_KERNEL); + if (!cur) + return -ENOMEM; + + /* perform linear scan through the block */ + for (pg = 0; pg < dev->lps_per_blk; pg++) { + ppa->g.pg = ppa_to_slc(dev, pg); + + ret = nvm_submit_ppa(dev, ppa, 1, NVM_OP_PREAD, NVM_IO_SLC_MODE, + cur, cursz); + if (ret) { + if (ret == NVM_RSP_ERR_EMPTYPAGE) { + pr_debug("nvm: sysblk scan empty ppa (%u %u %u %u)\n", + ppa->g.ch, + ppa->g.lun, + ppa->g.blk, + ppa->g.pg); + break; + } + pr_err("nvm: read failed (%x) for ppa (%u %u %u %u)", + ret, + ppa->g.ch, + ppa->g.lun, + ppa->g.blk, + ppa->g.pg); + break; /* if we can't read a page, continue to the + * next blk + */ + } + + if (be32_to_cpu(cur->magic) != NVM_SYSBLK_MAGIC) { + pr_debug("nvm: scan break for ppa (%u %u %u %u)\n", + ppa->g.ch, + ppa->g.lun, + ppa->g.blk, + ppa->g.pg); + break; /* last valid page already found */ + } + + if (be32_to_cpu(cur->seqnr) < be32_to_cpu(sblk->seqnr)) + continue; + + memcpy(sblk, cur, sizeof(struct nvm_system_block)); + found = 1; + } + + kfree(cur); + + return found; +} + +static int nvm_set_bb_tbl(struct nvm_dev *dev, struct sysblk_scan *s, int type) +{ + struct nvm_rq rqd; + int ret; + + if (s->nr_ppas > dev->ops->max_phys_sect) { + pr_err("nvm: unable to update all sysblocks atomically\n"); + return -EINVAL; + } + + memset(&rqd, 0, sizeof(struct nvm_rq)); + + nvm_set_rqd_ppalist(dev, &rqd, s->ppas, s->nr_ppas); + nvm_generic_to_addr_mode(dev, &rqd); + + ret = dev->ops->set_bb_tbl(dev, &rqd, type); + nvm_free_rqd_ppalist(dev, &rqd); + if (ret) { + pr_err("nvm: sysblk failed bb mark\n"); + return -EINVAL; + } + + return 0; +} + +static int sysblk_get_free_blks(struct ppa_addr ppa, int nr_blks, u8 *blks, + void *private) +{ + struct sysblk_scan *s = private; + struct ppa_addr *sppa; + int i, blkid = 0; + + for (i = 0; i < nr_blks; i++) { + if (blks[i] == NVM_BLK_T_HOST) + return -EEXIST; + + if (blks[i] != NVM_BLK_T_FREE) + continue; + + sppa = &s->ppas[scan_ppa_idx(s->row, blkid)]; + sppa->g.ch = ppa.g.ch; + sppa->g.lun = ppa.g.lun; + sppa->g.blk = i; + s->nr_ppas++; + blkid++; + + pr_debug("nvm: use (%u %u %u) as sysblk\n", + sppa->g.ch, sppa->g.lun, sppa->g.blk); + if (blkid > MAX_BLKS_PR_SYSBLK - 1) + return 0; + } + + pr_err("nvm: sysblk failed get sysblk\n"); + return -EINVAL; +} + +static int nvm_write_and_verify(struct nvm_dev *dev, struct nvm_sb_info *info, + struct sysblk_scan *s) +{ + struct nvm_system_block nvmsb; + void *buf; + int i, sect, ret, bufsz; + struct ppa_addr *ppas; + + nvm_cpu_to_sysblk(&nvmsb, info); + + /* buffer for flash page */ + bufsz = dev->sec_size * dev->sec_per_pg * dev->nr_planes; + buf = kzalloc(bufsz, GFP_KERNEL); + if (!buf) + return -ENOMEM; + memcpy(buf, &nvmsb, sizeof(struct nvm_system_block)); + + ppas = kcalloc(dev->sec_per_pg, sizeof(struct ppa_addr), GFP_KERNEL); + if (!ppas) { + ret = -ENOMEM; + goto err; + } + + /* Write and verify */ + for (i = 0; i < s->nr_rows; i++) { + ppas[0] = s->ppas[scan_ppa_idx(i, s->act_blk[i])]; + + pr_debug("nvm: writing sysblk to ppa (%u %u %u %u)\n", + ppas[0].g.ch, + ppas[0].g.lun, + ppas[0].g.blk, + ppas[0].g.pg); + + /* Expand to all sectors within a flash page */ + if (dev->sec_per_pg > 1) { + for (sect = 1; sect < dev->sec_per_pg; sect++) { + ppas[sect].ppa = ppas[0].ppa; + ppas[sect].g.sec = sect; + } + } + + ret = nvm_submit_ppa(dev, ppas, dev->sec_per_pg, NVM_OP_PWRITE, + NVM_IO_SLC_MODE, buf, bufsz); + if (ret) { + 
pr_err("nvm: sysblk failed program (%u %u %u)\n", + ppas[0].g.ch, + ppas[0].g.lun, + ppas[0].g.blk); + break; + } + + ret = nvm_submit_ppa(dev, ppas, dev->sec_per_pg, NVM_OP_PREAD, + NVM_IO_SLC_MODE, buf, bufsz); + if (ret) { + pr_err("nvm: sysblk failed read (%u %u %u)\n", + ppas[0].g.ch, + ppas[0].g.lun, + ppas[0].g.blk); + break; + } + + if (memcmp(buf, &nvmsb, sizeof(struct nvm_system_block))) { + pr_err("nvm: sysblk failed verify (%u %u %u)\n", + ppas[0].g.ch, + ppas[0].g.lun, + ppas[0].g.blk); + ret = -EINVAL; + break; + } + } + + kfree(ppas); +err: + kfree(buf); + + return ret; +} + +static int nvm_prepare_new_sysblks(struct nvm_dev *dev, struct sysblk_scan *s) +{ + int i, ret; + unsigned long nxt_blk; + struct ppa_addr *ppa; + + for (i = 0; i < s->nr_rows; i++) { + nxt_blk = (s->act_blk[i] + 1) % MAX_BLKS_PR_SYSBLK; + ppa = &s->ppas[scan_ppa_idx(i, nxt_blk)]; + ppa->g.pg = ppa_to_slc(dev, 0); + + ret = nvm_erase_ppa(dev, ppa, 1); + if (ret) + return ret; + + s->act_blk[i] = nxt_blk; + } + + return 0; +} + +int nvm_get_sysblock(struct nvm_dev *dev, struct nvm_sb_info *info) +{ + struct ppa_addr sysblk_ppas[MAX_SYSBLKS]; + struct sysblk_scan s; + struct nvm_system_block *cur; + int i, j, found = 0; + int ret = -ENOMEM; + + /* + * 1. setup sysblk locations + * 2. get bad block list + * 3. filter on host-specific (type 3) + * 4. iterate through all and find the highest seq nr. + * 5. return superblock information + */ + + if (!dev->ops->get_bb_tbl) + return -EINVAL; + + nvm_setup_sysblk_scan(dev, &s, sysblk_ppas); + + mutex_lock(&dev->mlock); + ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, sysblk_get_host_blks); + if (ret) + goto err_sysblk; + + /* no sysblocks initialized */ + if (!s.nr_ppas) + goto err_sysblk; + + cur = kzalloc(sizeof(struct nvm_system_block), GFP_KERNEL); + if (!cur) + goto err_sysblk; + + /* find the latest block across all sysblocks */ + for (i = 0; i < s.nr_rows; i++) { + for (j = 0; j < MAX_BLKS_PR_SYSBLK; j++) { + struct ppa_addr ppa = s.ppas[scan_ppa_idx(i, j)]; + + ret = nvm_scan_block(dev, &ppa, cur); + if (ret > 0) + found = 1; + else if (ret < 0) + break; + } + } + + nvm_sysblk_to_cpu(info, cur); + + kfree(cur); +err_sysblk: + mutex_unlock(&dev->mlock); + + if (found) + return 1; + return ret; +} + +int nvm_update_sysblock(struct nvm_dev *dev, struct nvm_sb_info *new) +{ + /* 1. for each latest superblock + * 2. if room + * a. write new flash page entry with the updated information + * 3. if no room + * a. find next available block on lun (linear search) + * if none, continue to next lun + * if none at all, report error. also report that it wasn't + * possible to write to all superblocks. + * c. write data to block. 
+ */ + struct ppa_addr sysblk_ppas[MAX_SYSBLKS]; + struct sysblk_scan s; + struct nvm_system_block *cur; + int i, j, ppaidx, found = 0; + int ret = -ENOMEM; + + if (!dev->ops->get_bb_tbl) + return -EINVAL; + + nvm_setup_sysblk_scan(dev, &s, sysblk_ppas); + + mutex_lock(&dev->mlock); + ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, sysblk_get_host_blks); + if (ret) + goto err_sysblk; + + cur = kzalloc(sizeof(struct nvm_system_block), GFP_KERNEL); + if (!cur) + goto err_sysblk; + + /* Get the latest sysblk for each sysblk row */ + for (i = 0; i < s.nr_rows; i++) { + found = 0; + for (j = 0; j < MAX_BLKS_PR_SYSBLK; j++) { + ppaidx = scan_ppa_idx(i, j); + ret = nvm_scan_block(dev, &s.ppas[ppaidx], cur); + if (ret > 0) { + s.act_blk[i] = j; + found = 1; + } else if (ret < 0) + break; + } + } + + if (!found) { + pr_err("nvm: no valid sysblks found to update\n"); + ret = -EINVAL; + goto err_cur; + } + + /* + * All sysblocks found. Check that they have the same page id in their + * flash blocks + */ + for (i = 1; i < s.nr_rows; i++) { + struct ppa_addr l = s.ppas[scan_ppa_idx(0, s.act_blk[0])]; + struct ppa_addr r = s.ppas[scan_ppa_idx(i, s.act_blk[i])]; + + if (l.g.pg != r.g.pg) { + pr_err("nvm: sysblks not on same page. Previous update failed.\n"); + ret = -EINVAL; + goto err_cur; + } + } + + /* + * Check that there hasn't been another update to the seqnr since we + * began + */ + if ((new->seqnr - 1) != be32_to_cpu(cur->seqnr)) { + pr_err("nvm: seq is not sequential\n"); + ret = -EINVAL; + goto err_cur; + } + + /* + * When all pages in a block have been written, a new block is selected + * and writing is performed on the new block. + */ + if (s.ppas[scan_ppa_idx(0, s.act_blk[0])].g.pg == + dev->lps_per_blk - 1) { + ret = nvm_prepare_new_sysblks(dev, &s); + if (ret) + goto err_cur; + } + + ret = nvm_write_and_verify(dev, new, &s); +err_cur: + kfree(cur); +err_sysblk: + mutex_unlock(&dev->mlock); + + return ret; +} + +int nvm_init_sysblock(struct nvm_dev *dev, struct nvm_sb_info *info) +{ + struct ppa_addr sysblk_ppas[MAX_SYSBLKS]; + struct sysblk_scan s; + int ret; + + /* + * 1. select master blocks and select first available blks + * 2. get bad block list + * 3. mark MAX_SYSBLKS block as host-based device allocated. + * 4.
write and verify data to block + */ + + if (!dev->ops->get_bb_tbl || !dev->ops->set_bb_tbl) + return -EINVAL; + + if (!(dev->mccap & NVM_ID_CAP_SLC) || !dev->lps_per_blk) { + pr_err("nvm: memory does not support SLC access\n"); + return -EINVAL; + } + + /* Index all sysblocks and mark them as host-driven */ + nvm_setup_sysblk_scan(dev, &s, sysblk_ppas); + + mutex_lock(&dev->mlock); + ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, sysblk_get_free_blks); + if (ret) + goto err_mark; + + ret = nvm_set_bb_tbl(dev, &s, NVM_BLK_T_HOST); + if (ret) + goto err_mark; + + /* Write to the first block of each row */ + ret = nvm_write_and_verify(dev, info, &s); +err_mark: + mutex_unlock(&dev->mlock); + return ret; +} + +struct factory_blks { + struct nvm_dev *dev; + int flags; + unsigned long *blks; +}; + +static int factory_nblks(int nblks) +{ + /* Round up to nearest BITS_PER_LONG */ + return (nblks + (BITS_PER_LONG - 1)) & ~(BITS_PER_LONG - 1); +} + +static unsigned int factory_blk_offset(struct nvm_dev *dev, int ch, int lun) +{ + int nblks = factory_nblks(dev->blks_per_lun); + + return ((ch * dev->luns_per_chnl * nblks) + (lun * nblks)) / + BITS_PER_LONG; +} + +static int nvm_factory_blks(struct ppa_addr ppa, int nr_blks, u8 *blks, + void *private) +{ + struct factory_blks *f = private; + struct nvm_dev *dev = f->dev; + int i, lunoff; + + lunoff = factory_blk_offset(dev, ppa.g.ch, ppa.g.lun); + + /* bits that are not set correspond to blocks that must be erased */ + for (i = 0; i < nr_blks; i++) { + switch (blks[i]) { + case NVM_BLK_T_FREE: + if (f->flags & NVM_FACTORY_ERASE_ONLY_USER) + set_bit(i, &f->blks[lunoff]); + break; + case NVM_BLK_T_HOST: + if (!(f->flags & NVM_FACTORY_RESET_HOST_BLKS)) + set_bit(i, &f->blks[lunoff]); + break; + case NVM_BLK_T_GRWN_BAD: + if (!(f->flags & NVM_FACTORY_RESET_GRWN_BBLKS)) + set_bit(i, &f->blks[lunoff]); + break; + default: + set_bit(i, &f->blks[lunoff]); + break; + } + } + + return 0; +} + +static int nvm_fact_get_blks(struct nvm_dev *dev, struct ppa_addr *erase_list, + int max_ppas, struct factory_blks *f) +{ + struct ppa_addr ppa; + int ch, lun, blkid, idx, done = 0, ppa_cnt = 0; + unsigned long *offset; + + while (!done) { + done = 1; + for (ch = 0; ch < dev->nr_chnls; ch++) { + for (lun = 0; lun < dev->luns_per_chnl; lun++) { + idx = factory_blk_offset(dev, ch, lun); + offset = &f->blks[idx]; + + blkid = find_first_zero_bit(offset, + dev->blks_per_lun); + if (blkid >= dev->blks_per_lun) + continue; + set_bit(blkid, offset); + + ppa.ppa = 0; + ppa.g.ch = ch; + ppa.g.lun = lun; + ppa.g.blk = blkid; + pr_debug("nvm: erase ppa (%u %u %u)\n", + ppa.g.ch, + ppa.g.lun, + ppa.g.blk); + + erase_list[ppa_cnt] = ppa; + ppa_cnt++; + done = 0; + + if (ppa_cnt == max_ppas) + return ppa_cnt; + } + } + } + + return ppa_cnt; +} + +static int nvm_fact_get_bb_tbl(struct nvm_dev *dev, struct ppa_addr ppa, + nvm_bb_update_fn *fn, void *priv) +{ + struct ppa_addr dev_ppa; + int ret; + + dev_ppa = generic_to_dev_addr(dev, ppa); + + ret = dev->ops->get_bb_tbl(dev, dev_ppa, dev->blks_per_lun, fn, priv); + if (ret) + pr_err("nvm: failed bb tbl for ch%u lun%u\n", + ppa.g.ch, ppa.g.lun); + return ret; +} + +static int nvm_fact_select_blks(struct nvm_dev *dev, struct factory_blks *f) +{ + int ch, lun, ret; + struct ppa_addr ppa; + + ppa.ppa = 0; + for (ch = 0; ch < dev->nr_chnls; ch++) { + for (lun = 0; lun < dev->luns_per_chnl; lun++) { + ppa.g.ch = ch; + ppa.g.lun = lun; + + ret = nvm_fact_get_bb_tbl(dev, ppa, nvm_factory_blks, + f); + if (ret) + return ret; + } + } + + return 0; +} + +int
nvm_dev_factory(struct nvm_dev *dev, int flags) +{ + struct factory_blks f; + struct ppa_addr *ppas; + int ppa_cnt, ret = -ENOMEM; + int max_ppas = dev->ops->max_phys_sect / dev->nr_planes; + struct ppa_addr sysblk_ppas[MAX_SYSBLKS]; + struct sysblk_scan s; + + f.blks = kzalloc(factory_nblks(dev->blks_per_lun) * dev->nr_luns, + GFP_KERNEL); + if (!f.blks) + return ret; + + ppas = kcalloc(max_ppas, sizeof(struct ppa_addr), GFP_KERNEL); + if (!ppas) + goto err_blks; + + f.dev = dev; + f.flags = flags; + + /* create list of blks to be erased */ + ret = nvm_fact_select_blks(dev, &f); + if (ret) + goto err_ppas; + + /* continue to erase until the list of blks is empty */ + while ((ppa_cnt = nvm_fact_get_blks(dev, ppas, max_ppas, &f)) > 0) + nvm_erase_ppa(dev, ppas, ppa_cnt); + + /* mark host reserved blocks free */ + if (flags & NVM_FACTORY_RESET_HOST_BLKS) { + nvm_setup_sysblk_scan(dev, &s, sysblk_ppas); + mutex_lock(&dev->mlock); + ret = nvm_get_all_sysblks(dev, &s, sysblk_ppas, + sysblk_get_host_blks); + if (!ret) + ret = nvm_set_bb_tbl(dev, &s, NVM_BLK_T_FREE); + mutex_unlock(&dev->mlock); + } +err_ppas: + kfree(ppas); +err_blks: + kfree(f.blks); + return ret; +} +EXPORT_SYMBOL(nvm_dev_factory); diff --git a/drivers/md/bcache/btree.c b/drivers/md/bcache/btree.c index 83392f856dfd..22b9e34ceb75 100644 --- a/drivers/md/bcache/btree.c +++ b/drivers/md/bcache/btree.c @@ -1741,6 +1741,7 @@ static void bch_btree_gc(struct cache_set *c) do { ret = btree_root(gc_root, c, &op, &writes, &stats); closure_sync(&writes); + cond_resched(); if (ret && ret != -EAGAIN) pr_warn("gc failed!"); @@ -2162,8 +2163,10 @@ int bch_btree_insert_check_key(struct btree *b, struct btree_op *op, rw_lock(true, b, b->level); if (b->key.ptr[0] != btree_ptr || - b->seq != seq + 1) + b->seq != seq + 1) { + op->lock = b->level; goto out; + } } SET_KEY_PTRS(check_key, 1); diff --git a/drivers/md/bcache/super.c b/drivers/md/bcache/super.c index 679a093a3bf6..8d0ead98eb6e 100644 --- a/drivers/md/bcache/super.c +++ b/drivers/md/bcache/super.c @@ -685,6 +685,8 @@ static void bcache_device_link(struct bcache_device *d, struct cache_set *c, WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") || sysfs_create_link(&c->kobj, &d->kobj, d->name), "Couldn't create device <-> cache set symlinks"); + + clear_bit(BCACHE_DEV_UNLINK_DONE, &d->flags); } static void bcache_device_detach(struct bcache_device *d) @@ -847,8 +849,11 @@ void bch_cached_dev_run(struct cached_dev *dc) buf[SB_LABEL_SIZE] = '\0'; env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf); - if (atomic_xchg(&dc->running, 1)) + if (atomic_xchg(&dc->running, 1)) { + kfree(env[1]); + kfree(env[2]); return; + } if (!d->c && BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) { @@ -1933,6 +1938,8 @@ static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr, else err = "device busy"; mutex_unlock(&bch_register_lock); + if (attr == &ksysfs_register_quiet) + goto out; } goto err; } @@ -1971,8 +1978,7 @@ out: err_close: blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL); err: - if (attr != &ksysfs_register_quiet) - pr_info("error opening %s: %s", path, err); + pr_info("error opening %s: %s", path, err); ret = -EINVAL; goto out; } @@ -2066,8 +2072,10 @@ static int __init bcache_init(void) closure_debug_init(); bcache_major = register_blkdev(0, "bcache"); - if (bcache_major < 0) + if (bcache_major < 0) { + unregister_reboot_notifier(&reboot); return bcache_major; + } if (!(bcache_wq = create_workqueue("bcache")) || !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
diff --git a/drivers/md/bcache/writeback.c b/drivers/md/bcache/writeback.c index b23f88d9f18c..b9346cd9cda1 100644 --- a/drivers/md/bcache/writeback.c +++ b/drivers/md/bcache/writeback.c @@ -323,6 +323,10 @@ void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode, static bool dirty_pred(struct keybuf *buf, struct bkey *k) { + struct cached_dev *dc = container_of(buf, struct cached_dev, writeback_keys); + + BUG_ON(KEY_INODE(k) != dc->disk.id); + return KEY_DIRTY(k); } @@ -372,11 +376,24 @@ next: } } +/* + * Returns true if we scanned the entire disk + */ static bool refill_dirty(struct cached_dev *dc) { struct keybuf *buf = &dc->writeback_keys; + struct bkey start = KEY(dc->disk.id, 0, 0); struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0); - bool searched_from_start = false; + struct bkey start_pos; + + /* + * make sure keybuf pos is inside the range for this disk - at bringup + * we might not be attached yet so this disk's inode nr isn't + * initialized then + */ + if (bkey_cmp(&buf->last_scanned, &start) < 0 || + bkey_cmp(&buf->last_scanned, &end) > 0) + buf->last_scanned = start; if (dc->partial_stripes_expensive) { refill_full_stripes(dc); @@ -384,14 +401,20 @@ static bool refill_dirty(struct cached_dev *dc) return false; } - if (bkey_cmp(&buf->last_scanned, &end) >= 0) { - buf->last_scanned = KEY(dc->disk.id, 0, 0); - searched_from_start = true; - } - + start_pos = buf->last_scanned; bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred); - return bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start; + if (bkey_cmp(&buf->last_scanned, &end) < 0) + return false; + + /* + * If we get to the end start scanning again from the beginning, and + * only scan up to where we initially started scanning from: + */ + buf->last_scanned = start; + bch_refill_keybuf(dc->disk.c, buf, &start_pos, dirty_pred); + + return bkey_cmp(&buf->last_scanned, &start_pos) >= 0; } static int bch_writeback_thread(void *arg) diff --git a/drivers/md/bcache/writeback.h b/drivers/md/bcache/writeback.h index 0a9dab187b79..073a042aed24 100644 --- a/drivers/md/bcache/writeback.h +++ b/drivers/md/bcache/writeback.h @@ -63,7 +63,8 @@ static inline bool should_writeback(struct cached_dev *dc, struct bio *bio, static inline void bch_writeback_queue(struct cached_dev *dc) { - wake_up_process(dc->writeback_thread); + if (!IS_ERR_OR_NULL(dc->writeback_thread)) + wake_up_process(dc->writeback_thread); } static inline void bch_writeback_add(struct cached_dev *dc) diff --git a/drivers/mmc/core/debugfs.c b/drivers/mmc/core/debugfs.c index 154aced0b91b..65cc0ac9b82d 100644 --- a/drivers/mmc/core/debugfs.c +++ b/drivers/mmc/core/debugfs.c @@ -170,7 +170,7 @@ static int mmc_ios_show(struct seq_file *s, void *data) str = "invalid"; break; } - seq_printf(s, "signal voltage:\t%u (%s)\n", ios->chip_select, str); + seq_printf(s, "signal voltage:\t%u (%s)\n", ios->signal_voltage, str); switch (ios->drv_type) { case MMC_SET_DRIVER_TYPE_A: diff --git a/drivers/mmc/core/pwrseq_simple.c b/drivers/mmc/core/pwrseq_simple.c index 2b16263458af..aba786daebca 100644 --- a/drivers/mmc/core/pwrseq_simple.c +++ b/drivers/mmc/core/pwrseq_simple.c @@ -29,15 +29,18 @@ struct mmc_pwrseq_simple { static void mmc_pwrseq_simple_set_gpios_value(struct mmc_pwrseq_simple *pwrseq, int value) { - int i; struct gpio_descs *reset_gpios = pwrseq->reset_gpios; - int values[reset_gpios->ndescs]; - for (i = 0; i < reset_gpios->ndescs; i++) - values[i] = value; + if (!IS_ERR(reset_gpios)) { + int i; + int values[reset_gpios->ndescs]; - 
gpiod_set_array_value_cansleep(reset_gpios->ndescs, reset_gpios->desc, - values); + for (i = 0; i < reset_gpios->ndescs; i++) + values[i] = value; + + gpiod_set_array_value_cansleep( + reset_gpios->ndescs, reset_gpios->desc, values); + } } static void mmc_pwrseq_simple_pre_power_on(struct mmc_host *host) @@ -79,7 +82,8 @@ static void mmc_pwrseq_simple_free(struct mmc_host *host) struct mmc_pwrseq_simple *pwrseq = container_of(host->pwrseq, struct mmc_pwrseq_simple, pwrseq); - gpiod_put_array(pwrseq->reset_gpios); + if (!IS_ERR(pwrseq->reset_gpios)) + gpiod_put_array(pwrseq->reset_gpios); if (!IS_ERR(pwrseq->ext_clk)) clk_put(pwrseq->ext_clk); @@ -112,7 +116,9 @@ struct mmc_pwrseq *mmc_pwrseq_simple_alloc(struct mmc_host *host, } pwrseq->reset_gpios = gpiod_get_array(dev, "reset", GPIOD_OUT_HIGH); - if (IS_ERR(pwrseq->reset_gpios)) { + if (IS_ERR(pwrseq->reset_gpios) && + PTR_ERR(pwrseq->reset_gpios) != -ENOENT && + PTR_ERR(pwrseq->reset_gpios) != -ENOSYS) { ret = PTR_ERR(pwrseq->reset_gpios); goto clk_put; } diff --git a/drivers/mmc/core/sd.c b/drivers/mmc/core/sd.c index f2b164b214ae..bb39a29b2db6 100644 --- a/drivers/mmc/core/sd.c +++ b/drivers/mmc/core/sd.c @@ -329,6 +329,7 @@ static int mmc_read_switch(struct mmc_card *card) card->sw_caps.sd3_bus_mode = status[13]; /* Driver Strengths supported by the card */ card->sw_caps.sd3_drv_type = status[9]; + card->sw_caps.sd3_curr_limit = status[7] | status[6] << 8; } out: @@ -545,14 +546,25 @@ static int sd_set_current_limit(struct mmc_card *card, u8 *status) * when we set current limit to 200ma, the card will draw 200ma, and * when we set current limit to 400/600/800ma, the card will draw its * maximum 300ma from the host. + * + * The above is incorrect: if we try to set a current limit that is + * not supported by the card, the card can rightfully error out the + * attempt, and remain at the default current limit. This results + * in a 300mA card being limited to 200mA even though the host + * supports 800mA. Failures seen with SanDisk 8GB UHS cards with + * an iMX6 host. --rmk */ - if (max_current >= 800) + if (max_current >= 800 && + card->sw_caps.sd3_curr_limit & SD_MAX_CURRENT_800) current_limit = SD_SET_CURRENT_LIMIT_800; - else if (max_current >= 600) + else if (max_current >= 600 && + card->sw_caps.sd3_curr_limit & SD_MAX_CURRENT_600) current_limit = SD_SET_CURRENT_LIMIT_600; - else if (max_current >= 400) + else if (max_current >= 400 && + card->sw_caps.sd3_curr_limit & SD_MAX_CURRENT_400) current_limit = SD_SET_CURRENT_LIMIT_400; - else if (max_current >= 200) + else if (max_current >= 200 && + card->sw_caps.sd3_curr_limit & SD_MAX_CURRENT_200) current_limit = SD_SET_CURRENT_LIMIT_200; if (current_limit != SD_SET_CURRENT_NO_CHANGE) { @@ -626,9 +638,9 @@ static int mmc_sd_init_uhs_card(struct mmc_card *card) * SDR104 mode SD-cards. Note that tuning is mandatory for SDR104. */ if (!mmc_host_is_spi(card->host) && - (card->sd_bus_speed == UHS_SDR50_BUS_SPEED || - card->sd_bus_speed == UHS_DDR50_BUS_SPEED || - card->sd_bus_speed == UHS_SDR104_BUS_SPEED)) { + (card->host->ios.timing == MMC_TIMING_UHS_SDR50 || + card->host->ios.timing == MMC_TIMING_UHS_DDR50 || + card->host->ios.timing == MMC_TIMING_UHS_SDR104)) { err = mmc_execute_tuning(card); /* @@ -638,7 +650,7 @@ static int mmc_sd_init_uhs_card(struct mmc_card *card) * difference between v3.00 and 3.01 spec means that CMD19 * tuning is also available for DDR50 mode. 
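The sd_set_current_limit() change above can be restated compactly: the host now requests the highest limit that it can supply and that the card actually advertises in its switch status, instead of blindly asking for the host maximum. A standalone sketch, where the CARD_* masks stand in for the SD_MAX_CURRENT_* bits collected into sd3_curr_limit:

#include <stdint.h>
#include <stdio.h>

#define CARD_200MA (1u << 0)
#define CARD_400MA (1u << 1)
#define CARD_600MA (1u << 2)
#define CARD_800MA (1u << 3)

static int pick_current_limit(unsigned int host_max_ma, uint32_t card_mask)
{
	if (host_max_ma >= 800 && (card_mask & CARD_800MA))
		return 800;
	if (host_max_ma >= 600 && (card_mask & CARD_600MA))
		return 600;
	if (host_max_ma >= 400 && (card_mask & CARD_400MA))
		return 400;
	if (host_max_ma >= 200 && (card_mask & CARD_200MA))
		return 200;
	return 0;	/* leave the card at its default limit */
}

int main(void)
{
	/* host supplies 800mA but the card only advertises 200mA:
	 * request 200 rather than have the card reject 800 and stay
	 * at its default, as seen on the SanDisk cards noted above */
	printf("%d\n", pick_current_limit(800, CARD_200MA));
	return 0;
}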
*/ - if (err && card->sd_bus_speed == UHS_DDR50_BUS_SPEED) { + if (err && card->host->ios.timing == MMC_TIMING_UHS_DDR50) { pr_warn("%s: ddr50 tuning failed\n", mmc_hostname(card->host)); err = 0; diff --git a/drivers/mmc/core/sdio.c b/drivers/mmc/core/sdio.c index d61ba1a0495e..467b3cf80c44 100644 --- a/drivers/mmc/core/sdio.c +++ b/drivers/mmc/core/sdio.c @@ -535,8 +535,8 @@ static int mmc_sdio_init_uhs_card(struct mmc_card *card) * SDR104 mode SD-cards. Note that tuning is mandatory for SDR104. */ if (!mmc_host_is_spi(card->host) && - ((card->sw_caps.sd3_bus_mode & SD_MODE_UHS_SDR50) || - (card->sw_caps.sd3_bus_mode & SD_MODE_UHS_SDR104))) + ((card->host->ios.timing == MMC_TIMING_UHS_SDR50) || + (card->host->ios.timing == MMC_TIMING_UHS_SDR104))) err = mmc_execute_tuning(card); out: return err; diff --git a/drivers/mmc/core/sdio_cis.c b/drivers/mmc/core/sdio_cis.c index 8e94e555b788..6f6fc527a263 100644 --- a/drivers/mmc/core/sdio_cis.c +++ b/drivers/mmc/core/sdio_cis.c @@ -223,6 +223,7 @@ static const struct cis_tpl cis_tpl_list[] = { { 0x20, 4, cistpl_manfid }, { 0x21, 2, /* cistpl_funcid */ }, { 0x22, 0, cistpl_funce }, + { 0x91, 2, /* cistpl_sdio_std */ }, }; static int sdio_read_cis(struct mmc_card *card, struct sdio_func *func) diff --git a/drivers/mmc/host/mmci.c b/drivers/mmc/host/mmci.c index fb266745f824..0d6ca4116f3d 100644 --- a/drivers/mmc/host/mmci.c +++ b/drivers/mmc/host/mmci.c @@ -151,6 +151,7 @@ static struct variant_data variant_nomadik = { .fifosize = 16 * 4, .fifohalfsize = 8 * 4, .clkreg = MCI_CLK_ENABLE, + .clkreg_8bit_bus_enable = MCI_ST_8BIT_BUS, .datalength_bits = 24, .datactrl_mask_sdio = MCI_ST_DPSM_SDIOEN, .st_sdio = true, @@ -1886,7 +1887,7 @@ static struct amba_id mmci_ids[] = { { .id = 0x00280180, .mask = 0x00ffffff, - .data = &variant_u300, + .data = &variant_nomadik, }, { .id = 0x00480180, diff --git a/drivers/mmc/host/tmio_mmc_dma.c b/drivers/mmc/host/tmio_mmc_dma.c index e4b05dbb9ca8..4a0d6b80eaa3 100644 --- a/drivers/mmc/host/tmio_mmc_dma.c +++ b/drivers/mmc/host/tmio_mmc_dma.c @@ -94,9 +94,9 @@ static void tmio_mmc_start_dma_rx(struct tmio_mmc_host *host) desc = NULL; ret = cookie; } + dev_dbg(&host->pdev->dev, "%s(): mapped %d -> %d, cookie %d, rq %p\n", + __func__, host->sg_len, ret, cookie, host->mrq); } - dev_dbg(&host->pdev->dev, "%s(): mapped %d -> %d, cookie %d, rq %p\n", - __func__, host->sg_len, ret, cookie, host->mrq); pio: if (!desc) { @@ -116,8 +116,8 @@ pio: "DMA failed: %d, falling back to PIO\n", ret); } - dev_dbg(&host->pdev->dev, "%s(): desc %p, cookie %d, sg[%d]\n", __func__, - desc, cookie, host->sg_len); + dev_dbg(&host->pdev->dev, "%s(): desc %p, sg[%d]\n", __func__, + desc, host->sg_len); } static void tmio_mmc_start_dma_tx(struct tmio_mmc_host *host) @@ -174,9 +174,9 @@ static void tmio_mmc_start_dma_tx(struct tmio_mmc_host *host) desc = NULL; ret = cookie; } + dev_dbg(&host->pdev->dev, "%s(): mapped %d -> %d, cookie %d, rq %p\n", + __func__, host->sg_len, ret, cookie, host->mrq); } - dev_dbg(&host->pdev->dev, "%s(): mapped %d -> %d, cookie %d, rq %p\n", - __func__, host->sg_len, ret, cookie, host->mrq); pio: if (!desc) { @@ -196,8 +196,7 @@ pio: "DMA failed: %d, falling back to PIO\n", ret); } - dev_dbg(&host->pdev->dev, "%s(): desc %p, cookie %d\n", __func__, - desc, cookie); + dev_dbg(&host->pdev->dev, "%s(): desc %p\n", __func__, desc); } void tmio_mmc_start_dma(struct tmio_mmc_host *host, diff --git a/drivers/mtd/ubi/cdev.c b/drivers/mtd/ubi/cdev.c index 54e056d3be02..ee2b74d1d1b5 100644 --- a/drivers/mtd/ubi/cdev.c +++ 
b/drivers/mtd/ubi/cdev.c @@ -174,9 +174,9 @@ static int vol_cdev_fsync(struct file *file, loff_t start, loff_t end, struct ubi_device *ubi = desc->vol->ubi; struct inode *inode = file_inode(file); int err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = ubi_sync(ubi->ubi_num); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index 2c2baab9d880..d66c690a8597 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -157,6 +157,7 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags) [29] = "802.1ad offload support", [31] = "Modifying loopback source checks using UPDATE_QP support", [32] = "Loopback source checks support", + [33] = "RoCEv2 support" }; int i; @@ -626,6 +627,8 @@ out: return err; } +static void disable_unsupported_roce_caps(void *buf); + int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) { struct mlx4_cmd_mailbox *mailbox; @@ -738,6 +741,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) if (err) goto out; + if (mlx4_is_mfunc(dev)) + disable_unsupported_roce_caps(outbox); MLX4_GET(field, outbox, QUERY_DEV_CAP_RSVD_QP_OFFSET); dev_cap->reserved_qps = 1 << (field & 0xf); MLX4_GET(field, outbox, QUERY_DEV_CAP_MAX_QP_OFFSET); @@ -905,6 +910,8 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_EQE_STRIDE; MLX4_GET(dev_cap->bmme_flags, outbox, QUERY_DEV_CAP_BMME_FLAGS_OFFSET); + if (dev_cap->bmme_flags & MLX4_FLAG_ROCE_V1_V2) + dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_ROCE_V1_V2; if (dev_cap->bmme_flags & MLX4_FLAG_PORT_REMAP) dev_cap->flags2 |= MLX4_DEV_CAP_FLAG2_PORT_REMAP; MLX4_GET(field, outbox, QUERY_DEV_CAP_CONFIG_DEV_OFFSET); @@ -1161,6 +1168,7 @@ int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave, if (err) return err; + disable_unsupported_roce_caps(outbox->buf); /* add port mng change event capability and disable mw type 1 * unconditionally to slaves */ @@ -1258,6 +1266,21 @@ int mlx4_QUERY_DEV_CAP_wrapper(struct mlx4_dev *dev, int slave, return 0; } +static void disable_unsupported_roce_caps(void *buf) +{ + u32 flags; + + MLX4_GET(flags, buf, QUERY_DEV_CAP_EXT_FLAGS_OFFSET); + flags &= ~(1UL << 31); + MLX4_PUT(buf, flags, QUERY_DEV_CAP_EXT_FLAGS_OFFSET); + MLX4_GET(flags, buf, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET); + flags &= ~(1UL << 24); + MLX4_PUT(buf, flags, QUERY_DEV_CAP_EXT_2_FLAGS_OFFSET); + MLX4_GET(flags, buf, QUERY_DEV_CAP_BMME_FLAGS_OFFSET); + flags &= ~(MLX4_FLAG_ROCE_V1_V2); + MLX4_PUT(buf, flags, QUERY_DEV_CAP_BMME_FLAGS_OFFSET); +} + int mlx4_QUERY_PORT_wrapper(struct mlx4_dev *dev, int slave, struct mlx4_vhcr *vhcr, struct mlx4_cmd_mailbox *inbox, @@ -2239,7 +2262,8 @@ struct mlx4_config_dev { __be32 rsvd1[3]; __be16 vxlan_udp_dport; __be16 rsvd2; - __be32 rsvd3; + __be16 roce_v2_entropy; + __be16 roce_v2_udp_dport; __be32 roce_flags; __be32 rsvd4[25]; __be16 rsvd5; @@ -2248,6 +2272,7 @@ struct mlx4_config_dev { }; #define MLX4_VXLAN_UDP_DPORT (1 << 0) +#define MLX4_ROCE_V2_UDP_DPORT BIT(3) #define MLX4_DISABLE_RX_PORT BIT(18) static int mlx4_CONFIG_DEV_set(struct mlx4_dev *dev, struct mlx4_config_dev *config_dev) @@ -2365,6 +2390,18 @@ int mlx4_disable_rx_port_check(struct mlx4_dev *dev, bool dis) return mlx4_CONFIG_DEV_set(dev, &config_dev); } +int mlx4_config_roce_v2_port(struct mlx4_dev *dev, u16 udp_port) +{ + struct mlx4_config_dev config_dev; + + 
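RoCEv2 traffic is carried over the IANA-assigned UDP destination port 4791, so a consumer such as the mlx4 IB driver would be expected to program that value once while bringing a port up. A hypothetical caller of the helper being added here, guarded by the new capability bit (ROCE_V2_UDP_DPORT is an assumed constant, not part of this patch):

#define ROCE_V2_UDP_DPORT 4791	/* IANA-assigned RoCEv2 UDP port */

static int enable_roce_v2(struct mlx4_dev *dev)
{
	/* skip silently when the firmware only speaks RoCE v1 */
	if (!(dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2))
		return 0;

	return mlx4_config_roce_v2_port(dev, ROCE_V2_UDP_DPORT);
}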
memset(&config_dev, 0, sizeof(config_dev)); + config_dev.update_flags = cpu_to_be32(MLX4_ROCE_V2_UDP_DPORT); + config_dev.roce_v2_udp_dport = cpu_to_be16(udp_port); + + return mlx4_CONFIG_DEV_set(dev, &config_dev); +} +EXPORT_SYMBOL_GPL(mlx4_config_roce_v2_port); + int mlx4_virt2phy_port_map(struct mlx4_dev *dev, u32 port1, u32 port2) { struct mlx4_cmd_mailbox *mailbox; diff --git a/drivers/net/ethernet/mellanox/mlx4/mlx4.h b/drivers/net/ethernet/mellanox/mlx4/mlx4.h index 2404c22ad2b2..7baef52db6b7 100644 --- a/drivers/net/ethernet/mellanox/mlx4/mlx4.h +++ b/drivers/net/ethernet/mellanox/mlx4/mlx4.h @@ -780,7 +780,10 @@ struct mlx4_set_port_general_context { u16 reserved1; u8 v_ignore_fcs; u8 flags; - u8 ignore_fcs; + union { + u8 ignore_fcs; + u8 roce_mode; + }; u8 reserved2; __be16 mtu; u8 pptx; diff --git a/drivers/net/ethernet/mellanox/mlx4/port.c b/drivers/net/ethernet/mellanox/mlx4/port.c index f2550425c251..787b7bb54d52 100644 --- a/drivers/net/ethernet/mellanox/mlx4/port.c +++ b/drivers/net/ethernet/mellanox/mlx4/port.c @@ -1520,6 +1520,8 @@ int mlx4_SET_PORT(struct mlx4_dev *dev, u8 port, int pkey_tbl_sz) return err; } +#define SET_PORT_ROCE_2_FLAGS 0x10 +#define MLX4_SET_PORT_ROCE_V1_V2 0x2 int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu, u8 pptx, u8 pfctx, u8 pprx, u8 pfcrx) { @@ -1539,6 +1541,11 @@ int mlx4_SET_PORT_general(struct mlx4_dev *dev, u8 port, int mtu, context->pprx = (pprx * (!pfcrx)) << 7; context->pfcrx = pfcrx; + if (dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) { + context->flags |= SET_PORT_ROCE_2_FLAGS; + context->roce_mode |= + MLX4_SET_PORT_ROCE_V1_V2 << 4; + } in_mod = MLX4_SET_PORT_GENERAL << 8 | port; err = mlx4_cmd(dev, mailbox->dma, in_mod, MLX4_SET_PORT_ETH_OPCODE, MLX4_CMD_SET_PORT, MLX4_CMD_TIME_CLASS_B, diff --git a/drivers/net/ethernet/mellanox/mlx4/qp.c b/drivers/net/ethernet/mellanox/mlx4/qp.c index 168823dde79f..d1cd9c32a9ae 100644 --- a/drivers/net/ethernet/mellanox/mlx4/qp.c +++ b/drivers/net/ethernet/mellanox/mlx4/qp.c @@ -167,6 +167,12 @@ static int __mlx4_qp_modify(struct mlx4_dev *dev, struct mlx4_mtt *mtt, context->log_page_size = mtt->page_shift - MLX4_ICM_PAGE_SHIFT; } + if ((cur_state == MLX4_QP_STATE_RTR) && + (new_state == MLX4_QP_STATE_RTS) && + dev->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2) + context->roce_entropy = + cpu_to_be16(mlx4_qp_roce_entropy(dev, qp->qpn)); + *(__be32 *) mailbox->buf = cpu_to_be32(optpar); memcpy(mailbox->buf + 8, context, sizeof *context); @@ -921,3 +927,23 @@ int mlx4_qp_to_ready(struct mlx4_dev *dev, struct mlx4_mtt *mtt, return 0; } EXPORT_SYMBOL_GPL(mlx4_qp_to_ready); + +u16 mlx4_qp_roce_entropy(struct mlx4_dev *dev, u32 qpn) +{ + struct mlx4_qp_context context; + struct mlx4_qp qp; + int err; + + qp.qpn = qpn; + err = mlx4_qp_query(dev, &qp, &context); + if (!err) { + u32 dest_qpn = be32_to_cpu(context.remote_qpn) & 0xffffff; + u16 folded_dst = folded_qp(dest_qpn); + u16 folded_src = folded_qp(qpn); + + return (dest_qpn != qpn) ? 
+ ((folded_dst ^ folded_src) | 0xC000) : + folded_src | 0xC000; + } + return 0xdead; +} diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h b/drivers/net/ethernet/mellanox/mlx5/core/en.h index 9ea49a893323..aac071a7e830 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en.h +++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h @@ -39,8 +39,8 @@ #include <linux/mlx5/qp.h> #include <linux/mlx5/cq.h> #include <linux/mlx5/vport.h> +#include <linux/mlx5/transobj.h> #include "wq.h" -#include "transobj.h" #include "mlx5_core.h" #define MLX5E_MAX_NUM_TC 8 diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c index c56d91a2812b..6a3e430f1062 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c @@ -2241,7 +2241,7 @@ static void *mlx5e_create_netdev(struct mlx5_core_dev *mdev) goto err_unmap_free_uar; } - err = mlx5_alloc_transport_domain(mdev, &priv->tdn); + err = mlx5_core_alloc_transport_domain(mdev, &priv->tdn); if (err) { mlx5_core_err(mdev, "alloc td failed, %d\n", err); goto err_dealloc_pd; @@ -2324,7 +2324,7 @@ err_destroy_mkey: mlx5_core_destroy_mkey(mdev, &priv->mr); err_dealloc_transport_domain: - mlx5_dealloc_transport_domain(mdev, priv->tdn); + mlx5_core_dealloc_transport_domain(mdev, priv->tdn); err_dealloc_pd: mlx5_core_dealloc_pd(mdev, priv->pdn); @@ -2356,7 +2356,7 @@ static void mlx5e_destroy_netdev(struct mlx5_core_dev *mdev, void *vpriv) mlx5e_close_drop_rq(priv); mlx5e_destroy_tises(priv); mlx5_core_destroy_mkey(priv->mdev, &priv->mr); - mlx5_dealloc_transport_domain(priv->mdev, priv->tdn); + mlx5_core_dealloc_transport_domain(priv->mdev, priv->tdn); mlx5_core_dealloc_pd(priv->mdev, priv->pdn); mlx5_unmap_free_uar(priv->mdev, &priv->cq_uar); free_netdev(netdev); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/eq.c b/drivers/net/ethernet/mellanox/mlx5/core/eq.c index 23c244a7e5d7..647a3ca2c2a9 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/eq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/eq.c @@ -230,6 +230,7 @@ static int mlx5_eq_int(struct mlx5_core_dev *dev, struct mlx5_eq *eq) case MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR: case MLX5_EVENT_TYPE_WQ_ACCESS_ERROR: rsn = be32_to_cpu(eqe->data.qp_srq.qp_srq_n) & 0xffffff; + rsn |= (eqe->data.qp_srq.type << MLX5_USER_INDEX_LEN); mlx5_core_dbg(dev, "event %s(%d) arrived on resource 0x%x\n", eqe_type_str(eqe->type), eqe->type, rsn); mlx5_rsc_event(dev, rsn, eqe->type); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/main.c b/drivers/net/ethernet/mellanox/mlx5/core/main.c index b37749a3730e..1545a944c309 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/main.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/main.c @@ -78,6 +78,11 @@ struct mlx5_device_context { void *context; }; +enum { + MLX5_ATOMIC_REQ_MODE_BE = 0x0, + MLX5_ATOMIC_REQ_MODE_HOST_ENDIANNESS = 0x1, +}; + static struct mlx5_profile profile[] = { [0] = { .mask = 0, @@ -387,7 +392,7 @@ query_ex: return err; } -static int set_caps(struct mlx5_core_dev *dev, void *in, int in_sz) +static int set_caps(struct mlx5_core_dev *dev, void *in, int in_sz, int opmod) { u32 out[MLX5_ST_SZ_DW(set_hca_cap_out)]; int err; @@ -395,6 +400,7 @@ static int set_caps(struct mlx5_core_dev *dev, void *in, int in_sz) memset(out, 0, sizeof(out)); MLX5_SET(set_hca_cap_in, in, opcode, MLX5_CMD_OP_SET_HCA_CAP); + MLX5_SET(set_hca_cap_in, in, op_mod, opmod << 1); err = mlx5_cmd_exec(dev, in, in_sz, out, sizeof(out)); if (err) return err; @@ -404,6 +410,46 @@ 
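The eq.c change above widens the resource serial number: the queue type is folded into the bits above the user index, so QPs, RQs and SQs can share one radix tree (see the qp.c changes further below). A standalone sketch of the packing, assuming MLX5_USER_INDEX_LEN is 24:

#include <stdint.h>
#include <stdio.h>

#define USER_INDEX_LEN 24	/* assumed value of MLX5_USER_INDEX_LEN */

static uint32_t rsn_encode(uint32_t qpn, uint32_t type)
{
	return (qpn & 0xffffff) | (type << USER_INDEX_LEN);
}

static void rsn_decode(uint32_t rsn, uint32_t *qpn, uint32_t *type)
{
	*qpn = rsn & 0xffffff;
	*type = rsn >> USER_INDEX_LEN;
}

int main(void)
{
	uint32_t qpn, type;

	rsn_decode(rsn_encode(0x1234, 1), &qpn, &type);
	printf("qpn=0x%x type=%u\n", qpn, type);	/* qpn=0x1234 type=1 */
	return 0;
}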
static int set_caps(struct mlx5_core_dev *dev, void *in, int in_sz) return err; } +static int handle_hca_cap_atomic(struct mlx5_core_dev *dev) +{ + void *set_ctx; + void *set_hca_cap; + int set_sz = MLX5_ST_SZ_BYTES(set_hca_cap_in); + int req_endianness; + int err; + + if (MLX5_CAP_GEN(dev, atomic)) { + err = mlx5_core_get_caps(dev, MLX5_CAP_ATOMIC, + HCA_CAP_OPMOD_GET_CUR); + if (err) + return err; + } else { + return 0; + } + + req_endianness = + MLX5_CAP_ATOMIC(dev, + supported_atomic_req_8B_endianess_mode_1); + + if (req_endianness != MLX5_ATOMIC_REQ_MODE_HOST_ENDIANNESS) + return 0; + + set_ctx = kzalloc(set_sz, GFP_KERNEL); + if (!set_ctx) + return -ENOMEM; + + set_hca_cap = MLX5_ADDR_OF(set_hca_cap_in, set_ctx, capability); + + /* Set requestor to host endianness */ + MLX5_SET(atomic_caps, set_hca_cap, atomic_req_8B_endianess_mode, + MLX5_ATOMIC_REQ_MODE_HOST_ENDIANNESS); + + err = set_caps(dev, set_ctx, set_sz, MLX5_SET_HCA_CAP_OP_MOD_ATOMIC); + + kfree(set_ctx); + return err; +} + static int handle_hca_cap(struct mlx5_core_dev *dev) { void *set_ctx = NULL; @@ -445,7 +491,8 @@ static int handle_hca_cap(struct mlx5_core_dev *dev) MLX5_SET(cmd_hca_cap, set_hca_cap, log_uar_page_sz, PAGE_SHIFT - 12); - err = set_caps(dev, set_ctx, set_sz); + err = set_caps(dev, set_ctx, set_sz, + MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE); query_ex: kfree(set_ctx); @@ -667,7 +714,6 @@ clean: return err; } -#ifdef CONFIG_MLX5_CORE_EN static int mlx5_core_set_issi(struct mlx5_core_dev *dev) { u32 query_in[MLX5_ST_SZ_DW(query_issi_in)]; @@ -720,7 +766,6 @@ static int mlx5_core_set_issi(struct mlx5_core_dev *dev) return -ENOTSUPP; } -#endif static int map_bf_area(struct mlx5_core_dev *dev) { @@ -966,13 +1011,11 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv) goto err_pagealloc_cleanup; } -#ifdef CONFIG_MLX5_CORE_EN err = mlx5_core_set_issi(dev); if (err) { dev_err(&pdev->dev, "failed to set issi\n"); goto err_disable_hca; } -#endif err = mlx5_satisfy_startup_pages(dev, 1); if (err) { @@ -992,6 +1035,12 @@ static int mlx5_load_one(struct mlx5_core_dev *dev, struct mlx5_priv *priv) goto reclaim_boot_pages; } + err = handle_hca_cap_atomic(dev); + if (err) { + dev_err(&pdev->dev, "handle_hca_cap_atomic failed\n"); + goto reclaim_boot_pages; + } + err = mlx5_satisfy_startup_pages(dev, 0); if (err) { dev_err(&pdev->dev, "failed to allocate init pages\n"); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/qp.c b/drivers/net/ethernet/mellanox/mlx5/core/qp.c index 30e2ba3f5f16..def289375ecb 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/qp.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/qp.c @@ -36,6 +36,7 @@ #include <linux/mlx5/cmd.h> #include <linux/mlx5/qp.h> #include <linux/mlx5/driver.h> +#include <linux/mlx5/transobj.h> #include "mlx5_core.h" @@ -67,6 +68,52 @@ void mlx5_core_put_rsc(struct mlx5_core_rsc_common *common) complete(&common->free); } +static u64 qp_allowed_event_types(void) +{ + u64 mask; + + mask = BIT(MLX5_EVENT_TYPE_PATH_MIG) | + BIT(MLX5_EVENT_TYPE_COMM_EST) | + BIT(MLX5_EVENT_TYPE_SQ_DRAINED) | + BIT(MLX5_EVENT_TYPE_SRQ_LAST_WQE) | + BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR) | + BIT(MLX5_EVENT_TYPE_PATH_MIG_FAILED) | + BIT(MLX5_EVENT_TYPE_WQ_INVAL_REQ_ERROR) | + BIT(MLX5_EVENT_TYPE_WQ_ACCESS_ERROR); + + return mask; +} + +static u64 rq_allowed_event_types(void) +{ + u64 mask; + + mask = BIT(MLX5_EVENT_TYPE_SRQ_LAST_WQE) | + BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR); + + return mask; +} + +static u64 sq_allowed_event_types(void) +{ + return 
BIT(MLX5_EVENT_TYPE_WQ_CATAS_ERROR); +} + +static bool is_event_type_allowed(int rsc_type, int event_type) +{ + switch (rsc_type) { + case MLX5_EVENT_QUEUE_TYPE_QP: + return BIT(event_type) & qp_allowed_event_types(); + case MLX5_EVENT_QUEUE_TYPE_RQ: + return BIT(event_type) & rq_allowed_event_types(); + case MLX5_EVENT_QUEUE_TYPE_SQ: + return BIT(event_type) & sq_allowed_event_types(); + default: + WARN(1, "Event arrived for unknown resource type"); + return false; + } +} + void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type) { struct mlx5_core_rsc_common *common = mlx5_get_rsc(dev, rsn); @@ -75,8 +122,16 @@ void mlx5_rsc_event(struct mlx5_core_dev *dev, u32 rsn, int event_type) if (!common) return; + if (!is_event_type_allowed((rsn >> MLX5_USER_INDEX_LEN), event_type)) { + mlx5_core_warn(dev, "event 0x%.2x is not allowed on resource 0x%.8x\n", + event_type, rsn); + return; + } + switch (common->res) { case MLX5_RES_QP: + case MLX5_RES_RQ: + case MLX5_RES_SQ: qp = (struct mlx5_core_qp *)common; qp->event(qp, event_type); break; @@ -177,27 +232,56 @@ void mlx5_eq_pagefault(struct mlx5_core_dev *dev, struct mlx5_eqe *eqe) } #endif +static int create_qprqsq_common(struct mlx5_core_dev *dev, + struct mlx5_core_qp *qp, + int rsc_type) +{ + struct mlx5_qp_table *table = &dev->priv.qp_table; + int err; + + qp->common.res = rsc_type; + spin_lock_irq(&table->lock); + err = radix_tree_insert(&table->tree, + qp->qpn | (rsc_type << MLX5_USER_INDEX_LEN), + qp); + spin_unlock_irq(&table->lock); + if (err) + return err; + + atomic_set(&qp->common.refcount, 1); + init_completion(&qp->common.free); + qp->pid = current->pid; + + return 0; +} + +static void destroy_qprqsq_common(struct mlx5_core_dev *dev, + struct mlx5_core_qp *qp) +{ + struct mlx5_qp_table *table = &dev->priv.qp_table; + unsigned long flags; + + spin_lock_irqsave(&table->lock, flags); + radix_tree_delete(&table->tree, + qp->qpn | (qp->common.res << MLX5_USER_INDEX_LEN)); + spin_unlock_irqrestore(&table->lock, flags); + mlx5_core_put_rsc((struct mlx5_core_rsc_common *)qp); + wait_for_completion(&qp->common.free); +} + int mlx5_core_create_qp(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, struct mlx5_create_qp_mbox_in *in, int inlen) { - struct mlx5_qp_table *table = &dev->priv.qp_table; struct mlx5_create_qp_mbox_out out; struct mlx5_destroy_qp_mbox_in din; struct mlx5_destroy_qp_mbox_out dout; int err; - void *qpc; memset(&out, 0, sizeof(out)); in->hdr.opcode = cpu_to_be16(MLX5_CMD_OP_CREATE_QP); - if (dev->issi) { - qpc = MLX5_ADDR_OF(create_qp_in, in, qpc); - /* 0xffffff means we ask to work with cqe version 0 */ - MLX5_SET(qpc, qpc, user_index, 0xffffff); - } - err = mlx5_cmd_exec(dev, in, inlen, &out, sizeof(out)); if (err) { mlx5_core_warn(dev, "ret %d\n", err); @@ -213,24 +297,16 @@ int mlx5_core_create_qp(struct mlx5_core_dev *dev, qp->qpn = be32_to_cpu(out.qpn) & 0xffffff; mlx5_core_dbg(dev, "qpn = 0x%x\n", qp->qpn); - qp->common.res = MLX5_RES_QP; - spin_lock_irq(&table->lock); - err = radix_tree_insert(&table->tree, qp->qpn, qp); - spin_unlock_irq(&table->lock); - if (err) { - mlx5_core_warn(dev, "err %d\n", err); + err = create_qprqsq_common(dev, qp, MLX5_RES_QP); + if (err) goto err_cmd; - } err = mlx5_debug_qp_add(dev, qp); if (err) mlx5_core_dbg(dev, "failed adding QP 0x%x to debug file system\n", qp->qpn); - qp->pid = current->pid; - atomic_set(&qp->common.refcount, 1); atomic_inc(&dev->num_qps); - init_completion(&qp->common.free); return 0; @@ -250,18 +326,11 @@ int mlx5_core_destroy_qp(struct 
mlx5_core_dev *dev, { struct mlx5_destroy_qp_mbox_in in; struct mlx5_destroy_qp_mbox_out out; - struct mlx5_qp_table *table = &dev->priv.qp_table; - unsigned long flags; int err; mlx5_debug_qp_remove(dev, qp); - spin_lock_irqsave(&table->lock, flags); - radix_tree_delete(&table->tree, qp->qpn); - spin_unlock_irqrestore(&table->lock, flags); - - mlx5_core_put_rsc((struct mlx5_core_rsc_common *)qp); - wait_for_completion(&qp->common.free); + destroy_qprqsq_common(dev, qp); memset(&in, 0, sizeof(in)); memset(&out, 0, sizeof(out)); @@ -279,59 +348,15 @@ int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, } EXPORT_SYMBOL_GPL(mlx5_core_destroy_qp); -int mlx5_core_qp_modify(struct mlx5_core_dev *dev, enum mlx5_qp_state cur_state, - enum mlx5_qp_state new_state, +int mlx5_core_qp_modify(struct mlx5_core_dev *dev, u16 operation, struct mlx5_modify_qp_mbox_in *in, int sqd_event, struct mlx5_core_qp *qp) { - static const u16 optab[MLX5_QP_NUM_STATE][MLX5_QP_NUM_STATE] = { - [MLX5_QP_STATE_RST] = { - [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, - [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, - [MLX5_QP_STATE_INIT] = MLX5_CMD_OP_RST2INIT_QP, - }, - [MLX5_QP_STATE_INIT] = { - [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, - [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, - [MLX5_QP_STATE_INIT] = MLX5_CMD_OP_INIT2INIT_QP, - [MLX5_QP_STATE_RTR] = MLX5_CMD_OP_INIT2RTR_QP, - }, - [MLX5_QP_STATE_RTR] = { - [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, - [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, - [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTR2RTS_QP, - }, - [MLX5_QP_STATE_RTS] = { - [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, - [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, - [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_RTS2RTS_QP, - }, - [MLX5_QP_STATE_SQD] = { - [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, - [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, - }, - [MLX5_QP_STATE_SQER] = { - [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, - [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, - [MLX5_QP_STATE_RTS] = MLX5_CMD_OP_SQERR2RTS_QP, - }, - [MLX5_QP_STATE_ERR] = { - [MLX5_QP_STATE_RST] = MLX5_CMD_OP_2RST_QP, - [MLX5_QP_STATE_ERR] = MLX5_CMD_OP_2ERR_QP, - } - }; - struct mlx5_modify_qp_mbox_out out; int err = 0; - u16 op; - - if (cur_state >= MLX5_QP_NUM_STATE || new_state >= MLX5_QP_NUM_STATE || - !optab[cur_state][new_state]) - return -EINVAL; memset(&out, 0, sizeof(out)); - op = optab[cur_state][new_state]; - in->hdr.opcode = cpu_to_be16(op); + in->hdr.opcode = cpu_to_be16(operation); in->qpn = cpu_to_be32(qp->qpn); err = mlx5_cmd_exec(dev, in, sizeof(*in), &out, sizeof(out)); if (err) @@ -449,3 +474,67 @@ int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn, } EXPORT_SYMBOL_GPL(mlx5_core_page_fault_resume); #endif + +int mlx5_core_create_rq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen, + struct mlx5_core_qp *rq) +{ + int err; + u32 rqn; + + err = mlx5_core_create_rq(dev, in, inlen, &rqn); + if (err) + return err; + + rq->qpn = rqn; + err = create_qprqsq_common(dev, rq, MLX5_RES_RQ); + if (err) + goto err_destroy_rq; + + return 0; + +err_destroy_rq: + mlx5_core_destroy_rq(dev, rq->qpn); + + return err; +} +EXPORT_SYMBOL(mlx5_core_create_rq_tracked); + +void mlx5_core_destroy_rq_tracked(struct mlx5_core_dev *dev, + struct mlx5_core_qp *rq) +{ + destroy_qprqsq_common(dev, rq); + mlx5_core_destroy_rq(dev, rq->qpn); +} +EXPORT_SYMBOL(mlx5_core_destroy_rq_tracked); + +int mlx5_core_create_sq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen, + struct mlx5_core_qp *sq) +{ + int err; + u32 sqn; + + err = mlx5_core_create_sq(dev, in, inlen, 
&sqn); + if (err) + return err; + + sq->qpn = sqn; + err = create_qprqsq_common(dev, sq, MLX5_RES_SQ); + if (err) + goto err_destroy_sq; + + return 0; + +err_destroy_sq: + mlx5_core_destroy_sq(dev, sq->qpn); + + return err; +} +EXPORT_SYMBOL(mlx5_core_create_sq_tracked); + +void mlx5_core_destroy_sq_tracked(struct mlx5_core_dev *dev, + struct mlx5_core_qp *sq) +{ + destroy_qprqsq_common(dev, sq); + mlx5_core_destroy_sq(dev, sq->qpn); +} +EXPORT_SYMBOL(mlx5_core_destroy_sq_tracked); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/srq.c b/drivers/net/ethernet/mellanox/mlx5/core/srq.c index ffada801976b..04bc522605a0 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/srq.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/srq.c @@ -37,7 +37,7 @@ #include <linux/mlx5/srq.h> #include <rdma/ib_verbs.h> #include "mlx5_core.h" -#include "transobj.h" +#include <linux/mlx5/transobj.h> void mlx5_srq_event(struct mlx5_core_dev *dev, u32 srqn, int event_type) { @@ -241,8 +241,6 @@ static int create_xrc_srq_cmd(struct mlx5_core_dev *dev, memcpy(xrc_srqc, srqc, MLX5_ST_SZ_BYTES(srqc)); memcpy(pas, in->pas, pas_size); - /* 0xffffff means we ask to work with cqe version 0 */ - MLX5_SET(xrc_srqc, xrc_srqc, user_index, 0xffffff); MLX5_SET(create_xrc_srq_in, create_in, opcode, MLX5_CMD_OP_CREATE_XRC_SRQ); diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c index d7068f54e800..03a5093ffeb7 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/transobj.c @@ -32,9 +32,9 @@ #include <linux/mlx5/driver.h> #include "mlx5_core.h" -#include "transobj.h" +#include <linux/mlx5/transobj.h> -int mlx5_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn) +int mlx5_core_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn) { u32 in[MLX5_ST_SZ_DW(alloc_transport_domain_in)]; u32 out[MLX5_ST_SZ_DW(alloc_transport_domain_out)]; @@ -53,8 +53,9 @@ int mlx5_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn) return err; } +EXPORT_SYMBOL(mlx5_core_alloc_transport_domain); -void mlx5_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn) +void mlx5_core_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn) { u32 in[MLX5_ST_SZ_DW(dealloc_transport_domain_in)]; u32 out[MLX5_ST_SZ_DW(dealloc_transport_domain_out)]; @@ -68,6 +69,7 @@ void mlx5_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn) mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out)); } +EXPORT_SYMBOL(mlx5_core_dealloc_transport_domain); int mlx5_core_create_rq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rqn) { @@ -94,6 +96,7 @@ int mlx5_core_modify_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *in, int inlen) memset(out, 0, sizeof(out)); return mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out)); } +EXPORT_SYMBOL(mlx5_core_modify_rq); void mlx5_core_destroy_rq(struct mlx5_core_dev *dev, u32 rqn) { @@ -108,6 +111,18 @@ void mlx5_core_destroy_rq(struct mlx5_core_dev *dev, u32 rqn) mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out)); } +int mlx5_core_query_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *out) +{ + u32 in[MLX5_ST_SZ_DW(query_rq_in)] = {0}; + int outlen = MLX5_ST_SZ_BYTES(query_rq_out); + + MLX5_SET(query_rq_in, in, opcode, MLX5_CMD_OP_QUERY_RQ); + MLX5_SET(query_rq_in, in, rqn, rqn); + + return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, outlen); +} +EXPORT_SYMBOL(mlx5_core_query_rq); + int mlx5_core_create_sq(struct mlx5_core_dev *dev, u32 *in, int 
inlen, u32 *sqn) { u32 out[MLX5_ST_SZ_DW(create_sq_out)]; @@ -133,6 +148,7 @@ int mlx5_core_modify_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *in, int inlen) memset(out, 0, sizeof(out)); return mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out)); } +EXPORT_SYMBOL(mlx5_core_modify_sq); void mlx5_core_destroy_sq(struct mlx5_core_dev *dev, u32 sqn) { @@ -147,6 +163,18 @@ void mlx5_core_destroy_sq(struct mlx5_core_dev *dev, u32 sqn) mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out)); } +int mlx5_core_query_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *out) +{ + u32 in[MLX5_ST_SZ_DW(query_sq_in)] = {0}; + int outlen = MLX5_ST_SZ_BYTES(query_sq_out); + + MLX5_SET(query_sq_in, in, opcode, MLX5_CMD_OP_QUERY_SQ); + MLX5_SET(query_sq_in, in, sqn, sqn); + + return mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, outlen); +} +EXPORT_SYMBOL(mlx5_core_query_sq); + int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *tirn) { @@ -162,6 +190,7 @@ int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen, return err; } +EXPORT_SYMBOL(mlx5_core_create_tir); int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in, int inlen) @@ -187,6 +216,7 @@ void mlx5_core_destroy_tir(struct mlx5_core_dev *dev, u32 tirn) mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out)); } +EXPORT_SYMBOL(mlx5_core_destroy_tir); int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *tisn) @@ -203,6 +233,19 @@ int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, int inlen, return err; } +EXPORT_SYMBOL(mlx5_core_create_tis); + +int mlx5_core_modify_tis(struct mlx5_core_dev *dev, u32 tisn, u32 *in, + int inlen) +{ + u32 out[MLX5_ST_SZ_DW(modify_tis_out)] = {0}; + + MLX5_SET(modify_tis_in, in, tisn, tisn); + MLX5_SET(modify_tis_in, in, opcode, MLX5_CMD_OP_MODIFY_TIS); + + return mlx5_cmd_exec_check_status(dev, in, inlen, out, sizeof(out)); +} +EXPORT_SYMBOL(mlx5_core_modify_tis); void mlx5_core_destroy_tis(struct mlx5_core_dev *dev, u32 tisn) { @@ -216,6 +259,7 @@ void mlx5_core_destroy_tis(struct mlx5_core_dev *dev, u32 tisn) mlx5_cmd_exec_check_status(dev, in, sizeof(in), out, sizeof(out)); } +EXPORT_SYMBOL(mlx5_core_destroy_tis); int mlx5_core_create_rmp(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rmpn) diff --git a/drivers/net/ethernet/mellanox/mlx5/core/vport.c b/drivers/net/ethernet/mellanox/mlx5/core/vport.c index 076197efea9b..c7398b95aecd 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/vport.c +++ b/drivers/net/ethernet/mellanox/mlx5/core/vport.c @@ -76,7 +76,7 @@ u8 mlx5_query_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport) return MLX5_GET(query_vport_state_out, out, admin_state); } -EXPORT_SYMBOL(mlx5_query_vport_admin_state); +EXPORT_SYMBOL_GPL(mlx5_query_vport_admin_state); int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, u16 vport, u8 state) @@ -104,7 +104,7 @@ int mlx5_modify_vport_admin_state(struct mlx5_core_dev *mdev, u8 opmod, return err; } -EXPORT_SYMBOL(mlx5_modify_vport_admin_state); +EXPORT_SYMBOL_GPL(mlx5_modify_vport_admin_state); static int mlx5_query_nic_vport_context(struct mlx5_core_dev *mdev, u16 vport, u32 *out, int outlen) @@ -151,12 +151,9 @@ int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, nic_vport_context.permanent_address); err = mlx5_query_nic_vport_context(mdev, vport, out, outlen); - if (err) - goto out; - - ether_addr_copy(addr, &out_addr[2]); + if (!err) + ether_addr_copy(addr, &out_addr[2]); -out: kvfree(out); 
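/* note: out may be vmalloc-backed (mlx5_vzalloc() falls back to vzalloc()), so it must always be freed with kvfree(), never kfree() */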
return err; } @@ -197,7 +194,7 @@ int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *mdev, return err; } -EXPORT_SYMBOL(mlx5_modify_nic_vport_mac_address); +EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_mac_address); int mlx5_query_nic_vport_mac_list(struct mlx5_core_dev *dev, u32 vport, @@ -430,6 +427,68 @@ int mlx5_modify_nic_vport_vlans(struct mlx5_core_dev *dev, } EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_vlans); +int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev, + u64 *system_image_guid) +{ + u32 *out; + int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + + out = mlx5_vzalloc(outlen); + if (!out) + return -ENOMEM; + + mlx5_query_nic_vport_context(mdev, 0, out, outlen); + + *system_image_guid = MLX5_GET64(query_nic_vport_context_out, out, + nic_vport_context.system_image_guid); + + kvfree(out); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_system_image_guid); + +int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid) +{ + u32 *out; + int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + + out = mlx5_vzalloc(outlen); + if (!out) + return -ENOMEM; + + mlx5_query_nic_vport_context(mdev, 0, out, outlen); + + *node_guid = MLX5_GET64(query_nic_vport_context_out, out, + nic_vport_context.node_guid); + + kvfree(out); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_node_guid); + +int mlx5_query_nic_vport_qkey_viol_cntr(struct mlx5_core_dev *mdev, + u16 *qkey_viol_cntr) +{ + u32 *out; + int outlen = MLX5_ST_SZ_BYTES(query_nic_vport_context_out); + + out = mlx5_vzalloc(outlen); + if (!out) + return -ENOMEM; + + mlx5_query_nic_vport_context(mdev, 0, out, outlen); + + *qkey_viol_cntr = MLX5_GET(query_nic_vport_context_out, out, + nic_vport_context.qkey_violation_counter); + + kvfree(out); + + return 0; +} +EXPORT_SYMBOL_GPL(mlx5_query_nic_vport_qkey_viol_cntr); + int mlx5_query_hca_vport_gid(struct mlx5_core_dev *dev, u8 other_vport, u8 port_num, u16 vf_num, u16 gid_index, union ib_gid *gid) @@ -750,3 +809,44 @@ int mlx5_modify_nic_vport_promisc(struct mlx5_core_dev *mdev, return err; } EXPORT_SYMBOL_GPL(mlx5_modify_nic_vport_promisc); + +enum mlx5_vport_roce_state { + MLX5_VPORT_ROCE_DISABLED = 0, + MLX5_VPORT_ROCE_ENABLED = 1, +}; + +static int mlx5_nic_vport_update_roce_state(struct mlx5_core_dev *mdev, + enum mlx5_vport_roce_state state) +{ + void *in; + int inlen = MLX5_ST_SZ_BYTES(modify_nic_vport_context_in); + int err; + + in = mlx5_vzalloc(inlen); + if (!in) { + mlx5_core_warn(mdev, "failed to allocate inbox\n"); + return -ENOMEM; + } + + MLX5_SET(modify_nic_vport_context_in, in, field_select.roce_en, 1); + MLX5_SET(modify_nic_vport_context_in, in, nic_vport_context.roce_en, + state); + + err = mlx5_modify_nic_vport_context(mdev, in, inlen); + + kvfree(in); + + return err; +} + +int mlx5_nic_vport_enable_roce(struct mlx5_core_dev *mdev) +{ + return mlx5_nic_vport_update_roce_state(mdev, MLX5_VPORT_ROCE_ENABLED); +} +EXPORT_SYMBOL_GPL(mlx5_nic_vport_enable_roce); + +int mlx5_nic_vport_disable_roce(struct mlx5_core_dev *mdev) +{ + return mlx5_nic_vport_update_roce_state(mdev, MLX5_VPORT_ROCE_DISABLED); +} +EXPORT_SYMBOL_GPL(mlx5_nic_vport_disable_roce); diff --git a/drivers/ntb/hw/Kconfig b/drivers/ntb/hw/Kconfig index 4d5535c4cddf..7116472b4625 100644 --- a/drivers/ntb/hw/Kconfig +++ b/drivers/ntb/hw/Kconfig @@ -1 +1,2 @@ +source "drivers/ntb/hw/amd/Kconfig" source "drivers/ntb/hw/intel/Kconfig" diff --git a/drivers/ntb/hw/Makefile b/drivers/ntb/hw/Makefile index 175d7c92a569..532e0859b4a1 100644 ---
a/drivers/ntb/hw/Makefile +++ b/drivers/ntb/hw/Makefile @@ -1 +1,2 @@ +obj-$(CONFIG_NTB_AMD) += amd/ obj-$(CONFIG_NTB_INTEL) += intel/ diff --git a/drivers/ntb/hw/amd/Kconfig b/drivers/ntb/hw/amd/Kconfig new file mode 100644 index 000000000000..cfe903cd9514 --- /dev/null +++ b/drivers/ntb/hw/amd/Kconfig @@ -0,0 +1,7 @@ +config NTB_AMD + tristate "AMD Non-Transparent Bridge support" + depends on X86_64 + help + This driver supports AMD NTB on capable Zeppelin hardware. + + If unsure, say N. diff --git a/drivers/ntb/hw/amd/Makefile b/drivers/ntb/hw/amd/Makefile new file mode 100644 index 000000000000..ad54da917563 --- /dev/null +++ b/drivers/ntb/hw/amd/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_NTB_AMD) += ntb_hw_amd.o diff --git a/drivers/ntb/hw/amd/ntb_hw_amd.c b/drivers/ntb/hw/amd/ntb_hw_amd.c new file mode 100644 index 000000000000..588803ad6847 --- /dev/null +++ b/drivers/ntb/hw/amd/ntb_hw_amd.c @@ -0,0 +1,1143 @@ +/* + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * BSD LICENSE + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of AMD Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * + * AMD PCIe NTB Linux driver + * + * Contact Information: + * Xiangliang Yu <Xiangliang.Yu@amd.com> + */ + +#include <linux/debugfs.h> +#include <linux/delay.h> +#include <linux/init.h> +#include <linux/interrupt.h> +#include <linux/module.h> +#include <linux/acpi.h> +#include <linux/pci.h> +#include <linux/random.h> +#include <linux/slab.h> +#include <linux/ntb.h> + +#include "ntb_hw_amd.h" + +#define NTB_NAME "ntb_hw_amd" +#define NTB_DESC "AMD(R) PCI-E Non-Transparent Bridge Driver" +#define NTB_VER "1.0" + +MODULE_DESCRIPTION(NTB_DESC); +MODULE_VERSION(NTB_VER); +MODULE_LICENSE("Dual BSD/GPL"); +MODULE_AUTHOR("AMD Inc."); + +static const struct file_operations amd_ntb_debugfs_info; +static struct dentry *debugfs_dir; + +static int ndev_mw_to_bar(struct amd_ntb_dev *ndev, int idx) +{ + if (idx < 0 || idx >= ndev->mw_count) + return -EINVAL; + + return 1 << idx; +} + +static int amd_ntb_mw_count(struct ntb_dev *ntb) +{ + return ntb_ndev(ntb)->mw_count; +} + +static int amd_ntb_mw_get_range(struct ntb_dev *ntb, int idx, + phys_addr_t *base, + resource_size_t *size, + resource_size_t *align, + resource_size_t *align_size) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + int bar; + + bar = ndev_mw_to_bar(ndev, idx); + if (bar < 0) + return bar; + + if (base) + *base = pci_resource_start(ndev->ntb.pdev, bar); + + if (size) + *size = pci_resource_len(ndev->ntb.pdev, bar); + + if (align) + *align = SZ_4K; + + if (align_size) + *align_size = 1; + + return 0; +} + +static int amd_ntb_mw_set_trans(struct ntb_dev *ntb, int idx, + dma_addr_t addr, resource_size_t size) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + unsigned long xlat_reg, limit_reg = 0; + resource_size_t mw_size; + void __iomem *mmio, *peer_mmio; + u64 base_addr, limit, reg_val; + int bar; + + bar = ndev_mw_to_bar(ndev, idx); + if (bar < 0) + return bar; + + mw_size = pci_resource_len(ndev->ntb.pdev, bar); + + /* make sure the range fits in the usable mw size */ + if (size > mw_size) + return -EINVAL; + + mmio = ndev->self_mmio; + peer_mmio = ndev->peer_mmio; + + base_addr = pci_resource_start(ndev->ntb.pdev, bar); + + if (bar != 1) { + xlat_reg = AMD_BAR23XLAT_OFFSET + ((bar - 2) << 3); + limit_reg = AMD_BAR23LMT_OFFSET + ((bar - 2) << 3); + + /* Set the limit if supported */ + limit = base_addr + size; + + /* set and verify setting the translation address */ + write64(addr, peer_mmio + xlat_reg); + reg_val = read64(peer_mmio + xlat_reg); + if (reg_val != addr) { + write64(0, peer_mmio + xlat_reg); + return -EIO; + } + + /* set and verify setting the limit */ + write64(limit, mmio + limit_reg); + reg_val = read64(mmio + limit_reg); + if (reg_val != limit) { + write64(base_addr, mmio + limit_reg); + write64(0, peer_mmio + xlat_reg); + return -EIO; + } + } else { + xlat_reg = AMD_BAR1XLAT_OFFSET; + limit_reg = AMD_BAR1LMT_OFFSET; + + /* split bar addr range must all be 32 bit */ + if (addr & (~0ull << 32)) + return -EINVAL; + if ((addr + size) & (~0ull << 32)) + return -EINVAL; + + /* Set the limit if supported */ + limit = base_addr + size; + + /* set and verify setting the translation address */ + write64(addr, peer_mmio + xlat_reg); + reg_val = read64(peer_mmio + xlat_reg); + if (reg_val != addr) { + write64(0, peer_mmio + xlat_reg); + return -EIO; + } + + /* set and verify setting the limit */ + writel(limit, mmio + limit_reg); + reg_val = readl(mmio + limit_reg); + if (reg_val != limit) { + writel(base_addr, mmio + limit_reg); + writel(0, peer_mmio + xlat_reg); + return -EIO; + } + } + + return 0; +} + +static int
amd_link_is_up(struct amd_ntb_dev *ndev) +{ + if (!ndev->peer_sta) + return NTB_LNK_STA_ACTIVE(ndev->cntl_sta); + + /* If peer_sta is a reset or D0 event, the ISR has already + * started a timer to check the hardware link status, so + * just clear the status bit here. If peer_sta is D3 or + * PME_TO, a D0/reset event will happen when the system + * wakes up or powers on, so do nothing here. + */ + if (ndev->peer_sta & AMD_PEER_RESET_EVENT) + ndev->peer_sta &= ~AMD_PEER_RESET_EVENT; + else if (ndev->peer_sta & AMD_PEER_D0_EVENT) + ndev->peer_sta = 0; + + return 0; +} + +static int amd_ntb_link_is_up(struct ntb_dev *ntb, + enum ntb_speed *speed, + enum ntb_width *width) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + int ret = 0; + + if (amd_link_is_up(ndev)) { + if (speed) + *speed = NTB_LNK_STA_SPEED(ndev->lnk_sta); + if (width) + *width = NTB_LNK_STA_WIDTH(ndev->lnk_sta); + + dev_dbg(ndev_dev(ndev), "link is up.\n"); + + ret = 1; + } else { + if (speed) + *speed = NTB_SPEED_NONE; + if (width) + *width = NTB_WIDTH_NONE; + + dev_dbg(ndev_dev(ndev), "link is down.\n"); + } + + return ret; +} + +static int amd_ntb_link_enable(struct ntb_dev *ntb, + enum ntb_speed max_speed, + enum ntb_width max_width) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + void __iomem *mmio = ndev->self_mmio; + u32 ntb_ctl; + + /* Enable event interrupt */ + ndev->int_mask &= ~AMD_EVENT_INTMASK; + writel(ndev->int_mask, mmio + AMD_INTMASK_OFFSET); + + if (ndev->ntb.topo == NTB_TOPO_SEC) + return -EINVAL; + dev_dbg(ndev_dev(ndev), "Enabling Link.\n"); + + ntb_ctl = readl(mmio + AMD_CNTL_OFFSET); + ntb_ctl |= (PMM_REG_CTL | SMM_REG_CTL); + writel(ntb_ctl, mmio + AMD_CNTL_OFFSET); + + return 0; +} + +static int amd_ntb_link_disable(struct ntb_dev *ntb) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + void __iomem *mmio = ndev->self_mmio; + u32 ntb_ctl; + + /* Disable event interrupt */ + ndev->int_mask |= AMD_EVENT_INTMASK; + writel(ndev->int_mask, mmio + AMD_INTMASK_OFFSET); + + if (ndev->ntb.topo == NTB_TOPO_SEC) + return -EINVAL; + dev_dbg(ndev_dev(ndev), "Disabling Link.\n"); + + ntb_ctl = readl(mmio + AMD_CNTL_OFFSET); + ntb_ctl &= ~(PMM_REG_CTL | SMM_REG_CTL); + writel(ntb_ctl, mmio + AMD_CNTL_OFFSET); + + return 0; +} + +static u64 amd_ntb_db_valid_mask(struct ntb_dev *ntb) +{ + return ntb_ndev(ntb)->db_valid_mask; +} + +static int amd_ntb_db_vector_count(struct ntb_dev *ntb) +{ + return ntb_ndev(ntb)->db_count; +} + +static u64 amd_ntb_db_vector_mask(struct ntb_dev *ntb, int db_vector) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + + if (db_vector < 0 || db_vector >= ndev->db_count) + return 0; + + return ntb_ndev(ntb)->db_valid_mask & (1 << db_vector); +} + +static u64 amd_ntb_db_read(struct ntb_dev *ntb) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + void __iomem *mmio = ndev->self_mmio; + + return (u64)readw(mmio + AMD_DBSTAT_OFFSET); +} + +static int amd_ntb_db_clear(struct ntb_dev *ntb, u64 db_bits) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + void __iomem *mmio = ndev->self_mmio; + + writew((u16)db_bits, mmio + AMD_DBSTAT_OFFSET); + + return 0; +} + +static int amd_ntb_db_set_mask(struct ntb_dev *ntb, u64 db_bits) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + void __iomem *mmio = ndev->self_mmio; + unsigned long flags; + + if (db_bits & ~ndev->db_valid_mask) + return -EINVAL; + + spin_lock_irqsave(&ndev->db_mask_lock, flags); + ndev->db_mask |= db_bits; + writew((u16)ndev->db_mask, mmio + AMD_DBMASK_OFFSET); + spin_unlock_irqrestore(&ndev->db_mask_lock, flags); + + return 0; +} + +static int
amd_ntb_db_clear_mask(struct ntb_dev *ntb, u64 db_bits) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + void __iomem *mmio = ndev->self_mmio; + unsigned long flags; + + if (db_bits & ~ndev->db_valid_mask) + return -EINVAL; + + spin_lock_irqsave(&ndev->db_mask_lock, flags); + ndev->db_mask &= ~db_bits; + writew((u16)ndev->db_mask, mmio + AMD_DBMASK_OFFSET); + spin_unlock_irqrestore(&ndev->db_mask_lock, flags); + + return 0; +} + +static int amd_ntb_peer_db_addr(struct ntb_dev *ntb, + phys_addr_t *db_addr, + resource_size_t *db_size) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + + if (db_addr) + *db_addr = (phys_addr_t)(ndev->peer_mmio + AMD_DBREQ_OFFSET); + if (db_size) + *db_size = sizeof(u32); + + return 0; +} + +static int amd_ntb_peer_db_set(struct ntb_dev *ntb, u64 db_bits) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + void __iomem *mmio = ndev->self_mmio; + + writew((u16)db_bits, mmio + AMD_DBREQ_OFFSET); + + return 0; +} + +static int amd_ntb_spad_count(struct ntb_dev *ntb) +{ + return ntb_ndev(ntb)->spad_count; +} + +static u32 amd_ntb_spad_read(struct ntb_dev *ntb, int idx) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + void __iomem *mmio = ndev->self_mmio; + u32 offset; + + if (idx < 0 || idx >= ndev->spad_count) + return 0; + + offset = ndev->self_spad + (idx << 2); + return readl(mmio + AMD_SPAD_OFFSET + offset); +} + +static int amd_ntb_spad_write(struct ntb_dev *ntb, + int idx, u32 val) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + void __iomem *mmio = ndev->self_mmio; + u32 offset; + + if (idx < 0 || idx >= ndev->spad_count) + return -EINVAL; + + offset = ndev->self_spad + (idx << 2); + writel(val, mmio + AMD_SPAD_OFFSET + offset); + + return 0; +} + +static int amd_ntb_peer_spad_addr(struct ntb_dev *ntb, int idx, + phys_addr_t *spad_addr) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + + if (idx < 0 || idx >= ndev->spad_count) + return -EINVAL; + + if (spad_addr) + *spad_addr = (phys_addr_t)(ndev->self_mmio + AMD_SPAD_OFFSET + + ndev->peer_spad + (idx << 2)); + return 0; +} + +static u32 amd_ntb_peer_spad_read(struct ntb_dev *ntb, int idx) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + void __iomem *mmio = ndev->self_mmio; + u32 offset; + + if (idx < 0 || idx >= ndev->spad_count) + return -EINVAL; + + offset = ndev->peer_spad + (idx << 2); + return readl(mmio + AMD_SPAD_OFFSET + offset); +} + +static int amd_ntb_peer_spad_write(struct ntb_dev *ntb, + int idx, u32 val) +{ + struct amd_ntb_dev *ndev = ntb_ndev(ntb); + void __iomem *mmio = ndev->self_mmio; + u32 offset; + + if (idx < 0 || idx >= ndev->spad_count) + return -EINVAL; + + offset = ndev->peer_spad + (idx << 2); + writel(val, mmio + AMD_SPAD_OFFSET + offset); + + return 0; +} + +static const struct ntb_dev_ops amd_ntb_ops = { + .mw_count = amd_ntb_mw_count, + .mw_get_range = amd_ntb_mw_get_range, + .mw_set_trans = amd_ntb_mw_set_trans, + .link_is_up = amd_ntb_link_is_up, + .link_enable = amd_ntb_link_enable, + .link_disable = amd_ntb_link_disable, + .db_valid_mask = amd_ntb_db_valid_mask, + .db_vector_count = amd_ntb_db_vector_count, + .db_vector_mask = amd_ntb_db_vector_mask, + .db_read = amd_ntb_db_read, + .db_clear = amd_ntb_db_clear, + .db_set_mask = amd_ntb_db_set_mask, + .db_clear_mask = amd_ntb_db_clear_mask, + .peer_db_addr = amd_ntb_peer_db_addr, + .peer_db_set = amd_ntb_peer_db_set, + .spad_count = amd_ntb_spad_count, + .spad_read = amd_ntb_spad_read, + .spad_write = amd_ntb_spad_write, + .peer_spad_addr = amd_ntb_peer_spad_addr, + .peer_spad_read = amd_ntb_peer_spad_read, + .peer_spad_write 
= amd_ntb_peer_spad_write, +}; + +static void amd_ack_smu(struct amd_ntb_dev *ndev, u32 bit) +{ + void __iomem *mmio = ndev->self_mmio; + int reg; + + reg = readl(mmio + AMD_SMUACK_OFFSET); + reg |= bit; + writel(reg, mmio + AMD_SMUACK_OFFSET); + + ndev->peer_sta |= bit; +} + +static void amd_handle_event(struct amd_ntb_dev *ndev, int vec) +{ + void __iomem *mmio = ndev->self_mmio; + u32 status; + + status = readl(mmio + AMD_INTSTAT_OFFSET); + if (!(status & AMD_EVENT_INTMASK)) + return; + + dev_dbg(ndev_dev(ndev), "status = 0x%x and vec = %d\n", status, vec); + + status &= AMD_EVENT_INTMASK; + switch (status) { + case AMD_PEER_FLUSH_EVENT: + dev_info(ndev_dev(ndev), "Flush is done.\n"); + break; + case AMD_PEER_RESET_EVENT: + amd_ack_smu(ndev, AMD_PEER_RESET_EVENT); + + /* link down first */ + ntb_link_event(&ndev->ntb); + /* polling peer status */ + schedule_delayed_work(&ndev->hb_timer, AMD_LINK_HB_TIMEOUT); + + break; + case AMD_PEER_D3_EVENT: + case AMD_PEER_PMETO_EVENT: + amd_ack_smu(ndev, status); + + /* link down */ + ntb_link_event(&ndev->ntb); + + break; + case AMD_PEER_D0_EVENT: + mmio = ndev->peer_mmio; + status = readl(mmio + AMD_PMESTAT_OFFSET); + /* check if this is WAKEUP event */ + if (status & 0x1) + dev_info(ndev_dev(ndev), "Wakeup is done.\n"); + + amd_ack_smu(ndev, AMD_PEER_D0_EVENT); + + /* start a timer to poll link status */ + schedule_delayed_work(&ndev->hb_timer, + AMD_LINK_HB_TIMEOUT); + break; + default: + dev_info(ndev_dev(ndev), "event status = 0x%x.\n", status); + break; + } +} + +static irqreturn_t ndev_interrupt(struct amd_ntb_dev *ndev, int vec) +{ + dev_dbg(ndev_dev(ndev), "vec %d\n", vec); + + if (vec > (AMD_DB_CNT - 1) || (ndev->msix_vec_count == 1)) + amd_handle_event(ndev, vec); + + if (vec < AMD_DB_CNT) + ntb_db_event(&ndev->ntb, vec); + + return IRQ_HANDLED; +} + +static irqreturn_t ndev_vec_isr(int irq, void *dev) +{ + struct amd_ntb_vec *nvec = dev; + + return ndev_interrupt(nvec->ndev, nvec->num); +} + +static irqreturn_t ndev_irq_isr(int irq, void *dev) +{ + struct amd_ntb_dev *ndev = dev; + + return ndev_interrupt(ndev, irq - ndev_pdev(ndev)->irq); +} + +static int ndev_init_isr(struct amd_ntb_dev *ndev, + int msix_min, int msix_max) +{ + struct pci_dev *pdev; + int rc, i, msix_count, node; + + pdev = ndev_pdev(ndev); + + node = dev_to_node(&pdev->dev); + + ndev->db_mask = ndev->db_valid_mask; + + /* Try to set up msix irq */ + ndev->vec = kzalloc_node(msix_max * sizeof(*ndev->vec), + GFP_KERNEL, node); + if (!ndev->vec) + goto err_msix_vec_alloc; + + ndev->msix = kzalloc_node(msix_max * sizeof(*ndev->msix), + GFP_KERNEL, node); + if (!ndev->msix) + goto err_msix_alloc; + + for (i = 0; i < msix_max; ++i) + ndev->msix[i].entry = i; + + msix_count = pci_enable_msix_range(pdev, ndev->msix, + msix_min, msix_max); + if (msix_count < 0) + goto err_msix_enable; + + /* NOTE: Disable MSIX if msix count is less than 16 because of + * hardware limitation. 
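+ * (assumption: with fewer than AMD_DB_CNT vectors the 16 doorbells + * cannot each be given their own MSI-X entry, so a partial MSI-X + * allocation is unusable and the driver falls back to a single + * MSI or INTx vector instead)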
+ */ + if (msix_count < msix_min) { + pci_disable_msix(pdev); + goto err_msix_enable; + } + + for (i = 0; i < msix_count; ++i) { + ndev->vec[i].ndev = ndev; + ndev->vec[i].num = i; + rc = request_irq(ndev->msix[i].vector, ndev_vec_isr, 0, + "ndev_vec_isr", &ndev->vec[i]); + if (rc) + goto err_msix_request; + } + + dev_dbg(ndev_dev(ndev), "Using msix interrupts\n"); + ndev->db_count = msix_min; + ndev->msix_vec_count = msix_max; + return 0; + +err_msix_request: + while (i-- > 0) + free_irq(ndev->msix[i].vector, &ndev->vec[i]); + pci_disable_msix(pdev); +err_msix_enable: + kfree(ndev->msix); +err_msix_alloc: + kfree(ndev->vec); +err_msix_vec_alloc: + ndev->msix = NULL; + ndev->vec = NULL; + + /* Try to set up msi irq */ + rc = pci_enable_msi(pdev); + if (rc) + goto err_msi_enable; + + rc = request_irq(pdev->irq, ndev_irq_isr, 0, + "ndev_irq_isr", ndev); + if (rc) + goto err_msi_request; + + dev_dbg(ndev_dev(ndev), "Using msi interrupts\n"); + ndev->db_count = 1; + ndev->msix_vec_count = 1; + return 0; + +err_msi_request: + pci_disable_msi(pdev); +err_msi_enable: + + /* Try to set up intx irq */ + pci_intx(pdev, 1); + + rc = request_irq(pdev->irq, ndev_irq_isr, IRQF_SHARED, + "ndev_irq_isr", ndev); + if (rc) + goto err_intx_request; + + dev_dbg(ndev_dev(ndev), "Using intx interrupts\n"); + ndev->db_count = 1; + ndev->msix_vec_count = 1; + return 0; + +err_intx_request: + return rc; +} + +static void ndev_deinit_isr(struct amd_ntb_dev *ndev) +{ + struct pci_dev *pdev; + void __iomem *mmio = ndev->self_mmio; + int i; + + pdev = ndev_pdev(ndev); + + /* Mask all doorbell interrupts */ + ndev->db_mask = ndev->db_valid_mask; + writel(ndev->db_mask, mmio + AMD_DBMASK_OFFSET); + + if (ndev->msix) { + i = ndev->msix_vec_count; + while (i--) + free_irq(ndev->msix[i].vector, &ndev->vec[i]); + pci_disable_msix(pdev); + kfree(ndev->msix); + kfree(ndev->vec); + } else { + free_irq(pdev->irq, ndev); + if (pci_dev_msi_enabled(pdev)) + pci_disable_msi(pdev); + else + pci_intx(pdev, 0); + } +} + +static ssize_t ndev_debugfs_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *offp) +{ + struct amd_ntb_dev *ndev; + void __iomem *mmio; + char *buf; + size_t buf_size; + ssize_t ret, off; + union { u64 v64; u32 v32; u16 v16; } u; + + ndev = filp->private_data; + mmio = ndev->self_mmio; + + buf_size = min(count, 0x800ul); + + buf = kmalloc(buf_size, GFP_KERNEL); + if (!buf) + return -ENOMEM; + + off = 0; + + off += scnprintf(buf + off, buf_size - off, + "NTB Device Information:\n"); + + off += scnprintf(buf + off, buf_size - off, + "Connection Topology -\t%s\n", + ntb_topo_string(ndev->ntb.topo)); + + off += scnprintf(buf + off, buf_size - off, + "LNK STA -\t\t%#06x\n", ndev->lnk_sta); + + if (!amd_link_is_up(ndev)) { + off += scnprintf(buf + off, buf_size - off, + "Link Status -\t\tDown\n"); + } else { + off += scnprintf(buf + off, buf_size - off, + "Link Status -\t\tUp\n"); + off += scnprintf(buf + off, buf_size - off, + "Link Speed -\t\tPCI-E Gen %u\n", + NTB_LNK_STA_SPEED(ndev->lnk_sta)); + off += scnprintf(buf + off, buf_size - off, + "Link Width -\t\tx%u\n", + NTB_LNK_STA_WIDTH(ndev->lnk_sta)); + } + + off += scnprintf(buf + off, buf_size - off, + "Memory Window Count -\t%u\n", ndev->mw_count); + off += scnprintf(buf + off, buf_size - off, + "Scratchpad Count -\t%u\n", ndev->spad_count); + off += scnprintf(buf + off, buf_size - off, + "Doorbell Count -\t%u\n", ndev->db_count); + off += scnprintf(buf + off, buf_size - off, + "MSIX Vector Count -\t%u\n", ndev->msix_vec_count); + + off += scnprintf(buf +
off, buf_size - off, + "Doorbell Valid Mask -\t%#llx\n", ndev->db_valid_mask); + + u.v32 = readl(ndev->self_mmio + AMD_DBMASK_OFFSET); + off += scnprintf(buf + off, buf_size - off, + "Doorbell Mask -\t\t\t%#06x\n", u.v32); + + u.v32 = readl(mmio + AMD_DBSTAT_OFFSET); + off += scnprintf(buf + off, buf_size - off, + "Doorbell Bell -\t\t\t%#06x\n", u.v32); + + off += scnprintf(buf + off, buf_size - off, + "\nNTB Incoming XLAT:\n"); + + u.v64 = read64(mmio + AMD_BAR1XLAT_OFFSET); + off += scnprintf(buf + off, buf_size - off, + "XLAT1 -\t\t%#018llx\n", u.v64); + + u.v64 = read64(ndev->self_mmio + AMD_BAR23XLAT_OFFSET); + off += scnprintf(buf + off, buf_size - off, + "XLAT23 -\t\t%#018llx\n", u.v64); + + u.v64 = read64(ndev->self_mmio + AMD_BAR45XLAT_OFFSET); + off += scnprintf(buf + off, buf_size - off, + "XLAT45 -\t\t%#018llx\n", u.v64); + + u.v32 = readl(mmio + AMD_BAR1LMT_OFFSET); + off += scnprintf(buf + off, buf_size - off, + "LMT1 -\t\t\t%#06x\n", u.v32); + + u.v64 = read64(ndev->self_mmio + AMD_BAR23LMT_OFFSET); + off += scnprintf(buf + off, buf_size - off, + "LMT23 -\t\t\t%#018llx\n", u.v64); + + u.v64 = read64(ndev->self_mmio + AMD_BAR45LMT_OFFSET); + off += scnprintf(buf + off, buf_size - off, + "LMT45 -\t\t\t%#018llx\n", u.v64); + + ret = simple_read_from_buffer(ubuf, count, offp, buf, off); + kfree(buf); + return ret; +} + +static void ndev_init_debugfs(struct amd_ntb_dev *ndev) +{ + if (!debugfs_dir) { + ndev->debugfs_dir = NULL; + ndev->debugfs_info = NULL; + } else { + ndev->debugfs_dir = + debugfs_create_dir(ndev_name(ndev), debugfs_dir); + if (!ndev->debugfs_dir) + ndev->debugfs_info = NULL; + else + ndev->debugfs_info = + debugfs_create_file("info", S_IRUSR, + ndev->debugfs_dir, ndev, + &amd_ntb_debugfs_info); + } +} + +static void ndev_deinit_debugfs(struct amd_ntb_dev *ndev) +{ + debugfs_remove_recursive(ndev->debugfs_dir); +} + +static inline void ndev_init_struct(struct amd_ntb_dev *ndev, + struct pci_dev *pdev) +{ + ndev->ntb.pdev = pdev; + ndev->ntb.topo = NTB_TOPO_NONE; + ndev->ntb.ops = &amd_ntb_ops; + ndev->int_mask = AMD_EVENT_INTMASK; + spin_lock_init(&ndev->db_mask_lock); +} + +static int amd_poll_link(struct amd_ntb_dev *ndev) +{ + void __iomem *mmio = ndev->peer_mmio; + u32 reg, stat; + int rc; + + reg = readl(mmio + AMD_SIDEINFO_OFFSET); + reg &= NTB_LIN_STA_ACTIVE_BIT; + + dev_dbg(ndev_dev(ndev), "%s: reg_val = 0x%x.\n", __func__, reg); + + if (reg == ndev->cntl_sta) + return 0; + + ndev->cntl_sta = reg; + + rc = pci_read_config_dword(ndev->ntb.pdev, + AMD_LINK_STATUS_OFFSET, &stat); + if (rc) + return 0; + ndev->lnk_sta = stat; + + return 1; +} + +static void amd_link_hb(struct work_struct *work) +{ + struct amd_ntb_dev *ndev = hb_ndev(work); + + if (amd_poll_link(ndev)) + ntb_link_event(&ndev->ntb); + + if (!amd_link_is_up(ndev)) + schedule_delayed_work(&ndev->hb_timer, AMD_LINK_HB_TIMEOUT); +} + +static int amd_init_isr(struct amd_ntb_dev *ndev) +{ + return ndev_init_isr(ndev, AMD_DB_CNT, AMD_MSIX_VECTOR_CNT); +} + +static void amd_init_side_info(struct amd_ntb_dev *ndev) +{ + void __iomem *mmio = ndev->self_mmio; + unsigned int reg; + + reg = readl(mmio + AMD_SIDEINFO_OFFSET); + if (!(reg & AMD_SIDE_READY)) { + reg |= AMD_SIDE_READY; + writel(reg, mmio + AMD_SIDEINFO_OFFSET); + } +} + +static void amd_deinit_side_info(struct amd_ntb_dev *ndev) +{ + void __iomem *mmio = ndev->self_mmio; + unsigned int reg; + + reg = readl(mmio + AMD_SIDEINFO_OFFSET); + if (reg & AMD_SIDE_READY) { + reg &= ~AMD_SIDE_READY; + writel(reg, mmio + AMD_SIDEINFO_OFFSET); + 
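+ /* read back to flush the posted write before teardown continues */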
readl(mmio + AMD_SIDEINFO_OFFSET); + } +} + +static int amd_init_ntb(struct amd_ntb_dev *ndev) +{ + void __iomem *mmio = ndev->self_mmio; + + ndev->mw_count = AMD_MW_CNT; + ndev->spad_count = AMD_SPADS_CNT; + ndev->db_count = AMD_DB_CNT; + + switch (ndev->ntb.topo) { + case NTB_TOPO_PRI: + case NTB_TOPO_SEC: + ndev->spad_count >>= 1; + if (ndev->ntb.topo == NTB_TOPO_PRI) { + ndev->self_spad = 0; + ndev->peer_spad = 0x20; + } else { + ndev->self_spad = 0x20; + ndev->peer_spad = 0; + } + + INIT_DELAYED_WORK(&ndev->hb_timer, amd_link_hb); + schedule_delayed_work(&ndev->hb_timer, AMD_LINK_HB_TIMEOUT); + + break; + default: + dev_err(ndev_dev(ndev), "AMD NTB does not support B2B mode.\n"); + return -EINVAL; + } + + ndev->db_valid_mask = BIT_ULL(ndev->db_count) - 1; + + /* Mask event interrupts */ + writel(ndev->int_mask, mmio + AMD_INTMASK_OFFSET); + + return 0; +} + +static enum ntb_topo amd_get_topo(struct amd_ntb_dev *ndev) +{ + void __iomem *mmio = ndev->self_mmio; + u32 info; + + info = readl(mmio + AMD_SIDEINFO_OFFSET); + if (info & AMD_SIDE_MASK) + return NTB_TOPO_SEC; + else + return NTB_TOPO_PRI; +} + +static int amd_init_dev(struct amd_ntb_dev *ndev) +{ + struct pci_dev *pdev; + int rc = 0; + + pdev = ndev_pdev(ndev); + + ndev->ntb.topo = amd_get_topo(ndev); + dev_dbg(ndev_dev(ndev), "AMD NTB topo is %s\n", + ntb_topo_string(ndev->ntb.topo)); + + rc = amd_init_ntb(ndev); + if (rc) + return rc; + + rc = amd_init_isr(ndev); + if (rc) { + dev_err(ndev_dev(ndev), "fail to init isr.\n"); + return rc; + } + + ndev->db_valid_mask = BIT_ULL(ndev->db_count) - 1; + + return 0; +} + +static void amd_deinit_dev(struct amd_ntb_dev *ndev) +{ + cancel_delayed_work_sync(&ndev->hb_timer); + + ndev_deinit_isr(ndev); +} + +static int amd_ntb_init_pci(struct amd_ntb_dev *ndev, + struct pci_dev *pdev) +{ + int rc; + + pci_set_drvdata(pdev, ndev); + + rc = pci_enable_device(pdev); + if (rc) + goto err_pci_enable; + + rc = pci_request_regions(pdev, NTB_NAME); + if (rc) + goto err_pci_regions; + + pci_set_master(pdev); + + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + if (rc) { + rc = pci_set_dma_mask(pdev, DMA_BIT_MASK(32)); + if (rc) + goto err_dma_mask; + dev_warn(ndev_dev(ndev), "Cannot DMA highmem\n"); + } + + rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); + if (rc) { + rc = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(32)); + if (rc) + goto err_dma_mask; + dev_warn(ndev_dev(ndev), "Cannot DMA consistent highmem\n"); + } + + ndev->self_mmio = pci_iomap(pdev, 0, 0); + if (!ndev->self_mmio) { + rc = -EIO; + goto err_dma_mask; + } + ndev->peer_mmio = ndev->self_mmio + AMD_PEER_OFFSET; + + return 0; + +err_dma_mask: + pci_clear_master(pdev); +err_pci_regions: + pci_disable_device(pdev); +err_pci_enable: + pci_set_drvdata(pdev, NULL); + return rc; +} + +static void amd_ntb_deinit_pci(struct amd_ntb_dev *ndev) +{ + struct pci_dev *pdev = ndev_pdev(ndev); + + pci_iounmap(pdev, ndev->self_mmio); + + pci_clear_master(pdev); + pci_release_regions(pdev); + pci_disable_device(pdev); + pci_set_drvdata(pdev, NULL); +} + +static int amd_ntb_pci_probe(struct pci_dev *pdev, + const struct pci_device_id *id) +{ + struct amd_ntb_dev *ndev; + int rc, node; + + node = dev_to_node(&pdev->dev); + + ndev = kzalloc_node(sizeof(*ndev), GFP_KERNEL, node); + if (!ndev) { + rc = -ENOMEM; + goto err_ndev; + } + + ndev_init_struct(ndev, pdev); + + rc = amd_ntb_init_pci(ndev, pdev); + if (rc) + goto err_init_pci; + + rc = amd_init_dev(ndev); + if (rc) + goto err_init_dev; + + /* write side info */ + 
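+ /* (sets AMD_SIDE_READY in this side's SIDEINFO register; the peer + * polls the same bit, as NTB_LIN_STA_ACTIVE_BIT, through its peer + * window in amd_poll_link()) */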
amd_init_side_info(ndev); + + amd_poll_link(ndev); + + ndev_init_debugfs(ndev); + + rc = ntb_register_device(&ndev->ntb); + if (rc) + goto err_register; + + dev_info(&pdev->dev, "NTB device registered.\n"); + + return 0; + +err_register: + ndev_deinit_debugfs(ndev); + amd_deinit_dev(ndev); +err_init_dev: + amd_ntb_deinit_pci(ndev); +err_init_pci: + kfree(ndev); +err_ndev: + return rc; +} + +static void amd_ntb_pci_remove(struct pci_dev *pdev) +{ + struct amd_ntb_dev *ndev = pci_get_drvdata(pdev); + + ntb_unregister_device(&ndev->ntb); + ndev_deinit_debugfs(ndev); + amd_deinit_side_info(ndev); + amd_deinit_dev(ndev); + amd_ntb_deinit_pci(ndev); + kfree(ndev); +} + +static const struct file_operations amd_ntb_debugfs_info = { + .owner = THIS_MODULE, + .open = simple_open, + .read = ndev_debugfs_read, +}; + +static const struct pci_device_id amd_ntb_pci_tbl[] = { + {PCI_VDEVICE(AMD, PCI_DEVICE_ID_AMD_NTB)}, + {0} +}; +MODULE_DEVICE_TABLE(pci, amd_ntb_pci_tbl); + +static struct pci_driver amd_ntb_pci_driver = { + .name = KBUILD_MODNAME, + .id_table = amd_ntb_pci_tbl, + .probe = amd_ntb_pci_probe, + .remove = amd_ntb_pci_remove, +}; + +static int __init amd_ntb_pci_driver_init(void) +{ + pr_info("%s %s\n", NTB_DESC, NTB_VER); + + if (debugfs_initialized()) + debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL); + + return pci_register_driver(&amd_ntb_pci_driver); +} +module_init(amd_ntb_pci_driver_init); + +static void __exit amd_ntb_pci_driver_exit(void) +{ + pci_unregister_driver(&amd_ntb_pci_driver); + debugfs_remove_recursive(debugfs_dir); +} +module_exit(amd_ntb_pci_driver_exit); diff --git a/drivers/ntb/hw/amd/ntb_hw_amd.h b/drivers/ntb/hw/amd/ntb_hw_amd.h new file mode 100644 index 000000000000..2eac3cd3e646 --- /dev/null +++ b/drivers/ntb/hw/amd/ntb_hw_amd.h @@ -0,0 +1,217 @@ +/* + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. All Rights Reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * BSD LICENSE + * + * Copyright (C) 2016 Advanced Micro Devices, Inc. All Rights Reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of AMD Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED.
IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + * AMD PCIe NTB Linux driver + * + * Contact Information: + * Xiangliang Yu <Xiangliang.Yu@amd.com> + */ + +#ifndef NTB_HW_AMD_H +#define NTB_HW_AMD_H + +#include <linux/ntb.h> +#include <linux/pci.h> + +#define PCI_DEVICE_ID_AMD_NTB 0x145B +#define AMD_LINK_HB_TIMEOUT msecs_to_jiffies(1000) +#define AMD_LINK_STATUS_OFFSET 0x68 +#define NTB_LIN_STA_ACTIVE_BIT 0x00000002 +#define NTB_LNK_STA_SPEED_MASK 0x000F0000 +#define NTB_LNK_STA_WIDTH_MASK 0x03F00000 +#define NTB_LNK_STA_ACTIVE(x) (!!((x) & NTB_LIN_STA_ACTIVE_BIT)) +#define NTB_LNK_STA_SPEED(x) (((x) & NTB_LNK_STA_SPEED_MASK) >> 16) +#define NTB_LNK_STA_WIDTH(x) (((x) & NTB_LNK_STA_WIDTH_MASK) >> 20) + +#ifndef read64 +#ifdef readq +#define read64 readq +#else +#define read64 _read64 +static inline u64 _read64(void __iomem *mmio) +{ + u64 low, high; + + low = readl(mmio); + high = readl(mmio + sizeof(u32)); + return low | (high << 32); +} +#endif +#endif + +#ifndef write64 +#ifdef writeq +#define write64 writeq +#else +#define write64 _write64 +static inline void _write64(u64 val, void __iomem *mmio) +{ + writel(val, mmio); + writel(val >> 32, mmio + sizeof(u32)); +} +#endif +#endif + +enum { + /* AMD NTB Capability */ + AMD_MW_CNT = 3, + AMD_DB_CNT = 16, + AMD_MSIX_VECTOR_CNT = 24, + AMD_SPADS_CNT = 16, + + /* AMD NTB register offset */ + AMD_CNTL_OFFSET = 0x200, + + /* NTB control register bits */ + PMM_REG_CTL = BIT(21), + SMM_REG_CTL = BIT(20), + SMM_REG_ACC_PATH = BIT(18), + PMM_REG_ACC_PATH = BIT(17), + NTB_CLK_EN = BIT(16), + + AMD_STA_OFFSET = 0x204, + AMD_PGSLV_OFFSET = 0x208, + AMD_SPAD_MUX_OFFSET = 0x20C, + AMD_SPAD_OFFSET = 0x210, + AMD_RSMU_HCID = 0x250, + AMD_RSMU_SIID = 0x254, + AMD_PSION_OFFSET = 0x300, + AMD_SSION_OFFSET = 0x330, + AMD_MMINDEX_OFFSET = 0x400, + AMD_MMDATA_OFFSET = 0x404, + AMD_SIDEINFO_OFFSET = 0x408, + + AMD_SIDE_MASK = BIT(0), + AMD_SIDE_READY = BIT(1), + + /* limit register */ + AMD_ROMBARLMT_OFFSET = 0x410, + AMD_BAR1LMT_OFFSET = 0x414, + AMD_BAR23LMT_OFFSET = 0x418, + AMD_BAR45LMT_OFFSET = 0x420, + /* xlat address */ + AMD_POMBARXLAT_OFFSET = 0x428, + AMD_BAR1XLAT_OFFSET = 0x430, + AMD_BAR23XLAT_OFFSET = 0x438, + AMD_BAR45XLAT_OFFSET = 0x440, + /* doorbell and interrupt */ + AMD_DBFM_OFFSET = 0x450, + AMD_DBREQ_OFFSET = 0x454, + AMD_MIRRDBSTAT_OFFSET = 0x458, + AMD_DBMASK_OFFSET = 0x45C, + AMD_DBSTAT_OFFSET = 0x460, + AMD_INTMASK_OFFSET = 0x470, + AMD_INTSTAT_OFFSET = 0x474, + + /* event type */ + AMD_PEER_FLUSH_EVENT = BIT(0), + AMD_PEER_RESET_EVENT = BIT(1), + AMD_PEER_D3_EVENT = BIT(2), + AMD_PEER_PMETO_EVENT = BIT(3), + AMD_PEER_D0_EVENT = BIT(4), + AMD_EVENT_INTMASK = (AMD_PEER_FLUSH_EVENT | + AMD_PEER_RESET_EVENT | AMD_PEER_D3_EVENT | + AMD_PEER_PMETO_EVENT | AMD_PEER_D0_EVENT), + + AMD_PMESTAT_OFFSET = 0x480, + AMD_PMSGTRIG_OFFSET = 0x490, + AMD_LTRLATENCY_OFFSET = 0x494, + AMD_FLUSHTRIG_OFFSET = 0x498, + + /* SMU register*/ + AMD_SMUACK_OFFSET = 0x4A0, + AMD_SINRST_OFFSET = 0x4A4, + AMD_RSPNUM_OFFSET = 0x4A8, + AMD_SMU_SPADMUTEX = 0x4B0, + 
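+ /* (assumption: the SMU mailbox registers above and this scratchpad + * mutex/offset pair are for firmware handshakes; only + * AMD_SMUACK_OFFSET is referenced elsewhere in this driver) */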
AMD_SMU_SPADOFFSET = 0x4B4, + + AMD_PEER_OFFSET = 0x400, +}; + +struct amd_ntb_dev; + +struct amd_ntb_vec { + struct amd_ntb_dev *ndev; + int num; +}; + +struct amd_ntb_dev { + struct ntb_dev ntb; + + u32 ntb_side; + u32 lnk_sta; + u32 cntl_sta; + u32 peer_sta; + + unsigned char mw_count; + unsigned char spad_count; + unsigned char db_count; + unsigned char msix_vec_count; + + u64 db_valid_mask; + u64 db_mask; + u32 int_mask; + + struct msix_entry *msix; + struct amd_ntb_vec *vec; + + /* synchronize rmw access of db_mask and hw reg */ + spinlock_t db_mask_lock; + + void __iomem *self_mmio; + void __iomem *peer_mmio; + unsigned int self_spad; + unsigned int peer_spad; + + struct delayed_work hb_timer; + + struct dentry *debugfs_dir; + struct dentry *debugfs_info; +}; + +#define ndev_pdev(ndev) ((ndev)->ntb.pdev) +#define ndev_name(ndev) pci_name(ndev_pdev(ndev)) +#define ndev_dev(ndev) (&ndev_pdev(ndev)->dev) +#define ntb_ndev(__ntb) container_of(__ntb, struct amd_ntb_dev, ntb) +#define hb_ndev(__work) container_of(__work, struct amd_ntb_dev, hb_timer.work) + +#endif diff --git a/drivers/ntb/hw/intel/ntb_hw_intel.c b/drivers/ntb/hw/intel/ntb_hw_intel.c index a198f8298258..40d04ef5da9e 100644 --- a/drivers/ntb/hw/intel/ntb_hw_intel.c +++ b/drivers/ntb/hw/intel/ntb_hw_intel.c @@ -875,7 +875,7 @@ static int intel_ntb_mw_set_trans(struct ntb_dev *ntb, int idx, limit_reg = bar2_off(ndev->xlat_reg->bar2_limit, bar); if (bar < 4 || !ndev->bar4_split) { - base = ioread64(mmio + base_reg); + base = ioread64(mmio + base_reg) & NTB_BAR_MASK_64; /* Set the limit if supported, if size is not mw_size */ if (limit_reg && size != mw_size) @@ -906,7 +906,7 @@ static int intel_ntb_mw_set_trans(struct ntb_dev *ntb, int idx, if ((addr + size) & (~0ull << 32)) return -EINVAL; - base = ioread32(mmio + base_reg); + base = ioread32(mmio + base_reg) & NTB_BAR_MASK_32; /* Set the limit if supported, if size is not mw_size */ if (limit_reg && size != mw_size) diff --git a/drivers/ntb/hw/intel/ntb_hw_intel.h b/drivers/ntb/hw/intel/ntb_hw_intel.h index 2eb4addd10d0..3ec149cf6562 100644 --- a/drivers/ntb/hw/intel/ntb_hw_intel.h +++ b/drivers/ntb/hw/intel/ntb_hw_intel.h @@ -245,6 +245,9 @@ #define NTB_UNSAFE_DB BIT_ULL(0) #define NTB_UNSAFE_SPAD BIT_ULL(1) +#define NTB_BAR_MASK_64 ~(0xfull) +#define NTB_BAR_MASK_32 ~(0xfu) + struct intel_ntb_dev; struct intel_ntb_reg { @@ -334,7 +337,8 @@ struct intel_ntb_dev { #define ndev_pdev(ndev) ((ndev)->ntb.pdev) #define ndev_name(ndev) pci_name(ndev_pdev(ndev)) #define ndev_dev(ndev) (&ndev_pdev(ndev)->dev) -#define ntb_ndev(ntb) container_of(ntb, struct intel_ntb_dev, ntb) -#define hb_ndev(work) container_of(work, struct intel_ntb_dev, hb_timer.work) +#define ntb_ndev(__ntb) container_of(__ntb, struct intel_ntb_dev, ntb) +#define hb_ndev(__work) container_of(__work, struct intel_ntb_dev, \ + hb_timer.work) #endif diff --git a/drivers/ntb/ntb_transport.c b/drivers/ntb/ntb_transport.c index 60654d524858..ec4775f0ec16 100644 --- a/drivers/ntb/ntb_transport.c +++ b/drivers/ntb/ntb_transport.c @@ -171,12 +171,14 @@ struct ntb_transport_qp { u64 rx_err_ver; u64 rx_memcpy; u64 rx_async; + u64 dma_rx_prep_err; u64 tx_bytes; u64 tx_pkts; u64 tx_ring_full; u64 tx_err_no_buf; u64 tx_memcpy; u64 tx_async; + u64 dma_tx_prep_err; }; struct ntb_transport_mw { @@ -249,6 +251,8 @@ enum { #define QP_TO_MW(nt, qp) ((qp) % nt->mw_count) #define NTB_QP_DEF_NUM_ENTRIES 100 #define NTB_LINK_DOWN_TIMEOUT 10 +#define DMA_RETRIES 20 +#define DMA_OUT_RESOURCE_TO 50 static void 
ntb_transport_rxc_db(unsigned long data); static const struct ntb_ctx_ops ntb_transport_ops; @@ -501,6 +505,12 @@ static ssize_t debugfs_read(struct file *filp, char __user *ubuf, size_t count, out_offset += snprintf(buf + out_offset, out_count - out_offset, "free tx - \t%u\n", ntb_transport_tx_free_entry(qp)); + out_offset += snprintf(buf + out_offset, out_count - out_offset, + "DMA tx prep err - \t%llu\n", + qp->dma_tx_prep_err); + out_offset += snprintf(buf + out_offset, out_count - out_offset, + "DMA rx prep err - \t%llu\n", + qp->dma_rx_prep_err); out_offset += snprintf(buf + out_offset, out_count - out_offset, "\n"); @@ -726,6 +736,8 @@ static void ntb_qp_link_down_reset(struct ntb_transport_qp *qp) qp->tx_err_no_buf = 0; qp->tx_memcpy = 0; qp->tx_async = 0; + qp->dma_tx_prep_err = 0; + qp->dma_rx_prep_err = 0; } static void ntb_qp_link_cleanup(struct ntb_transport_qp *qp) @@ -1228,6 +1240,7 @@ static void ntb_async_rx(struct ntb_queue_entry *entry, void *offset) struct dmaengine_unmap_data *unmap; dma_cookie_t cookie; void *buf = entry->buf; + int retries = 0; len = entry->len; @@ -1263,11 +1276,21 @@ static void ntb_async_rx(struct ntb_queue_entry *entry, void *offset) unmap->from_cnt = 1; - txd = device->device_prep_dma_memcpy(chan, unmap->addr[1], - unmap->addr[0], len, - DMA_PREP_INTERRUPT); - if (!txd) + for (retries = 0; retries < DMA_RETRIES; retries++) { + txd = device->device_prep_dma_memcpy(chan, unmap->addr[1], + unmap->addr[0], len, + DMA_PREP_INTERRUPT); + if (txd) + break; + + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(DMA_OUT_RESOURCE_TO); + } + + if (!txd) { + qp->dma_rx_prep_err++; goto err_get_unmap; + } txd->callback = ntb_rx_copy_callback; txd->callback_param = entry; @@ -1460,6 +1483,7 @@ static void ntb_async_tx(struct ntb_transport_qp *qp, void __iomem *offset; size_t len = entry->len; void *buf = entry->buf; + int retries = 0; offset = qp->tx_mw + qp->tx_max_frame * qp->tx_index; hdr = offset + qp->tx_max_frame - sizeof(struct ntb_payload_header); @@ -1494,10 +1518,20 @@ static void ntb_async_tx(struct ntb_transport_qp *qp, unmap->to_cnt = 1; - txd = device->device_prep_dma_memcpy(chan, dest, unmap->addr[0], len, - DMA_PREP_INTERRUPT); - if (!txd) + for (retries = 0; retries < DMA_RETRIES; retries++) { + txd = device->device_prep_dma_memcpy(chan, dest, unmap->addr[0], + len, DMA_PREP_INTERRUPT); + if (txd) + break; + + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(DMA_OUT_RESOURCE_TO); + } + + if (!txd) { + qp->dma_tx_prep_err++; goto err_get_unmap; + } txd->callback = ntb_tx_copy_callback; txd->callback_param = entry; @@ -1532,7 +1566,7 @@ static int ntb_process_tx(struct ntb_transport_qp *qp, if (entry->len > qp->tx_max_frame - sizeof(struct ntb_payload_header)) { if (qp->tx_handler) - qp->tx_handler(qp->cb_data, qp, NULL, -EIO); + qp->tx_handler(qp, qp->cb_data, NULL, -EIO); ntb_list_add(&qp->ntb_tx_free_q_lock, &entry->entry, &qp->tx_free_q); diff --git a/drivers/ntb/test/Kconfig b/drivers/ntb/test/Kconfig index 01852f98a843..a5d0eda44438 100644 --- a/drivers/ntb/test/Kconfig +++ b/drivers/ntb/test/Kconfig @@ -17,3 +17,11 @@ config NTB_TOOL functioning at a basic level. If unsure, say N. + +config NTB_PERF + tristate "NTB RAW Perf Measuring Tool" + help + This is a tool to measure raw NTB performance by transferring data + to and from the window without additional software interaction. + + If unsure, say N. 
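Both ntb_transport hunks above and the perf_copy() loop in ntb_perf.c below wrap device_prep_dma_memcpy() in the same retry shape. The stand-alone sketch that follows is illustrative only: the helper name prep_dma_memcpy_retry() and its prep_err parameter are invented for this summary, while the constants and dmaengine calls are the ones the patches use.

#include <linux/dmaengine.h>
#include <linux/sched.h>

#define DMA_RETRIES		20
#define DMA_OUT_RESOURCE_TO	50

static struct dma_async_tx_descriptor *
prep_dma_memcpy_retry(struct dma_chan *chan, dma_addr_t dst, dma_addr_t src,
		      size_t len, u64 *prep_err)
{
	struct dma_async_tx_descriptor *txd;
	int retries;

	for (retries = 0; retries < DMA_RETRIES; retries++) {
		txd = chan->device->device_prep_dma_memcpy(chan, dst, src, len,
							   DMA_PREP_INTERRUPT);
		if (txd)
			return txd;

		/* descriptor ring exhausted; sleep so completions can drain */
		set_current_state(TASK_INTERRUPTIBLE);
		schedule_timeout(DMA_OUT_RESOURCE_TO);
	}

	(*prep_err)++;	/* surfaces in debugfs as dma_{rx,tx}_prep_err */
	return NULL;
}

Since schedule_timeout() sleeps, the pattern assumes the caller may sleep; ntb_perf drives it from its dedicated kthreads. On a NULL return the callers bump their prep-error counter and take the existing unmap error paths instead of failing fast.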
diff --git a/drivers/ntb/test/Makefile b/drivers/ntb/test/Makefile index 0ea32a324b6c..9e77e0b761c2 100644 --- a/drivers/ntb/test/Makefile +++ b/drivers/ntb/test/Makefile @@ -1,2 +1,3 @@ obj-$(CONFIG_NTB_PINGPONG) += ntb_pingpong.o obj-$(CONFIG_NTB_TOOL) += ntb_tool.o +obj-$(CONFIG_NTB_PERF) += ntb_perf.o diff --git a/drivers/ntb/test/ntb_perf.c b/drivers/ntb/test/ntb_perf.c new file mode 100644 index 000000000000..c8a37ba4b4f9 --- /dev/null +++ b/drivers/ntb/test/ntb_perf.c @@ -0,0 +1,748 @@ +/* + * This file is provided under a dual BSD/GPLv2 license. When using or + * redistributing this file, you may do so under either license. + * + * GPL LICENSE SUMMARY + * + * Copyright(c) 2015 Intel Corporation. All rights reserved. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of version 2 of the GNU General Public License as + * published by the Free Software Foundation. + * + * BSD LICENSE + * + * Copyright(c) 2015 Intel Corporation. All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * + * * Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * * Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in + * the documentation and/or other materials provided with the + * distribution. + * * Neither the name of Intel Corporation nor the names of its + * contributors may be used to endorse or promote products derived + * from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + * "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + * LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + * A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + * OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ * + * PCIe NTB Perf Linux driver + */ + +#include <linux/init.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/kthread.h> +#include <linux/time.h> +#include <linux/timer.h> +#include <linux/dma-mapping.h> +#include <linux/pci.h> +#include <linux/slab.h> +#include <linux/spinlock.h> +#include <linux/debugfs.h> +#include <linux/dmaengine.h> +#include <linux/delay.h> +#include <linux/sizes.h> +#include <linux/ntb.h> + +#define DRIVER_NAME "ntb_perf" +#define DRIVER_DESCRIPTION "PCIe NTB Performance Measurement Tool" + +#define DRIVER_LICENSE "Dual BSD/GPL" +#define DRIVER_VERSION "1.0" +#define DRIVER_AUTHOR "Dave Jiang <dave.jiang@intel.com>" + +#define PERF_LINK_DOWN_TIMEOUT 10 +#define PERF_VERSION 0xffff0001 +#define MAX_THREADS 32 +#define MAX_TEST_SIZE SZ_1M +#define MAX_SRCS 32 +#define DMA_OUT_RESOURCE_TO 50 +#define DMA_RETRIES 20 +#define SZ_4G (1ULL << 32) +#define MAX_SEG_ORDER 20 /* no larger than 1M for kmalloc buffer */ + +MODULE_LICENSE(DRIVER_LICENSE); +MODULE_VERSION(DRIVER_VERSION); +MODULE_AUTHOR(DRIVER_AUTHOR); +MODULE_DESCRIPTION(DRIVER_DESCRIPTION); + +static struct dentry *perf_debugfs_dir; + +static unsigned int seg_order = 19; /* 512K */ +module_param(seg_order, uint, 0644); +MODULE_PARM_DESC(seg_order, "size order [n^2] of buffer segment for testing"); + +static unsigned int run_order = 32; /* 4G */ +module_param(run_order, uint, 0644); +MODULE_PARM_DESC(run_order, "size order [n^2] of total data to transfer"); + +static bool use_dma; /* default to 0 */ +module_param(use_dma, bool, 0644); +MODULE_PARM_DESC(use_dma, "Using DMA engine to measure performance"); + +struct perf_mw { + phys_addr_t phys_addr; + resource_size_t phys_size; + resource_size_t xlat_align; + resource_size_t xlat_align_size; + void __iomem *vbase; + size_t xlat_size; + size_t buf_size; + void *virt_addr; + dma_addr_t dma_addr; +}; + +struct perf_ctx; + +struct pthr_ctx { + struct task_struct *thread; + struct perf_ctx *perf; + atomic_t dma_sync; + struct dma_chan *dma_chan; + int dma_prep_err; + int src_idx; + void *srcs[MAX_SRCS]; +}; + +struct perf_ctx { + struct ntb_dev *ntb; + spinlock_t db_lock; + struct perf_mw mw; + bool link_is_up; + struct work_struct link_cleanup; + struct delayed_work link_work; + struct dentry *debugfs_node_dir; + struct dentry *debugfs_run; + struct dentry *debugfs_threads; + u8 perf_threads; + bool run; + struct pthr_ctx pthr_ctx[MAX_THREADS]; + atomic_t tsync; +}; + +enum { + VERSION = 0, + MW_SZ_HIGH, + MW_SZ_LOW, + SPAD_MSG, + SPAD_ACK, + MAX_SPAD +}; + +static void perf_link_event(void *ctx) +{ + struct perf_ctx *perf = ctx; + + if (ntb_link_is_up(perf->ntb, NULL, NULL) == 1) + schedule_delayed_work(&perf->link_work, 2*HZ); + else + schedule_work(&perf->link_cleanup); +} + +static void perf_db_event(void *ctx, int vec) +{ + struct perf_ctx *perf = ctx; + u64 db_bits, db_mask; + + db_mask = ntb_db_vector_mask(perf->ntb, vec); + db_bits = ntb_db_read(perf->ntb); + + dev_dbg(&perf->ntb->dev, "doorbell vec %d mask %#llx bits %#llx\n", + vec, db_mask, db_bits); +} + +static const struct ntb_ctx_ops perf_ops = { + .link_event = perf_link_event, + .db_event = perf_db_event, +}; + +static void perf_copy_callback(void *data) +{ + struct pthr_ctx *pctx = data; + + atomic_dec(&pctx->dma_sync); +} + +static ssize_t perf_copy(struct pthr_ctx *pctx, char *dst, + char *src, size_t size) +{ + struct perf_ctx *perf = pctx->perf; + struct dma_async_tx_descriptor *txd; + struct dma_chan *chan = pctx->dma_chan; + struct dma_device *device; + struct 
dmaengine_unmap_data *unmap; + dma_cookie_t cookie; + size_t src_off, dst_off; + struct perf_mw *mw = &perf->mw; + u64 vbase, dst_vaddr; + dma_addr_t dst_phys; + int retries = 0; + + if (!use_dma) { + memcpy_toio(dst, src, size); + return size; + } + + if (!chan) { + dev_err(&perf->ntb->dev, "DMA engine does not exist\n"); + return -EINVAL; + } + + device = chan->device; + src_off = (size_t)src & ~PAGE_MASK; + dst_off = (size_t)dst & ~PAGE_MASK; + + if (!is_dma_copy_aligned(device, src_off, dst_off, size)) + return -ENODEV; + + vbase = (u64)(u64 *)mw->vbase; + dst_vaddr = (u64)(u64 *)dst; + dst_phys = mw->phys_addr + (dst_vaddr - vbase); + + unmap = dmaengine_get_unmap_data(device->dev, 1, GFP_NOWAIT); + if (!unmap) + return -ENOMEM; + + unmap->len = size; + unmap->addr[0] = dma_map_page(device->dev, virt_to_page(src), + src_off, size, DMA_TO_DEVICE); + if (dma_mapping_error(device->dev, unmap->addr[0])) + goto err_get_unmap; + + unmap->to_cnt = 1; + + do { + txd = device->device_prep_dma_memcpy(chan, dst_phys, + unmap->addr[0], + size, DMA_PREP_INTERRUPT); + if (!txd) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(DMA_OUT_RESOURCE_TO); + } + } while (!txd && (++retries < DMA_RETRIES)); + + if (!txd) { + pctx->dma_prep_err++; + goto err_get_unmap; + } + + txd->callback = perf_copy_callback; + txd->callback_param = pctx; + dma_set_unmap(txd, unmap); + + cookie = dmaengine_submit(txd); + if (dma_submit_error(cookie)) + goto err_set_unmap; + + atomic_inc(&pctx->dma_sync); + dma_async_issue_pending(chan); + + return size; + +err_set_unmap: + dmaengine_unmap_put(unmap); +err_get_unmap: + dmaengine_unmap_put(unmap); + return 0; +} + +static int perf_move_data(struct pthr_ctx *pctx, char *dst, char *src, + u64 buf_size, u64 win_size, u64 total) +{ + int chunks, total_chunks, i; + int copied_chunks = 0; + u64 copied = 0, result; + char *tmp = dst; + u64 perf, diff_us; + ktime_t kstart, kstop, kdiff; + + chunks = div64_u64(win_size, buf_size); + total_chunks = div64_u64(total, buf_size); + kstart = ktime_get(); + + for (i = 0; i < total_chunks; i++) { + result = perf_copy(pctx, tmp, src, buf_size); + copied += result; + copied_chunks++; + if (copied_chunks == chunks) { + tmp = dst; + copied_chunks = 0; + } else + tmp += buf_size; + + /* Probably should schedule every 4GB to prevent soft hang. 
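 * (Added note: the yield matters because the memcpy_toio() path never
 * sleeps on its own — with run_order=32 a thread would otherwise copy
 * 4 GiB back to back and could trip the soft-lockup watchdog. The DMA
 * path is excluded since perf_copy() may already sleep while retrying
 * descriptor allocation.)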
*/ + if (((copied % SZ_4G) == 0) && !use_dma) { + set_current_state(TASK_INTERRUPTIBLE); + schedule_timeout(1); + } + } + + if (use_dma) { + pr_info("%s: All DMA descriptors submitted\n", current->comm); + while (atomic_read(&pctx->dma_sync) != 0) + msleep(20); + } + + kstop = ktime_get(); + kdiff = ktime_sub(kstop, kstart); + diff_us = ktime_to_us(kdiff); + + pr_info("%s: copied %llu bytes\n", current->comm, copied); + + pr_info("%s: lasted %llu usecs\n", current->comm, diff_us); + + perf = div64_u64(copied, diff_us); + + pr_info("%s: MBytes/s: %llu\n", current->comm, perf); + + return 0; +} + +static bool perf_dma_filter_fn(struct dma_chan *chan, void *node) +{ + return dev_to_node(&chan->dev->device) == (int)(unsigned long)node; +} + +static int ntb_perf_thread(void *data) +{ + struct pthr_ctx *pctx = data; + struct perf_ctx *perf = pctx->perf; + struct pci_dev *pdev = perf->ntb->pdev; + struct perf_mw *mw = &perf->mw; + char *dst; + u64 win_size, buf_size, total; + void *src; + int rc, node, i; + struct dma_chan *dma_chan = NULL; + + pr_info("kthread %s starting...\n", current->comm); + + node = dev_to_node(&pdev->dev); + + if (use_dma && !pctx->dma_chan) { + dma_cap_mask_t dma_mask; + + dma_cap_zero(dma_mask); + dma_cap_set(DMA_MEMCPY, dma_mask); + dma_chan = dma_request_channel(dma_mask, perf_dma_filter_fn, + (void *)(unsigned long)node); + if (!dma_chan) { + pr_warn("%s: cannot acquire DMA channel, quitting\n", + current->comm); + return -ENODEV; + } + pctx->dma_chan = dma_chan; + } + + for (i = 0; i < MAX_SRCS; i++) { + pctx->srcs[i] = kmalloc_node(MAX_TEST_SIZE, GFP_KERNEL, node); + if (!pctx->srcs[i]) { + rc = -ENOMEM; + goto err; + } + } + + win_size = mw->phys_size; + buf_size = 1ULL << seg_order; + total = 1ULL << run_order; + + if (buf_size > MAX_TEST_SIZE) + buf_size = MAX_TEST_SIZE; + + dst = (char *)mw->vbase; + + atomic_inc(&perf->tsync); + while (atomic_read(&perf->tsync) != perf->perf_threads) + schedule(); + + src = pctx->srcs[pctx->src_idx]; + pctx->src_idx = (pctx->src_idx + 1) & (MAX_SRCS - 1); + + rc = perf_move_data(pctx, dst, src, buf_size, win_size, total); + + atomic_dec(&perf->tsync); + + if (rc < 0) { + pr_err("%s: failed\n", current->comm); + rc = -ENXIO; + goto err; + } + + for (i = 0; i < MAX_SRCS; i++) { + kfree(pctx->srcs[i]); + pctx->srcs[i] = NULL; + } + + return 0; + +err: + for (i = 0; i < MAX_SRCS; i++) { + kfree(pctx->srcs[i]); + pctx->srcs[i] = NULL; + } + + if (dma_chan) { + dma_release_channel(dma_chan); + pctx->dma_chan = NULL; + } + + return rc; +} + +static void perf_free_mw(struct perf_ctx *perf) +{ + struct perf_mw *mw = &perf->mw; + struct pci_dev *pdev = perf->ntb->pdev; + + if (!mw->virt_addr) + return; + + ntb_mw_clear_trans(perf->ntb, 0); + dma_free_coherent(&pdev->dev, mw->buf_size, + mw->virt_addr, mw->dma_addr); + mw->xlat_size = 0; + mw->buf_size = 0; + mw->virt_addr = NULL; +} + +static int perf_set_mw(struct perf_ctx *perf, resource_size_t size) +{ + struct perf_mw *mw = &perf->mw; + size_t xlat_size, buf_size; + + if (!size) + return -EINVAL; + + xlat_size = round_up(size, mw->xlat_align_size); + buf_size = round_up(size, mw->xlat_align); + + if (mw->xlat_size == xlat_size) + return 0; + + if (mw->buf_size) + perf_free_mw(perf); + + mw->xlat_size = xlat_size; + mw->buf_size = buf_size; + + mw->virt_addr = dma_alloc_coherent(&perf->ntb->pdev->dev, buf_size, + &mw->dma_addr, GFP_KERNEL); + if (!mw->virt_addr) { + mw->xlat_size = 0; + mw->buf_size = 0; + } + + return 0; +} + +static void perf_link_work(struct work_struct *work) +{ 
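	/*
	 * (Added note: the body below is a small scratchpad handshake.
	 * Each side writes its memory-window size and a protocol version
	 * into the peer's scratchpad registers, then reads its own side
	 * to see whether the peer has done the same. For example, a
	 * 64 MiB window is advertised as MW_SZ_HIGH = 0x0 and
	 * MW_SZ_LOW = 0x4000000; only once VERSION reads back as
	 * PERF_VERSION (0xffff0001) does perf_set_mw() size the local
	 * buffer to match the remote window, otherwise the work is
	 * rescheduled until the link settles.)
	 */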
+ struct perf_ctx *perf = + container_of(work, struct perf_ctx, link_work.work); + struct ntb_dev *ndev = perf->ntb; + struct pci_dev *pdev = ndev->pdev; + u32 val; + u64 size; + int rc; + + dev_dbg(&perf->ntb->pdev->dev, "%s called\n", __func__); + + size = perf->mw.phys_size; + ntb_peer_spad_write(ndev, MW_SZ_HIGH, upper_32_bits(size)); + ntb_peer_spad_write(ndev, MW_SZ_LOW, lower_32_bits(size)); + ntb_peer_spad_write(ndev, VERSION, PERF_VERSION); + + /* now read what peer wrote */ + val = ntb_spad_read(ndev, VERSION); + if (val != PERF_VERSION) { + dev_dbg(&pdev->dev, "Remote version = %#x\n", val); + goto out; + } + + val = ntb_spad_read(ndev, MW_SZ_HIGH); + size = (u64)val << 32; + + val = ntb_spad_read(ndev, MW_SZ_LOW); + size |= val; + + dev_dbg(&pdev->dev, "Remote MW size = %#llx\n", size); + + rc = perf_set_mw(perf, size); + if (rc) + goto out1; + + perf->link_is_up = true; + + return; + +out1: + perf_free_mw(perf); + +out: + if (ntb_link_is_up(ndev, NULL, NULL) == 1) + schedule_delayed_work(&perf->link_work, + msecs_to_jiffies(PERF_LINK_DOWN_TIMEOUT)); +} + +static void perf_link_cleanup(struct work_struct *work) +{ + struct perf_ctx *perf = container_of(work, + struct perf_ctx, + link_cleanup); + + dev_dbg(&perf->ntb->pdev->dev, "%s called\n", __func__); + + if (!perf->link_is_up) + cancel_delayed_work_sync(&perf->link_work); +} + +static int perf_setup_mw(struct ntb_dev *ntb, struct perf_ctx *perf) +{ + struct perf_mw *mw; + int rc; + + mw = &perf->mw; + + rc = ntb_mw_get_range(ntb, 0, &mw->phys_addr, &mw->phys_size, + &mw->xlat_align, &mw->xlat_align_size); + if (rc) + return rc; + + perf->mw.vbase = ioremap_wc(mw->phys_addr, mw->phys_size); + if (!mw->vbase) + return -ENOMEM; + + return 0; +} + +static ssize_t debugfs_run_read(struct file *filp, char __user *ubuf, + size_t count, loff_t *offp) +{ + struct perf_ctx *perf = filp->private_data; + char *buf; + ssize_t ret, out_offset; + + if (!perf) + return 0; + + buf = kmalloc(64, GFP_KERNEL); + out_offset = snprintf(buf, 64, "%d\n", perf->run); + ret = simple_read_from_buffer(ubuf, count, offp, buf, out_offset); + kfree(buf); + + return ret; +} + +static ssize_t debugfs_run_write(struct file *filp, const char __user *ubuf, + size_t count, loff_t *offp) +{ + struct perf_ctx *perf = filp->private_data; + int node, i; + + if (!perf->link_is_up) + return 0; + + if (perf->perf_threads == 0) + return 0; + + if (atomic_read(&perf->tsync) == 0) + perf->run = false; + + if (perf->run) { + /* lets stop the threads */ + perf->run = false; + for (i = 0; i < MAX_THREADS; i++) { + if (perf->pthr_ctx[i].thread) { + kthread_stop(perf->pthr_ctx[i].thread); + perf->pthr_ctx[i].thread = NULL; + } else + break; + } + } else { + perf->run = true; + + if (perf->perf_threads > MAX_THREADS) { + perf->perf_threads = MAX_THREADS; + pr_info("Reset total threads to: %u\n", MAX_THREADS); + } + + /* no greater than 1M */ + if (seg_order > MAX_SEG_ORDER) { + seg_order = MAX_SEG_ORDER; + pr_info("Fix seg_order to %u\n", seg_order); + } + + if (run_order < seg_order) { + run_order = seg_order; + pr_info("Fix run_order to %u\n", run_order); + } + + node = dev_to_node(&perf->ntb->pdev->dev); + /* launch kernel thread */ + for (i = 0; i < perf->perf_threads; i++) { + struct pthr_ctx *pctx; + + pctx = &perf->pthr_ctx[i]; + atomic_set(&pctx->dma_sync, 0); + pctx->perf = perf; + pctx->thread = + kthread_create_on_node(ntb_perf_thread, + (void *)pctx, + node, "ntb_perf %d", i); + if (pctx->thread) + wake_up_process(pctx->thread); + else { + perf->run = false; + for 
(i = 0; i < MAX_THREADS; i++) { + if (pctx->thread) { + kthread_stop(pctx->thread); + pctx->thread = NULL; + } + } + } + + if (perf->run == false) + return -ENXIO; + } + + } + + return count; +} + +static const struct file_operations ntb_perf_debugfs_run = { + .owner = THIS_MODULE, + .open = simple_open, + .read = debugfs_run_read, + .write = debugfs_run_write, +}; + +static int perf_debugfs_setup(struct perf_ctx *perf) +{ + struct pci_dev *pdev = perf->ntb->pdev; + + if (!debugfs_initialized()) + return -ENODEV; + + if (!perf_debugfs_dir) { + perf_debugfs_dir = debugfs_create_dir(KBUILD_MODNAME, NULL); + if (!perf_debugfs_dir) + return -ENODEV; + } + + perf->debugfs_node_dir = debugfs_create_dir(pci_name(pdev), + perf_debugfs_dir); + if (!perf->debugfs_node_dir) + return -ENODEV; + + perf->debugfs_run = debugfs_create_file("run", S_IRUSR | S_IWUSR, + perf->debugfs_node_dir, perf, + &ntb_perf_debugfs_run); + if (!perf->debugfs_run) + return -ENODEV; + + perf->debugfs_threads = debugfs_create_u8("threads", S_IRUSR | S_IWUSR, + perf->debugfs_node_dir, + &perf->perf_threads); + if (!perf->debugfs_threads) + return -ENODEV; + + return 0; +} + +static int perf_probe(struct ntb_client *client, struct ntb_dev *ntb) +{ + struct pci_dev *pdev = ntb->pdev; + struct perf_ctx *perf; + int node; + int rc = 0; + + node = dev_to_node(&pdev->dev); + + perf = kzalloc_node(sizeof(*perf), GFP_KERNEL, node); + if (!perf) { + rc = -ENOMEM; + goto err_perf; + } + + perf->ntb = ntb; + perf->perf_threads = 1; + atomic_set(&perf->tsync, 0); + perf->run = false; + spin_lock_init(&perf->db_lock); + perf_setup_mw(ntb, perf); + INIT_DELAYED_WORK(&perf->link_work, perf_link_work); + INIT_WORK(&perf->link_cleanup, perf_link_cleanup); + + rc = ntb_set_ctx(ntb, perf, &perf_ops); + if (rc) + goto err_ctx; + + perf->link_is_up = false; + ntb_link_enable(ntb, NTB_SPEED_AUTO, NTB_WIDTH_AUTO); + ntb_link_event(ntb); + + rc = perf_debugfs_setup(perf); + if (rc) + goto err_ctx; + + return 0; + +err_ctx: + cancel_delayed_work_sync(&perf->link_work); + cancel_work_sync(&perf->link_cleanup); + kfree(perf); +err_perf: + return rc; +} + +static void perf_remove(struct ntb_client *client, struct ntb_dev *ntb) +{ + struct perf_ctx *perf = ntb->ctx; + int i; + + dev_dbg(&perf->ntb->dev, "%s called\n", __func__); + + cancel_delayed_work_sync(&perf->link_work); + cancel_work_sync(&perf->link_cleanup); + + ntb_clear_ctx(ntb); + ntb_link_disable(ntb); + + debugfs_remove_recursive(perf_debugfs_dir); + perf_debugfs_dir = NULL; + + if (use_dma) { + for (i = 0; i < MAX_THREADS; i++) { + struct pthr_ctx *pctx = &perf->pthr_ctx[i]; + + if (pctx->dma_chan) + dma_release_channel(pctx->dma_chan); + } + } + + kfree(perf); +} + +static struct ntb_client perf_client = { + .ops = { + .probe = perf_probe, + .remove = perf_remove, + }, +}; +module_ntb_client(perf_client); diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig index 002a94abdbc4..5d6237391dcd 100644 --- a/drivers/nvme/host/Kconfig +++ b/drivers/nvme/host/Kconfig @@ -8,3 +8,14 @@ config BLK_DEV_NVME To compile this driver as a module, choose M here: the module will be called nvme. + +config BLK_DEV_NVME_SCSI + bool "SCSI emulation for NVMe device nodes" + depends on BLK_DEV_NVME + ---help--- + This adds support for the SG_IO ioctl on the NVMe character + and block devices nodes, as well a a translation for a small + number of selected SCSI commands to NVMe commands to the NVMe + driver. 
If you don't know what this means you probably want + to say N here, and if you know what it means you probably + want to say N as well. diff --git a/drivers/nvme/host/Makefile b/drivers/nvme/host/Makefile index a5fe23952586..51bf90871549 100644 --- a/drivers/nvme/host/Makefile +++ b/drivers/nvme/host/Makefile @@ -1,5 +1,6 @@ obj-$(CONFIG_BLK_DEV_NVME) += nvme.o -lightnvm-$(CONFIG_NVM) := lightnvm.o -nvme-y += pci.o scsi.o $(lightnvm-y) +lightnvm-$(CONFIG_NVM) := lightnvm.o +nvme-y += core.o pci.o $(lightnvm-y) +nvme-$(CONFIG_BLK_DEV_NVME_SCSI) += scsi.o diff --git a/drivers/nvme/host/core.c b/drivers/nvme/host/core.c new file mode 100644 index 000000000000..c5bf001af559 --- /dev/null +++ b/drivers/nvme/host/core.c @@ -0,0 +1,1472 @@ +/* + * NVM Express device driver + * Copyright (c) 2011-2014, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + */ + +#include <linux/blkdev.h> +#include <linux/blk-mq.h> +#include <linux/delay.h> +#include <linux/errno.h> +#include <linux/hdreg.h> +#include <linux/kernel.h> +#include <linux/module.h> +#include <linux/list_sort.h> +#include <linux/slab.h> +#include <linux/types.h> +#include <linux/pr.h> +#include <linux/ptrace.h> +#include <linux/nvme_ioctl.h> +#include <linux/t10-pi.h> +#include <scsi/sg.h> +#include <asm/unaligned.h> + +#include "nvme.h" + +#define NVME_MINORS (1U << MINORBITS) + +static int nvme_major; +module_param(nvme_major, int, 0); + +static int nvme_char_major; +module_param(nvme_char_major, int, 0); + +static LIST_HEAD(nvme_ctrl_list); +DEFINE_SPINLOCK(dev_list_lock); + +static struct class *nvme_class; + +static void nvme_free_ns(struct kref *kref) +{ + struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); + + if (ns->type == NVME_NS_LIGHTNVM) + nvme_nvm_unregister(ns->queue, ns->disk->disk_name); + + spin_lock(&dev_list_lock); + ns->disk->private_data = NULL; + spin_unlock(&dev_list_lock); + + nvme_put_ctrl(ns->ctrl); + put_disk(ns->disk); + kfree(ns); +} + +static void nvme_put_ns(struct nvme_ns *ns) +{ + kref_put(&ns->kref, nvme_free_ns); +} + +static struct nvme_ns *nvme_get_ns_from_disk(struct gendisk *disk) +{ + struct nvme_ns *ns; + + spin_lock(&dev_list_lock); + ns = disk->private_data; + if (ns && !kref_get_unless_zero(&ns->kref)) + ns = NULL; + spin_unlock(&dev_list_lock); + + return ns; +} + +void nvme_requeue_req(struct request *req) +{ + unsigned long flags; + + blk_mq_requeue_request(req); + spin_lock_irqsave(req->q->queue_lock, flags); + if (!blk_queue_stopped(req->q)) + blk_mq_kick_requeue_list(req->q); + spin_unlock_irqrestore(req->q->queue_lock, flags); +} + +struct request *nvme_alloc_request(struct request_queue *q, + struct nvme_command *cmd, unsigned int flags) +{ + bool write = cmd->common.opcode & 1; + struct request *req; + + req = blk_mq_alloc_request(q, write, flags); + if (IS_ERR(req)) + return req; + + req->cmd_type = REQ_TYPE_DRV_PRIV; + req->cmd_flags |= REQ_FAILFAST_DRIVER; + req->__data_len = 0; + req->__sector = (sector_t) -1; + req->bio = req->biotail = NULL; + + req->cmd = (unsigned char *)cmd; + req->cmd_len = sizeof(struct nvme_command); + req->special = (void 
*)0; + + return req; +} + +/* + * Returns 0 on success. If the result is negative, it's a Linux error code; + * if the result is positive, it's an NVM Express status code + */ +int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + void *buffer, unsigned bufflen, u32 *result, unsigned timeout) +{ + struct request *req; + int ret; + + req = nvme_alloc_request(q, cmd, 0); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->timeout = timeout ? timeout : ADMIN_TIMEOUT; + + if (buffer && bufflen) { + ret = blk_rq_map_kern(q, req, buffer, bufflen, GFP_KERNEL); + if (ret) + goto out; + } + + blk_execute_rq(req->q, NULL, req, 0); + if (result) + *result = (u32)(uintptr_t)req->special; + ret = req->errors; + out: + blk_mq_free_request(req); + return ret; +} + +int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, + void *buffer, unsigned bufflen) +{ + return __nvme_submit_sync_cmd(q, cmd, buffer, bufflen, NULL, 0); +} + +int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, + void __user *ubuffer, unsigned bufflen, + void __user *meta_buffer, unsigned meta_len, u32 meta_seed, + u32 *result, unsigned timeout) +{ + bool write = cmd->common.opcode & 1; + struct nvme_ns *ns = q->queuedata; + struct gendisk *disk = ns ? ns->disk : NULL; + struct request *req; + struct bio *bio = NULL; + void *meta = NULL; + int ret; + + req = nvme_alloc_request(q, cmd, 0); + if (IS_ERR(req)) + return PTR_ERR(req); + + req->timeout = timeout ? timeout : ADMIN_TIMEOUT; + + if (ubuffer && bufflen) { + ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, + GFP_KERNEL); + if (ret) + goto out; + bio = req->bio; + + if (!disk) + goto submit; + bio->bi_bdev = bdget_disk(disk, 0); + if (!bio->bi_bdev) { + ret = -ENODEV; + goto out_unmap; + } + + if (meta_buffer) { + struct bio_integrity_payload *bip; + + meta = kmalloc(meta_len, GFP_KERNEL); + if (!meta) { + ret = -ENOMEM; + goto out_unmap; + } + + if (write) { + if (copy_from_user(meta, meta_buffer, + meta_len)) { + ret = -EFAULT; + goto out_free_meta; + } + } + + bip = bio_integrity_alloc(bio, GFP_KERNEL, 1); + if (IS_ERR(bip)) { + ret = PTR_ERR(bip); + goto out_free_meta; + } + + bip->bip_iter.bi_size = meta_len; + bip->bip_iter.bi_sector = meta_seed; + + ret = bio_integrity_add_page(bio, virt_to_page(meta), + meta_len, offset_in_page(meta)); + if (ret != meta_len) { + ret = -ENOMEM; + goto out_free_meta; + } + } + } + submit: + blk_execute_rq(req->q, disk, req, 0); + ret = req->errors; + if (result) + *result = (u32)(uintptr_t)req->special; + if (meta && !ret && !write) { + if (copy_to_user(meta_buffer, meta, meta_len)) + ret = -EFAULT; + } + out_free_meta: + kfree(meta); + out_unmap: + if (bio) { + if (disk && bio->bi_bdev) + bdput(bio->bi_bdev); + blk_rq_unmap_user(bio); + } + out: + blk_mq_free_request(req); + return ret; +} + +int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, + void __user *ubuffer, unsigned bufflen, u32 *result, + unsigned timeout) +{ + return __nvme_submit_user_cmd(q, cmd, ubuffer, bufflen, NULL, 0, 0, + result, timeout); +} + +int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id) +{ + struct nvme_command c = { }; + int error; + + /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ + c.identify.opcode = nvme_admin_identify; + c.identify.cns = cpu_to_le32(1); + + *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL); + if (!*id) + return -ENOMEM; + + error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, + sizeof(struct 
nvme_id_ctrl)); + if (error) + kfree(*id); + return error; +} + +static int nvme_identify_ns_list(struct nvme_ctrl *dev, unsigned nsid, __le32 *ns_list) +{ + struct nvme_command c = { }; + + c.identify.opcode = nvme_admin_identify; + c.identify.cns = cpu_to_le32(2); + c.identify.nsid = cpu_to_le32(nsid); + return nvme_submit_sync_cmd(dev->admin_q, &c, ns_list, 0x1000); +} + +int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, + struct nvme_id_ns **id) +{ + struct nvme_command c = { }; + int error; + + /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ + c.identify.opcode = nvme_admin_identify, + c.identify.nsid = cpu_to_le32(nsid), + + *id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL); + if (!*id) + return -ENOMEM; + + error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, + sizeof(struct nvme_id_ns)); + if (error) + kfree(*id); + return error; +} + +int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, + dma_addr_t dma_addr, u32 *result) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_get_features; + c.features.nsid = cpu_to_le32(nsid); + c.features.prp1 = cpu_to_le64(dma_addr); + c.features.fid = cpu_to_le32(fid); + + return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0); +} + +int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, + dma_addr_t dma_addr, u32 *result) +{ + struct nvme_command c; + + memset(&c, 0, sizeof(c)); + c.features.opcode = nvme_admin_set_features; + c.features.prp1 = cpu_to_le64(dma_addr); + c.features.fid = cpu_to_le32(fid); + c.features.dword11 = cpu_to_le32(dword11); + + return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0, result, 0); +} + +int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log) +{ + struct nvme_command c = { }; + int error; + + c.common.opcode = nvme_admin_get_log_page, + c.common.nsid = cpu_to_le32(0xFFFFFFFF), + c.common.cdw10[0] = cpu_to_le32( + (((sizeof(struct nvme_smart_log) / 4) - 1) << 16) | + NVME_LOG_SMART), + + *log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL); + if (!*log) + return -ENOMEM; + + error = nvme_submit_sync_cmd(dev->admin_q, &c, *log, + sizeof(struct nvme_smart_log)); + if (error) + kfree(*log); + return error; +} + +int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count) +{ + u32 q_count = (*count - 1) | ((*count - 1) << 16); + u32 result; + int status, nr_io_queues; + + status = nvme_set_features(ctrl, NVME_FEAT_NUM_QUEUES, q_count, 0, + &result); + if (status) + return status; + + nr_io_queues = min(result & 0xffff, result >> 16) + 1; + *count = min(*count, nr_io_queues); + return 0; +} + +static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) +{ + struct nvme_user_io io; + struct nvme_command c; + unsigned length, meta_len; + void __user *metadata; + + if (copy_from_user(&io, uio, sizeof(io))) + return -EFAULT; + + switch (io.opcode) { + case nvme_cmd_write: + case nvme_cmd_read: + case nvme_cmd_compare: + break; + default: + return -EINVAL; + } + + length = (io.nblocks + 1) << ns->lba_shift; + meta_len = (io.nblocks + 1) * ns->ms; + metadata = (void __user *)(uintptr_t)io.metadata; + + if (ns->ext) { + length += meta_len; + meta_len = 0; + } else if (meta_len) { + if ((io.metadata & 3) || !io.metadata) + return -EINVAL; + } + + memset(&c, 0, sizeof(c)); + c.rw.opcode = io.opcode; + c.rw.flags = io.flags; + c.rw.nsid = cpu_to_le32(ns->ns_id); + c.rw.slba = cpu_to_le64(io.slba); + c.rw.length = cpu_to_le16(io.nblocks); + c.rw.control = 
cpu_to_le16(io.control); + c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); + c.rw.reftag = cpu_to_le32(io.reftag); + c.rw.apptag = cpu_to_le16(io.apptag); + c.rw.appmask = cpu_to_le16(io.appmask); + + return __nvme_submit_user_cmd(ns->queue, &c, + (void __user *)(uintptr_t)io.addr, length, + metadata, meta_len, io.slba, NULL, 0); +} + +static int nvme_user_cmd(struct nvme_ctrl *ctrl, struct nvme_ns *ns, + struct nvme_passthru_cmd __user *ucmd) +{ + struct nvme_passthru_cmd cmd; + struct nvme_command c; + unsigned timeout = 0; + int status; + + if (!capable(CAP_SYS_ADMIN)) + return -EACCES; + if (copy_from_user(&cmd, ucmd, sizeof(cmd))) + return -EFAULT; + + memset(&c, 0, sizeof(c)); + c.common.opcode = cmd.opcode; + c.common.flags = cmd.flags; + c.common.nsid = cpu_to_le32(cmd.nsid); + c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); + c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); + c.common.cdw10[0] = cpu_to_le32(cmd.cdw10); + c.common.cdw10[1] = cpu_to_le32(cmd.cdw11); + c.common.cdw10[2] = cpu_to_le32(cmd.cdw12); + c.common.cdw10[3] = cpu_to_le32(cmd.cdw13); + c.common.cdw10[4] = cpu_to_le32(cmd.cdw14); + c.common.cdw10[5] = cpu_to_le32(cmd.cdw15); + + if (cmd.timeout_ms) + timeout = msecs_to_jiffies(cmd.timeout_ms); + + status = nvme_submit_user_cmd(ns ? ns->queue : ctrl->admin_q, &c, + (void __user *)(uintptr_t)cmd.addr, cmd.data_len, + &cmd.result, timeout); + if (status >= 0) { + if (put_user(cmd.result, &ucmd->result)) + return -EFAULT; + } + + return status; +} + +static int nvme_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + struct nvme_ns *ns = bdev->bd_disk->private_data; + + switch (cmd) { + case NVME_IOCTL_ID: + force_successful_syscall_return(); + return ns->ns_id; + case NVME_IOCTL_ADMIN_CMD: + return nvme_user_cmd(ns->ctrl, NULL, (void __user *)arg); + case NVME_IOCTL_IO_CMD: + return nvme_user_cmd(ns->ctrl, ns, (void __user *)arg); + case NVME_IOCTL_SUBMIT_IO: + return nvme_submit_io(ns, (void __user *)arg); +#ifdef CONFIG_BLK_DEV_NVME_SCSI + case SG_GET_VERSION_NUM: + return nvme_sg_get_version_num((void __user *)arg); + case SG_IO: + return nvme_sg_io(ns, (void __user *)arg); +#endif + default: + return -ENOTTY; + } +} + +#ifdef CONFIG_COMPAT +static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, + unsigned int cmd, unsigned long arg) +{ + switch (cmd) { + case SG_IO: + return -ENOIOCTLCMD; + } + return nvme_ioctl(bdev, mode, cmd, arg); +} +#else +#define nvme_compat_ioctl NULL +#endif + +static int nvme_open(struct block_device *bdev, fmode_t mode) +{ + return nvme_get_ns_from_disk(bdev->bd_disk) ? 
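	/* (added note) non-NULL: a namespace kref is now held; nvme_release() drops it via nvme_put_ns() */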
0 : -ENXIO; +} + +static void nvme_release(struct gendisk *disk, fmode_t mode) +{ + nvme_put_ns(disk->private_data); +} + +static int nvme_getgeo(struct block_device *bdev, struct hd_geometry *geo) +{ + /* some standard values */ + geo->heads = 1 << 6; + geo->sectors = 1 << 5; + geo->cylinders = get_capacity(bdev->bd_disk) >> 11; + return 0; +} + +#ifdef CONFIG_BLK_DEV_INTEGRITY +static void nvme_init_integrity(struct nvme_ns *ns) +{ + struct blk_integrity integrity; + + switch (ns->pi_type) { + case NVME_NS_DPS_PI_TYPE3: + integrity.profile = &t10_pi_type3_crc; + break; + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + integrity.profile = &t10_pi_type1_crc; + break; + default: + integrity.profile = NULL; + break; + } + integrity.tuple_size = ns->ms; + blk_integrity_register(ns->disk, &integrity); + blk_queue_max_integrity_segments(ns->queue, 1); +} +#else +static void nvme_init_integrity(struct nvme_ns *ns) +{ +} +#endif /* CONFIG_BLK_DEV_INTEGRITY */ + +static void nvme_config_discard(struct nvme_ns *ns) +{ + u32 logical_block_size = queue_logical_block_size(ns->queue); + ns->queue->limits.discard_zeroes_data = 0; + ns->queue->limits.discard_alignment = logical_block_size; + ns->queue->limits.discard_granularity = logical_block_size; + blk_queue_max_discard_sectors(ns->queue, 0xffffffff); + queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); +} + +static int nvme_revalidate_disk(struct gendisk *disk) +{ + struct nvme_ns *ns = disk->private_data; + struct nvme_id_ns *id; + u8 lbaf, pi_type; + u16 old_ms; + unsigned short bs; + + if (nvme_identify_ns(ns->ctrl, ns->ns_id, &id)) { + dev_warn(ns->ctrl->dev, "%s: Identify failure nvme%dn%d\n", + __func__, ns->ctrl->instance, ns->ns_id); + return -ENODEV; + } + if (id->ncap == 0) { + kfree(id); + return -ENODEV; + } + + if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) { + if (nvme_nvm_register(ns->queue, disk->disk_name)) { + dev_warn(ns->ctrl->dev, + "%s: LightNVM init failure\n", __func__); + kfree(id); + return -ENODEV; + } + ns->type = NVME_NS_LIGHTNVM; + } + + if (ns->ctrl->vs >= NVME_VS(1, 1)) + memcpy(ns->eui, id->eui64, sizeof(ns->eui)); + if (ns->ctrl->vs >= NVME_VS(1, 2)) + memcpy(ns->uuid, id->nguid, sizeof(ns->uuid)); + + old_ms = ns->ms; + lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; + ns->lba_shift = id->lbaf[lbaf].ds; + ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); + ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); + + /* + * If identify namespace failed, use default 512 byte block size so + * block layer can use before failing read/write for 0 capacity. + */ + if (ns->lba_shift == 0) + ns->lba_shift = 9; + bs = 1 << ns->lba_shift; + /* XXX: PI implementation requires metadata equal t10 pi tuple size */ + pi_type = ns->ms == sizeof(struct t10_pi_tuple) ? 
+ id->dps & NVME_NS_DPS_PI_MASK : 0; + + blk_mq_freeze_queue(disk->queue); + if (blk_get_integrity(disk) && (ns->pi_type != pi_type || + ns->ms != old_ms || + bs != queue_logical_block_size(disk->queue) || + (ns->ms && ns->ext))) + blk_integrity_unregister(disk); + + ns->pi_type = pi_type; + blk_queue_logical_block_size(ns->queue, bs); + + if (ns->ms && !blk_get_integrity(disk) && !ns->ext) + nvme_init_integrity(ns); + if (ns->ms && !(ns->ms == 8 && ns->pi_type) && !blk_get_integrity(disk)) + set_capacity(disk, 0); + else + set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); + + if (ns->ctrl->oncs & NVME_CTRL_ONCS_DSM) + nvme_config_discard(ns); + blk_mq_unfreeze_queue(disk->queue); + + kfree(id); + return 0; +} + +static char nvme_pr_type(enum pr_type type) +{ + switch (type) { + case PR_WRITE_EXCLUSIVE: + return 1; + case PR_EXCLUSIVE_ACCESS: + return 2; + case PR_WRITE_EXCLUSIVE_REG_ONLY: + return 3; + case PR_EXCLUSIVE_ACCESS_REG_ONLY: + return 4; + case PR_WRITE_EXCLUSIVE_ALL_REGS: + return 5; + case PR_EXCLUSIVE_ACCESS_ALL_REGS: + return 6; + default: + return 0; + } +}; + +static int nvme_pr_command(struct block_device *bdev, u32 cdw10, + u64 key, u64 sa_key, u8 op) +{ + struct nvme_ns *ns = bdev->bd_disk->private_data; + struct nvme_command c; + u8 data[16] = { 0, }; + + put_unaligned_le64(key, &data[0]); + put_unaligned_le64(sa_key, &data[8]); + + memset(&c, 0, sizeof(c)); + c.common.opcode = op; + c.common.nsid = cpu_to_le32(ns->ns_id); + c.common.cdw10[0] = cpu_to_le32(cdw10); + + return nvme_submit_sync_cmd(ns->queue, &c, data, 16); +} + +static int nvme_pr_register(struct block_device *bdev, u64 old, + u64 new, unsigned flags) +{ + u32 cdw10; + + if (flags & ~PR_FL_IGNORE_KEY) + return -EOPNOTSUPP; + + cdw10 = old ? 2 : 0; + cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0; + cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */ + return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register); +} + +static int nvme_pr_reserve(struct block_device *bdev, u64 key, + enum pr_type type, unsigned flags) +{ + u32 cdw10; + + if (flags & ~PR_FL_IGNORE_KEY) + return -EOPNOTSUPP; + + cdw10 = nvme_pr_type(type) << 8; + cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0); + return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire); +} + +static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new, + enum pr_type type, bool abort) +{ + u32 cdw10 = nvme_pr_type(type) << 8 | abort ? 2 : 1; + return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire); +} + +static int nvme_pr_clear(struct block_device *bdev, u64 key) +{ + u32 cdw10 = 1 | (key ? 1 << 3 : 0); + return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register); +} + +static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type) +{ + u32 cdw10 = nvme_pr_type(type) << 8 | key ? 
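	/* (added note) as in nvme_pr_preempt() above, | binds tighter than ?:, so this evaluates ((nvme_pr_type(type) << 8) | key) ? (1 << 3) : 0 — the shifted type lands in the truth test, not in the resulting cdw10 */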
1 << 3 : 0; + return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release); +} + +static const struct pr_ops nvme_pr_ops = { + .pr_register = nvme_pr_register, + .pr_reserve = nvme_pr_reserve, + .pr_release = nvme_pr_release, + .pr_preempt = nvme_pr_preempt, + .pr_clear = nvme_pr_clear, +}; + +static const struct block_device_operations nvme_fops = { + .owner = THIS_MODULE, + .ioctl = nvme_ioctl, + .compat_ioctl = nvme_compat_ioctl, + .open = nvme_open, + .release = nvme_release, + .getgeo = nvme_getgeo, + .revalidate_disk= nvme_revalidate_disk, + .pr_ops = &nvme_pr_ops, +}; + +static int nvme_wait_ready(struct nvme_ctrl *ctrl, u64 cap, bool enabled) +{ + unsigned long timeout = + ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; + u32 csts, bit = enabled ? NVME_CSTS_RDY : 0; + int ret; + + while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { + if ((csts & NVME_CSTS_RDY) == bit) + break; + + msleep(100); + if (fatal_signal_pending(current)) + return -EINTR; + if (time_after(jiffies, timeout)) { + dev_err(ctrl->dev, + "Device not ready; aborting %s\n", enabled ? + "initialisation" : "reset"); + return -ENODEV; + } + } + + return ret; +} + +/* + * If the device has been passed off to us in an enabled state, just clear + * the enabled bit. The spec says we should set the 'shutdown notification + * bits', but doing so may cause the device to complete commands to the + * admin queue ... and we don't know what memory that might be pointing at! + */ +int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap) +{ + int ret; + + ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; + ctrl->ctrl_config &= ~NVME_CC_ENABLE; + + ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); + if (ret) + return ret; + return nvme_wait_ready(ctrl, cap, false); +} + +int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap) +{ + /* + * Default to a 4K page size, with the intention to update this + * path in the future to accomodate architectures with differing + * kernel and IO page sizes. + */ + unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12, page_shift = 12; + int ret; + + if (page_shift < dev_page_min) { + dev_err(ctrl->dev, + "Minimum device page size %u too large for host (%u)\n", + 1 << dev_page_min, 1 << page_shift); + return -ENODEV; + } + + ctrl->page_size = 1 << page_shift; + + ctrl->ctrl_config = NVME_CC_CSS_NVM; + ctrl->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; + ctrl->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; + ctrl->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; + ctrl->ctrl_config |= NVME_CC_ENABLE; + + ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); + if (ret) + return ret; + return nvme_wait_ready(ctrl, cap, true); +} + +int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl) +{ + unsigned long timeout = SHUTDOWN_TIMEOUT + jiffies; + u32 csts; + int ret; + + ctrl->ctrl_config &= ~NVME_CC_SHN_MASK; + ctrl->ctrl_config |= NVME_CC_SHN_NORMAL; + + ret = ctrl->ops->reg_write32(ctrl, NVME_REG_CC, ctrl->ctrl_config); + if (ret) + return ret; + + while ((ret = ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &csts)) == 0) { + if ((csts & NVME_CSTS_SHST_MASK) == NVME_CSTS_SHST_CMPLT) + break; + + msleep(100); + if (fatal_signal_pending(current)) + return -EINTR; + if (time_after(jiffies, timeout)) { + dev_err(ctrl->dev, + "Device shutdown incomplete; abort shutdown\n"); + return -ENODEV; + } + } + + return ret; +} + +/* + * Initialize the cached copies of the Identify data and various controller + * register in our nvme_ctrl structure. 
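 * (Added note: "various controller register" here means VS and CAP,
 * fetched through the transport's reg_read32()/reg_read64() ops.)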
This should be called as soon as + * the admin queue is fully up and running. + */ +int nvme_init_identify(struct nvme_ctrl *ctrl) +{ + struct nvme_id_ctrl *id; + u64 cap; + int ret, page_shift; + + ret = ctrl->ops->reg_read32(ctrl, NVME_REG_VS, &ctrl->vs); + if (ret) { + dev_err(ctrl->dev, "Reading VS failed (%d)\n", ret); + return ret; + } + + ret = ctrl->ops->reg_read64(ctrl, NVME_REG_CAP, &cap); + if (ret) { + dev_err(ctrl->dev, "Reading CAP failed (%d)\n", ret); + return ret; + } + page_shift = NVME_CAP_MPSMIN(cap) + 12; + + if (ctrl->vs >= NVME_VS(1, 1)) + ctrl->subsystem = NVME_CAP_NSSRC(cap); + + ret = nvme_identify_ctrl(ctrl, &id); + if (ret) { + dev_err(ctrl->dev, "Identify Controller failed (%d)\n", ret); + return -EIO; + } + + ctrl->oncs = le16_to_cpup(&id->oncs); + atomic_set(&ctrl->abort_limit, id->acl + 1); + ctrl->vwc = id->vwc; + memcpy(ctrl->serial, id->sn, sizeof(id->sn)); + memcpy(ctrl->model, id->mn, sizeof(id->mn)); + memcpy(ctrl->firmware_rev, id->fr, sizeof(id->fr)); + if (id->mdts) + ctrl->max_hw_sectors = 1 << (id->mdts + page_shift - 9); + else + ctrl->max_hw_sectors = UINT_MAX; + + if ((ctrl->quirks & NVME_QUIRK_STRIPE_SIZE) && id->vs[3]) { + unsigned int max_hw_sectors; + + ctrl->stripe_size = 1 << (id->vs[3] + page_shift); + max_hw_sectors = ctrl->stripe_size >> (page_shift - 9); + if (ctrl->max_hw_sectors) { + ctrl->max_hw_sectors = min(max_hw_sectors, + ctrl->max_hw_sectors); + } else { + ctrl->max_hw_sectors = max_hw_sectors; + } + } + + kfree(id); + return 0; +} + +static int nvme_dev_open(struct inode *inode, struct file *file) +{ + struct nvme_ctrl *ctrl; + int instance = iminor(inode); + int ret = -ENODEV; + + spin_lock(&dev_list_lock); + list_for_each_entry(ctrl, &nvme_ctrl_list, node) { + if (ctrl->instance != instance) + continue; + + if (!ctrl->admin_q) { + ret = -EWOULDBLOCK; + break; + } + if (!kref_get_unless_zero(&ctrl->kref)) + break; + file->private_data = ctrl; + ret = 0; + break; + } + spin_unlock(&dev_list_lock); + + return ret; +} + +static int nvme_dev_release(struct inode *inode, struct file *file) +{ + nvme_put_ctrl(file->private_data); + return 0; +} + +static int nvme_dev_user_cmd(struct nvme_ctrl *ctrl, void __user *argp) +{ + struct nvme_ns *ns; + int ret; + + mutex_lock(&ctrl->namespaces_mutex); + if (list_empty(&ctrl->namespaces)) { + ret = -ENOTTY; + goto out_unlock; + } + + ns = list_first_entry(&ctrl->namespaces, struct nvme_ns, list); + if (ns != list_last_entry(&ctrl->namespaces, struct nvme_ns, list)) { + dev_warn(ctrl->dev, + "NVME_IOCTL_IO_CMD not supported when multiple namespaces present!\n"); + ret = -EINVAL; + goto out_unlock; + } + + dev_warn(ctrl->dev, + "using deprecated NVME_IOCTL_IO_CMD ioctl on the char device!\n"); + kref_get(&ns->kref); + mutex_unlock(&ctrl->namespaces_mutex); + + ret = nvme_user_cmd(ctrl, ns, argp); + nvme_put_ns(ns); + return ret; + +out_unlock: + mutex_unlock(&ctrl->namespaces_mutex); + return ret; +} + +static long nvme_dev_ioctl(struct file *file, unsigned int cmd, + unsigned long arg) +{ + struct nvme_ctrl *ctrl = file->private_data; + void __user *argp = (void __user *)arg; + + switch (cmd) { + case NVME_IOCTL_ADMIN_CMD: + return nvme_user_cmd(ctrl, NULL, argp); + case NVME_IOCTL_IO_CMD: + return nvme_dev_user_cmd(ctrl, argp); + case NVME_IOCTL_RESET: + dev_warn(ctrl->dev, "resetting controller\n"); + return ctrl->ops->reset_ctrl(ctrl); + case NVME_IOCTL_SUBSYS_RESET: + return nvme_reset_subsystem(ctrl); + default: + return -ENOTTY; + } +} + +static const struct file_operations 
nvme_dev_fops = { + .owner = THIS_MODULE, + .open = nvme_dev_open, + .release = nvme_dev_release, + .unlocked_ioctl = nvme_dev_ioctl, + .compat_ioctl = nvme_dev_ioctl, +}; + +static ssize_t nvme_sysfs_reset(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); + int ret; + + ret = ctrl->ops->reset_ctrl(ctrl); + if (ret < 0) + return ret; + return count; +} +static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); + +static ssize_t uuid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_ns *ns = dev_to_disk(dev)->private_data; + return sprintf(buf, "%pU\n", ns->uuid); +} +static DEVICE_ATTR(uuid, S_IRUGO, uuid_show, NULL); + +static ssize_t eui_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_ns *ns = dev_to_disk(dev)->private_data; + return sprintf(buf, "%8phd\n", ns->eui); +} +static DEVICE_ATTR(eui, S_IRUGO, eui_show, NULL); + +static ssize_t nsid_show(struct device *dev, struct device_attribute *attr, + char *buf) +{ + struct nvme_ns *ns = dev_to_disk(dev)->private_data; + return sprintf(buf, "%d\n", ns->ns_id); +} +static DEVICE_ATTR(nsid, S_IRUGO, nsid_show, NULL); + +static struct attribute *nvme_ns_attrs[] = { + &dev_attr_uuid.attr, + &dev_attr_eui.attr, + &dev_attr_nsid.attr, + NULL, +}; + +static umode_t nvme_attrs_are_visible(struct kobject *kobj, + struct attribute *a, int n) +{ + struct device *dev = container_of(kobj, struct device, kobj); + struct nvme_ns *ns = dev_to_disk(dev)->private_data; + + if (a == &dev_attr_uuid.attr) { + if (!memchr_inv(ns->uuid, 0, sizeof(ns->uuid))) + return 0; + } + if (a == &dev_attr_eui.attr) { + if (!memchr_inv(ns->eui, 0, sizeof(ns->eui))) + return 0; + } + return a->mode; +} + +static const struct attribute_group nvme_ns_attr_group = { + .attrs = nvme_ns_attrs, + .is_visible = nvme_attrs_are_visible, +}; + +#define nvme_show_function(field) \ +static ssize_t field##_show(struct device *dev, \ + struct device_attribute *attr, char *buf) \ +{ \ + struct nvme_ctrl *ctrl = dev_get_drvdata(dev); \ + return sprintf(buf, "%.*s\n", (int)sizeof(ctrl->field), ctrl->field); \ +} \ +static DEVICE_ATTR(field, S_IRUGO, field##_show, NULL); + +nvme_show_function(model); +nvme_show_function(serial); +nvme_show_function(firmware_rev); + +static struct attribute *nvme_dev_attrs[] = { + &dev_attr_reset_controller.attr, + &dev_attr_model.attr, + &dev_attr_serial.attr, + &dev_attr_firmware_rev.attr, + NULL +}; + +static struct attribute_group nvme_dev_attrs_group = { + .attrs = nvme_dev_attrs, +}; + +static const struct attribute_group *nvme_dev_attr_groups[] = { + &nvme_dev_attrs_group, + NULL, +}; + +static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) +{ + struct nvme_ns *nsa = container_of(a, struct nvme_ns, list); + struct nvme_ns *nsb = container_of(b, struct nvme_ns, list); + + return nsa->ns_id - nsb->ns_id; +} + +static struct nvme_ns *nvme_find_ns(struct nvme_ctrl *ctrl, unsigned nsid) +{ + struct nvme_ns *ns; + + lockdep_assert_held(&ctrl->namespaces_mutex); + + list_for_each_entry(ns, &ctrl->namespaces, list) { + if (ns->ns_id == nsid) + return ns; + if (ns->ns_id > nsid) + break; + } + return NULL; +} + +static void nvme_alloc_ns(struct nvme_ctrl *ctrl, unsigned nsid) +{ + struct nvme_ns *ns; + struct gendisk *disk; + int node = dev_to_node(ctrl->dev); + + lockdep_assert_held(&ctrl->namespaces_mutex); + + ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); + if (!ns) 
+ return; + + ns->queue = blk_mq_init_queue(ctrl->tagset); + if (IS_ERR(ns->queue)) + goto out_free_ns; + queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); + queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); + ns->queue->queuedata = ns; + ns->ctrl = ctrl; + + disk = alloc_disk_node(0, node); + if (!disk) + goto out_free_queue; + + kref_init(&ns->kref); + ns->ns_id = nsid; + ns->disk = disk; + ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ + + blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); + if (ctrl->max_hw_sectors) { + blk_queue_max_hw_sectors(ns->queue, ctrl->max_hw_sectors); + blk_queue_max_segments(ns->queue, + (ctrl->max_hw_sectors / (ctrl->page_size >> 9)) + 1); + } + if (ctrl->stripe_size) + blk_queue_chunk_sectors(ns->queue, ctrl->stripe_size >> 9); + if (ctrl->vwc & NVME_CTRL_VWC_PRESENT) + blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA); + blk_queue_virt_boundary(ns->queue, ctrl->page_size - 1); + + disk->major = nvme_major; + disk->first_minor = 0; + disk->fops = &nvme_fops; + disk->private_data = ns; + disk->queue = ns->queue; + disk->driverfs_dev = ctrl->device; + disk->flags = GENHD_FL_EXT_DEVT; + sprintf(disk->disk_name, "nvme%dn%d", ctrl->instance, nsid); + + if (nvme_revalidate_disk(ns->disk)) + goto out_free_disk; + + list_add_tail(&ns->list, &ctrl->namespaces); + kref_get(&ctrl->kref); + if (ns->type == NVME_NS_LIGHTNVM) + return; + + add_disk(ns->disk); + if (sysfs_create_group(&disk_to_dev(ns->disk)->kobj, + &nvme_ns_attr_group)) + pr_warn("%s: failed to create sysfs group for identification\n", + ns->disk->disk_name); + return; + out_free_disk: + kfree(disk); + out_free_queue: + blk_cleanup_queue(ns->queue); + out_free_ns: + kfree(ns); +} + +static void nvme_ns_remove(struct nvme_ns *ns) +{ + bool kill = nvme_io_incapable(ns->ctrl) && + !blk_queue_dying(ns->queue); + + lockdep_assert_held(&ns->ctrl->namespaces_mutex); + + if (kill) { + blk_set_queue_dying(ns->queue); + + /* + * The controller was shutdown first if we got here through + * device removal. The shutdown may requeue outstanding + * requests. These need to be aborted immediately so + * del_gendisk doesn't block indefinitely for their completion. 
+ */ + blk_mq_abort_requeue_list(ns->queue); + } + if (ns->disk->flags & GENHD_FL_UP) { + if (blk_get_integrity(ns->disk)) + blk_integrity_unregister(ns->disk); + sysfs_remove_group(&disk_to_dev(ns->disk)->kobj, + &nvme_ns_attr_group); + del_gendisk(ns->disk); + } + if (kill || !blk_queue_dying(ns->queue)) { + blk_mq_abort_requeue_list(ns->queue); + blk_cleanup_queue(ns->queue); + } + list_del_init(&ns->list); + nvme_put_ns(ns); +} + +static void nvme_validate_ns(struct nvme_ctrl *ctrl, unsigned nsid) +{ + struct nvme_ns *ns; + + ns = nvme_find_ns(ctrl, nsid); + if (ns) { + if (revalidate_disk(ns->disk)) + nvme_ns_remove(ns); + } else + nvme_alloc_ns(ctrl, nsid); +} + +static int nvme_scan_ns_list(struct nvme_ctrl *ctrl, unsigned nn) +{ + struct nvme_ns *ns; + __le32 *ns_list; + unsigned i, j, nsid, prev = 0, num_lists = DIV_ROUND_UP(nn, 1024); + int ret = 0; + + ns_list = kzalloc(0x1000, GFP_KERNEL); + if (!ns_list) + return -ENOMEM; + + for (i = 0; i < num_lists; i++) { + ret = nvme_identify_ns_list(ctrl, prev, ns_list); + if (ret) + goto out; + + for (j = 0; j < min(nn, 1024U); j++) { + nsid = le32_to_cpu(ns_list[j]); + if (!nsid) + goto out; + + nvme_validate_ns(ctrl, nsid); + + while (++prev < nsid) { + ns = nvme_find_ns(ctrl, prev); + if (ns) + nvme_ns_remove(ns); + } + } + nn -= j; + } + out: + kfree(ns_list); + return ret; +} + +static void __nvme_scan_namespaces(struct nvme_ctrl *ctrl, unsigned nn) +{ + struct nvme_ns *ns, *next; + unsigned i; + + lockdep_assert_held(&ctrl->namespaces_mutex); + + for (i = 1; i <= nn; i++) + nvme_validate_ns(ctrl, i); + + list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) { + if (ns->ns_id > nn) + nvme_ns_remove(ns); + } +} + +void nvme_scan_namespaces(struct nvme_ctrl *ctrl) +{ + struct nvme_id_ctrl *id; + unsigned nn; + + if (nvme_identify_ctrl(ctrl, &id)) + return; + + mutex_lock(&ctrl->namespaces_mutex); + nn = le32_to_cpu(id->nn); + if (ctrl->vs >= NVME_VS(1, 1) && + !(ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS)) { + if (!nvme_scan_ns_list(ctrl, nn)) + goto done; + } + __nvme_scan_namespaces(ctrl, le32_to_cpup(&id->nn)); + done: + list_sort(NULL, &ctrl->namespaces, ns_cmp); + mutex_unlock(&ctrl->namespaces_mutex); + kfree(id); +} + +void nvme_remove_namespaces(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns, *next; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry_safe(ns, next, &ctrl->namespaces, list) + nvme_ns_remove(ns); + mutex_unlock(&ctrl->namespaces_mutex); +} + +static DEFINE_IDA(nvme_instance_ida); + +static int nvme_set_instance(struct nvme_ctrl *ctrl) +{ + int instance, error; + + do { + if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL)) + return -ENODEV; + + spin_lock(&dev_list_lock); + error = ida_get_new(&nvme_instance_ida, &instance); + spin_unlock(&dev_list_lock); + } while (error == -EAGAIN); + + if (error) + return -ENODEV; + + ctrl->instance = instance; + return 0; +} + +static void nvme_release_instance(struct nvme_ctrl *ctrl) +{ + spin_lock(&dev_list_lock); + ida_remove(&nvme_instance_ida, ctrl->instance); + spin_unlock(&dev_list_lock); +} + +void nvme_uninit_ctrl(struct nvme_ctrl *ctrl) + { + device_destroy(nvme_class, MKDEV(nvme_char_major, ctrl->instance)); + + spin_lock(&dev_list_lock); + list_del(&ctrl->node); + spin_unlock(&dev_list_lock); +} + +static void nvme_free_ctrl(struct kref *kref) +{ + struct nvme_ctrl *ctrl = container_of(kref, struct nvme_ctrl, kref); + + put_device(ctrl->device); + nvme_release_instance(ctrl); + + ctrl->ops->free_ctrl(ctrl); +} + +void nvme_put_ctrl(struct nvme_ctrl 
*ctrl) +{ + kref_put(&ctrl->kref, nvme_free_ctrl); +} + +/* + * Initialize a NVMe controller structures. This needs to be called during + * earliest initialization so that we have the initialized structured around + * during probing. + */ +int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, + const struct nvme_ctrl_ops *ops, unsigned long quirks) +{ + int ret; + + INIT_LIST_HEAD(&ctrl->namespaces); + mutex_init(&ctrl->namespaces_mutex); + kref_init(&ctrl->kref); + ctrl->dev = dev; + ctrl->ops = ops; + ctrl->quirks = quirks; + + ret = nvme_set_instance(ctrl); + if (ret) + goto out; + + ctrl->device = device_create_with_groups(nvme_class, ctrl->dev, + MKDEV(nvme_char_major, ctrl->instance), + dev, nvme_dev_attr_groups, + "nvme%d", ctrl->instance); + if (IS_ERR(ctrl->device)) { + ret = PTR_ERR(ctrl->device); + goto out_release_instance; + } + get_device(ctrl->device); + dev_set_drvdata(ctrl->device, ctrl); + + spin_lock(&dev_list_lock); + list_add_tail(&ctrl->node, &nvme_ctrl_list); + spin_unlock(&dev_list_lock); + + return 0; +out_release_instance: + nvme_release_instance(ctrl); +out: + return ret; +} + +void nvme_stop_queues(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry(ns, &ctrl->namespaces, list) { + spin_lock_irq(ns->queue->queue_lock); + queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue); + spin_unlock_irq(ns->queue->queue_lock); + + blk_mq_cancel_requeue_work(ns->queue); + blk_mq_stop_hw_queues(ns->queue); + } + mutex_unlock(&ctrl->namespaces_mutex); +} + +void nvme_start_queues(struct nvme_ctrl *ctrl) +{ + struct nvme_ns *ns; + + mutex_lock(&ctrl->namespaces_mutex); + list_for_each_entry(ns, &ctrl->namespaces, list) { + queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue); + blk_mq_start_stopped_hw_queues(ns->queue, true); + blk_mq_kick_requeue_list(ns->queue); + } + mutex_unlock(&ctrl->namespaces_mutex); +} + +int __init nvme_core_init(void) +{ + int result; + + result = register_blkdev(nvme_major, "nvme"); + if (result < 0) + return result; + else if (result > 0) + nvme_major = result; + + result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme", + &nvme_dev_fops); + if (result < 0) + goto unregister_blkdev; + else if (result > 0) + nvme_char_major = result; + + nvme_class = class_create(THIS_MODULE, "nvme"); + if (IS_ERR(nvme_class)) { + result = PTR_ERR(nvme_class); + goto unregister_chrdev; + } + + return 0; + + unregister_chrdev: + __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); + unregister_blkdev: + unregister_blkdev(nvme_major, "nvme"); + return result; +} + +void nvme_core_exit(void) +{ + unregister_blkdev(nvme_major, "nvme"); + class_destroy(nvme_class); + __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); +} diff --git a/drivers/nvme/host/lightnvm.c b/drivers/nvme/host/lightnvm.c index 1af54ea20e7b..5cd3725e2fa4 100644 --- a/drivers/nvme/host/lightnvm.c +++ b/drivers/nvme/host/lightnvm.c @@ -146,6 +146,16 @@ struct nvme_nvm_command { }; }; +struct nvme_nvm_lp_mlc { + __u16 num_pairs; + __u8 pairs[886]; +}; + +struct nvme_nvm_lp_tbl { + __u8 id[8]; + struct nvme_nvm_lp_mlc mlc; +}; + struct nvme_nvm_id_group { __u8 mtype; __u8 fmtype; @@ -169,7 +179,8 @@ struct nvme_nvm_id_group { __le32 mpos; __le32 mccap; __le16 cpar; - __u8 reserved[906]; + __u8 reserved[10]; + struct nvme_nvm_lp_tbl lptbl; } __packed; struct nvme_nvm_addr_format { @@ -266,6 +277,15 @@ static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) dst->mccap = 
le32_to_cpu(src->mccap); dst->cpar = le16_to_cpu(src->cpar); + + if (dst->fmtype == NVM_ID_FMTYPE_MLC) { + memcpy(dst->lptbl.id, src->lptbl.id, 8); + dst->lptbl.mlc.num_pairs = + le16_to_cpu(src->lptbl.mlc.num_pairs); + /* 4 bits per pair */ + memcpy(dst->lptbl.mlc.pairs, src->lptbl.mlc.pairs, + dst->lptbl.mlc.num_pairs >> 1); + } } return 0; @@ -274,7 +294,6 @@ static int init_grps(struct nvm_id *nvm_id, struct nvme_nvm_id *nvme_nvm_id) static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id) { struct nvme_ns *ns = nvmdev->q->queuedata; - struct nvme_dev *dev = ns->dev; struct nvme_nvm_id *nvme_nvm_id; struct nvme_nvm_command c = {}; int ret; @@ -287,7 +306,7 @@ static int nvme_nvm_identity(struct nvm_dev *nvmdev, struct nvm_id *nvm_id) if (!nvme_nvm_id) return -ENOMEM; - ret = nvme_submit_sync_cmd(dev->admin_q, (struct nvme_command *)&c, + ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c, nvme_nvm_id, sizeof(struct nvme_nvm_id)); if (ret) { ret = -EIO; @@ -312,9 +331,8 @@ static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb, nvm_l2p_update_fn *update_l2p, void *priv) { struct nvme_ns *ns = nvmdev->q->queuedata; - struct nvme_dev *dev = ns->dev; struct nvme_nvm_command c = {}; - u32 len = queue_max_hw_sectors(dev->admin_q) << 9; + u32 len = queue_max_hw_sectors(ns->ctrl->admin_q) << 9; u32 nlb_pr_rq = len / sizeof(u64); u64 cmd_slba = slba; void *entries; @@ -332,10 +350,10 @@ static int nvme_nvm_get_l2p_tbl(struct nvm_dev *nvmdev, u64 slba, u32 nlb, c.l2p.slba = cpu_to_le64(cmd_slba); c.l2p.nlb = cpu_to_le32(cmd_nlb); - ret = nvme_submit_sync_cmd(dev->admin_q, + ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c, entries, len); if (ret) { - dev_err(dev->dev, "L2P table transfer failed (%d)\n", + dev_err(ns->ctrl->dev, "L2P table transfer failed (%d)\n", ret); ret = -EIO; goto out; @@ -361,7 +379,7 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, { struct request_queue *q = nvmdev->q; struct nvme_ns *ns = q->queuedata; - struct nvme_dev *dev = ns->dev; + struct nvme_ctrl *ctrl = ns->ctrl; struct nvme_nvm_command c = {}; struct nvme_nvm_bb_tbl *bb_tbl; int tblsz = sizeof(struct nvme_nvm_bb_tbl) + nr_blocks; @@ -375,41 +393,36 @@ static int nvme_nvm_get_bb_tbl(struct nvm_dev *nvmdev, struct ppa_addr ppa, if (!bb_tbl) return -ENOMEM; - ret = nvme_submit_sync_cmd(dev->admin_q, (struct nvme_command *)&c, + ret = nvme_submit_sync_cmd(ctrl->admin_q, (struct nvme_command *)&c, bb_tbl, tblsz); if (ret) { - dev_err(dev->dev, "get bad block table failed (%d)\n", ret); + dev_err(ctrl->dev, "get bad block table failed (%d)\n", ret); ret = -EIO; goto out; } if (bb_tbl->tblid[0] != 'B' || bb_tbl->tblid[1] != 'B' || bb_tbl->tblid[2] != 'L' || bb_tbl->tblid[3] != 'T') { - dev_err(dev->dev, "bbt format mismatch\n"); + dev_err(ctrl->dev, "bbt format mismatch\n"); ret = -EINVAL; goto out; } if (le16_to_cpu(bb_tbl->verid) != 1) { ret = -EINVAL; - dev_err(dev->dev, "bbt version not supported\n"); + dev_err(ctrl->dev, "bbt version not supported\n"); goto out; } if (le32_to_cpu(bb_tbl->tblks) != nr_blocks) { ret = -EINVAL; - dev_err(dev->dev, "bbt unsuspected blocks returned (%u!=%u)", + dev_err(ctrl->dev, "bbt unsuspected blocks returned (%u!=%u)", le32_to_cpu(bb_tbl->tblks), nr_blocks); goto out; } ppa = dev_to_generic_addr(nvmdev, ppa); ret = update_bbtbl(ppa, nr_blocks, bb_tbl->blk, priv); - if (ret) { - ret = -EINTR; - goto out; - } - out: kfree(bb_tbl); return ret; @@ -419,7 +432,6 @@ static 
int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct nvm_rq *rqd, int type) { struct nvme_ns *ns = nvmdev->q->queuedata; - struct nvme_dev *dev = ns->dev; struct nvme_nvm_command c = {}; int ret = 0; @@ -429,10 +441,10 @@ static int nvme_nvm_set_bb_tbl(struct nvm_dev *nvmdev, struct nvm_rq *rqd, c.set_bb.nlb = cpu_to_le16(rqd->nr_pages - 1); c.set_bb.value = type; - ret = nvme_submit_sync_cmd(dev->admin_q, (struct nvme_command *)&c, + ret = nvme_submit_sync_cmd(ns->ctrl->admin_q, (struct nvme_command *)&c, NULL, 0); if (ret) - dev_err(dev->dev, "set bad block table failed (%d)\n", ret); + dev_err(ns->ctrl->dev, "set bad block table failed (%d)\n", ret); return ret; } @@ -453,11 +465,8 @@ static inline void nvme_nvm_rqtocmd(struct request *rq, struct nvm_rq *rqd, static void nvme_nvm_end_io(struct request *rq, int error) { struct nvm_rq *rqd = rq->end_io_data; - struct nvm_dev *dev = rqd->dev; - if (dev->mt && dev->mt->end_io(rqd, error)) - pr_err("nvme: err status: %x result: %lx\n", - rq->errors, (unsigned long)rq->special); + nvm_end_io(rqd, error); kfree(rq->cmd); blk_mq_free_request(rq); @@ -520,9 +529,8 @@ static int nvme_nvm_erase_block(struct nvm_dev *dev, struct nvm_rq *rqd) static void *nvme_nvm_create_dma_pool(struct nvm_dev *nvmdev, char *name) { struct nvme_ns *ns = nvmdev->q->queuedata; - struct nvme_dev *dev = ns->dev; - return dma_pool_create(name, dev->dev, PAGE_SIZE, PAGE_SIZE, 0); + return dma_pool_create(name, ns->ctrl->dev, PAGE_SIZE, PAGE_SIZE, 0); } static void nvme_nvm_destroy_dma_pool(void *pool) @@ -580,8 +588,9 @@ void nvme_nvm_unregister(struct request_queue *q, char *disk_name) int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *id) { - struct nvme_dev *dev = ns->dev; - struct pci_dev *pdev = to_pci_dev(dev->dev); + struct nvme_ctrl *ctrl = ns->ctrl; + /* XXX: this is poking into PCI structures from generic code! */ + struct pci_dev *pdev = to_pci_dev(ctrl->dev); /* QEMU NVMe simulator - PCI ID + Vendor specific bit */ if (pdev->vendor == PCI_VENDOR_ID_CNEX && diff --git a/drivers/nvme/host/nvme.h b/drivers/nvme/host/nvme.h index 044253dca30a..4fb5bb737868 100644 --- a/drivers/nvme/host/nvme.h +++ b/drivers/nvme/host/nvme.h @@ -19,58 +19,77 @@ #include <linux/kref.h> #include <linux/blk-mq.h> +enum { + /* + * Driver internal status code for commands that were cancelled due + * to timeouts or controller shutdown. The value is negative so + * that it a) doesn't overlap with the unsigned hardware error codes, + * and b) can easily be tested for. + */ + NVME_SC_CANCELLED = -EINTR, +}; + extern unsigned char nvme_io_timeout; #define NVME_IO_TIMEOUT (nvme_io_timeout * HZ) +extern unsigned char admin_timeout; +#define ADMIN_TIMEOUT (admin_timeout * HZ) + +extern unsigned char shutdown_timeout; +#define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ) + enum { NVME_NS_LBA = 0, NVME_NS_LIGHTNVM = 1, }; /* - * Represents an NVM Express device. Each nvme_dev is a PCI function. + * List of workarounds for devices that required behavior not specified in + * the standard. */ -struct nvme_dev { - struct list_head node; - struct nvme_queue **queues; +enum nvme_quirks { + /* + * Prefers I/O aligned to a stripe size specified in a vendor + * specific Identify field. + */ + NVME_QUIRK_STRIPE_SIZE = (1 << 0), + + /* + * The controller doesn't handle Identify values other than 0 or 1 + * correctly. 
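Since ctrl->quirks is a plain bitmask over enum nvme_quirks, call sites test for a workaround with a simple mask. A minimal sketch of such a check (the helper name and the fall-back-to-CNS-0 policy here are illustrative, not part of this patch):

/* Illustrative only: restrict Identify CNS values for controllers with
 * NVME_QUIRK_IDENTIFY_CNS, which only handle CNS 0 and 1 correctly. */
static u32 nvme_clamp_cns(struct nvme_ctrl *ctrl, u32 cns)
{
	if ((ctrl->quirks & NVME_QUIRK_IDENTIFY_CNS) && cns > 1)
		return 0;
	return cns;
}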
+ */ + NVME_QUIRK_IDENTIFY_CNS = (1 << 1), +}; + +struct nvme_ctrl { + const struct nvme_ctrl_ops *ops; struct request_queue *admin_q; - struct blk_mq_tag_set tagset; - struct blk_mq_tag_set admin_tagset; - u32 __iomem *dbs; struct device *dev; - struct dma_pool *prp_page_pool; - struct dma_pool *prp_small_pool; + struct kref kref; int instance; - unsigned queue_count; - unsigned online_queues; - unsigned max_qid; - int q_depth; - u32 db_stride; - u32 ctrl_config; - struct msix_entry *entry; - struct nvme_bar __iomem *bar; + struct blk_mq_tag_set *tagset; struct list_head namespaces; - struct kref kref; - struct device *device; - struct work_struct reset_work; - struct work_struct probe_work; - struct work_struct scan_work; + struct mutex namespaces_mutex; + struct device *device; /* char device */ + struct list_head node; + char name[12]; char serial[20]; char model[40]; char firmware_rev[8]; - bool subsystem; + + u32 ctrl_config; + + u32 page_size; u32 max_hw_sectors; u32 stripe_size; - u32 page_size; - void __iomem *cmb; - dma_addr_t cmb_dma_addr; - u64 cmb_size; - u32 cmbsz; u16 oncs; - u16 abort_limit; + atomic_t abort_limit; u8 event_limit; u8 vwc; + u32 vs; + bool subsystem; + unsigned long quirks; }; /* @@ -79,11 +98,14 @@ struct nvme_dev { struct nvme_ns { struct list_head list; - struct nvme_dev *dev; + struct nvme_ctrl *ctrl; struct request_queue *queue; struct gendisk *disk; struct kref kref; + u8 eui[8]; + u8 uuid[16]; + unsigned ns_id; int lba_shift; u16 ms; @@ -94,41 +116,156 @@ struct nvme_ns { u32 mode_select_block_len; }; -/* - * The nvme_iod describes the data in an I/O, including the list of PRP - * entries. You can't see it in this data structure because C doesn't let - * me express that. Use nvme_alloc_iod to ensure there's enough space - * allocated to store the PRP list. - */ -struct nvme_iod { - unsigned long private; /* For the use of the submitter of the I/O */ - int npages; /* In the PRP list. 
0 means small pool in use */ - int offset; /* Of PRP list */ - int nents; /* Used in scatterlist */ - int length; /* Of data, in bytes */ - dma_addr_t first_dma; - struct scatterlist meta_sg[1]; /* metadata requires single contiguous buffer */ - struct scatterlist sg[0]; +struct nvme_ctrl_ops { + int (*reg_read32)(struct nvme_ctrl *ctrl, u32 off, u32 *val); + int (*reg_write32)(struct nvme_ctrl *ctrl, u32 off, u32 val); + int (*reg_read64)(struct nvme_ctrl *ctrl, u32 off, u64 *val); + bool (*io_incapable)(struct nvme_ctrl *ctrl); + int (*reset_ctrl)(struct nvme_ctrl *ctrl); + void (*free_ctrl)(struct nvme_ctrl *ctrl); }; +static inline bool nvme_ctrl_ready(struct nvme_ctrl *ctrl) +{ + u32 val = 0; + + if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &val)) + return false; + return val & NVME_CSTS_RDY; +} + +static inline bool nvme_io_incapable(struct nvme_ctrl *ctrl) +{ + u32 val = 0; + + if (ctrl->ops->io_incapable(ctrl)) + return true; + if (ctrl->ops->reg_read32(ctrl, NVME_REG_CSTS, &val)) + return false; + return val & NVME_CSTS_CFS; +} + +static inline int nvme_reset_subsystem(struct nvme_ctrl *ctrl) +{ + if (!ctrl->subsystem) + return -ENOTTY; + return ctrl->ops->reg_write32(ctrl, NVME_REG_NSSR, 0x4E564D65); +} + static inline u64 nvme_block_nr(struct nvme_ns *ns, sector_t sector) { return (sector >> (ns->lba_shift - 9)); } +static inline void nvme_setup_flush(struct nvme_ns *ns, + struct nvme_command *cmnd) +{ + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->common.opcode = nvme_cmd_flush; + cmnd->common.nsid = cpu_to_le32(ns->ns_id); +} + +static inline void nvme_setup_rw(struct nvme_ns *ns, struct request *req, + struct nvme_command *cmnd) +{ + u16 control = 0; + u32 dsmgmt = 0; + + if (req->cmd_flags & REQ_FUA) + control |= NVME_RW_FUA; + if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) + control |= NVME_RW_LR; + + if (req->cmd_flags & REQ_RAHEAD) + dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; + + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->rw.opcode = (rq_data_dir(req) ? 
nvme_cmd_write : nvme_cmd_read); + cmnd->rw.command_id = req->tag; + cmnd->rw.nsid = cpu_to_le32(ns->ns_id); + cmnd->rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + cmnd->rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); + + if (ns->ms) { + switch (ns->pi_type) { + case NVME_NS_DPS_PI_TYPE3: + control |= NVME_RW_PRINFO_PRCHK_GUARD; + break; + case NVME_NS_DPS_PI_TYPE1: + case NVME_NS_DPS_PI_TYPE2: + control |= NVME_RW_PRINFO_PRCHK_GUARD | + NVME_RW_PRINFO_PRCHK_REF; + cmnd->rw.reftag = cpu_to_le32( + nvme_block_nr(ns, blk_rq_pos(req))); + break; + } + if (!blk_integrity_rq(req)) + control |= NVME_RW_PRINFO_PRACT; + } + + cmnd->rw.control = cpu_to_le16(control); + cmnd->rw.dsmgmt = cpu_to_le32(dsmgmt); +} + + +static inline int nvme_error_status(u16 status) +{ + switch (status & 0x7ff) { + case NVME_SC_SUCCESS: + return 0; + case NVME_SC_CAP_EXCEEDED: + return -ENOSPC; + default: + return -EIO; + } +} + +static inline bool nvme_req_needs_retry(struct request *req, u16 status) +{ + return !(status & NVME_SC_DNR || blk_noretry_request(req)) && + (jiffies - req->start_time) < req->timeout; +} + +int nvme_disable_ctrl(struct nvme_ctrl *ctrl, u64 cap); +int nvme_enable_ctrl(struct nvme_ctrl *ctrl, u64 cap); +int nvme_shutdown_ctrl(struct nvme_ctrl *ctrl); +int nvme_init_ctrl(struct nvme_ctrl *ctrl, struct device *dev, + const struct nvme_ctrl_ops *ops, unsigned long quirks); +void nvme_uninit_ctrl(struct nvme_ctrl *ctrl); +void nvme_put_ctrl(struct nvme_ctrl *ctrl); +int nvme_init_identify(struct nvme_ctrl *ctrl); + +void nvme_scan_namespaces(struct nvme_ctrl *ctrl); +void nvme_remove_namespaces(struct nvme_ctrl *ctrl); + +void nvme_stop_queues(struct nvme_ctrl *ctrl); +void nvme_start_queues(struct nvme_ctrl *ctrl); + +struct request *nvme_alloc_request(struct request_queue *q, + struct nvme_command *cmd, unsigned int flags); +void nvme_requeue_req(struct request *req); int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, void *buf, unsigned bufflen); int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, - void *buffer, void __user *ubuffer, unsigned bufflen, + void *buffer, unsigned bufflen, u32 *result, unsigned timeout); +int nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, + void __user *ubuffer, unsigned bufflen, u32 *result, + unsigned timeout); +int __nvme_submit_user_cmd(struct request_queue *q, struct nvme_command *cmd, + void __user *ubuffer, unsigned bufflen, + void __user *meta_buffer, unsigned meta_len, u32 meta_seed, u32 *result, unsigned timeout); -int nvme_identify_ctrl(struct nvme_dev *dev, struct nvme_id_ctrl **id); -int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid, +int nvme_identify_ctrl(struct nvme_ctrl *dev, struct nvme_id_ctrl **id); +int nvme_identify_ns(struct nvme_ctrl *dev, unsigned nsid, struct nvme_id_ns **id); -int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log); -int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, +int nvme_get_log_page(struct nvme_ctrl *dev, struct nvme_smart_log **log); +int nvme_get_features(struct nvme_ctrl *dev, unsigned fid, unsigned nsid, dma_addr_t dma_addr, u32 *result); -int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, +int nvme_set_features(struct nvme_ctrl *dev, unsigned fid, unsigned dword11, dma_addr_t dma_addr, u32 *result); +int nvme_set_queue_count(struct nvme_ctrl *ctrl, int *count); + +extern spinlock_t dev_list_lock; struct sg_io_hdr; @@ -154,4 +291,7 @@ 
static inline int nvme_nvm_ns_supported(struct nvme_ns *ns, struct nvme_id_ns *i } #endif /* CONFIG_NVM */ +int __init nvme_core_init(void); +void nvme_core_exit(void); + #endif /* _NVME_H */ diff --git a/drivers/nvme/host/pci.c b/drivers/nvme/host/pci.c index f5c0e2613c7c..72ef8322d32a 100644 --- a/drivers/nvme/host/pci.c +++ b/drivers/nvme/host/pci.c @@ -12,6 +12,7 @@ * more details. */ +#include <linux/aer.h> #include <linux/bitops.h> #include <linux/blkdev.h> #include <linux/blk-mq.h> @@ -28,10 +29,10 @@ #include <linux/kdev_t.h> #include <linux/kthread.h> #include <linux/kernel.h> -#include <linux/list_sort.h> #include <linux/mm.h> #include <linux/module.h> #include <linux/moduleparam.h> +#include <linux/mutex.h> #include <linux/pci.h> #include <linux/poison.h> #include <linux/ptrace.h> @@ -39,23 +40,24 @@ #include <linux/slab.h> #include <linux/t10-pi.h> #include <linux/types.h> -#include <linux/pr.h> -#include <scsi/sg.h> #include <linux/io-64-nonatomic-lo-hi.h> #include <asm/unaligned.h> -#include <uapi/linux/nvme_ioctl.h> #include "nvme.h" -#define NVME_MINORS (1U << MINORBITS) #define NVME_Q_DEPTH 1024 #define NVME_AQ_DEPTH 256 #define SQ_SIZE(depth) (depth * sizeof(struct nvme_command)) #define CQ_SIZE(depth) (depth * sizeof(struct nvme_completion)) -#define ADMIN_TIMEOUT (admin_timeout * HZ) -#define SHUTDOWN_TIMEOUT (shutdown_timeout * HZ) + +/* + * We handle AEN commands ourselves and don't even let the + * block layer know about them. + */ +#define NVME_NR_AEN_COMMANDS 1 +#define NVME_AQ_BLKMQ_DEPTH (NVME_AQ_DEPTH - NVME_NR_AEN_COMMANDS) -static unsigned char admin_timeout = 60; +unsigned char admin_timeout = 60; module_param(admin_timeout, byte, 0644); MODULE_PARM_DESC(admin_timeout, "timeout in seconds for admin commands"); @@ -63,16 +65,10 @@ unsigned char nvme_io_timeout = 30; module_param_named(io_timeout, nvme_io_timeout, byte, 0644); MODULE_PARM_DESC(io_timeout, "timeout in seconds for I/O"); -static unsigned char shutdown_timeout = 5; +unsigned char shutdown_timeout = 5; module_param(shutdown_timeout, byte, 0644); MODULE_PARM_DESC(shutdown_timeout, "timeout in seconds for controller shutdown"); -static int nvme_major; -module_param(nvme_major, int, 0); - -static int nvme_char_major; -module_param(nvme_char_major, int, 0); - static int use_threaded_interrupts; module_param(use_threaded_interrupts, int, 0); @@ -80,28 +76,60 @@ static bool use_cmb_sqes = true; module_param(use_cmb_sqes, bool, 0644); MODULE_PARM_DESC(use_cmb_sqes, "use controller's memory buffer for I/O SQes"); -static DEFINE_SPINLOCK(dev_list_lock); static LIST_HEAD(dev_list); static struct task_struct *nvme_thread; static struct workqueue_struct *nvme_workq; static wait_queue_head_t nvme_kthread_wait; -static struct class *nvme_class; +struct nvme_dev; +struct nvme_queue; -static int __nvme_reset(struct nvme_dev *dev); static int nvme_reset(struct nvme_dev *dev); static void nvme_process_cq(struct nvme_queue *nvmeq); -static void nvme_dead_ctrl(struct nvme_dev *dev); +static void nvme_remove_dead_ctrl(struct nvme_dev *dev); +static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown); -struct async_cmd_info { - struct kthread_work work; - struct kthread_worker *worker; - struct request *req; - u32 result; - int status; - void *ctx; +/* + * Represents an NVM Express device. Each nvme_dev is a PCI function. 
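NVME_AQ_BLKMQ_DEPTH splits the admin tag space: blk-mq only ever allocates tags below it, so on the admin queue any completed command id at or above that value must be the driver's own AEN. A sketch of the classification (the helper name is illustrative; the same test appears open-coded in __nvme_process_cq() later in this patch):

/* Sketch: with the default NVME_AQ_DEPTH of 256, blk-mq admin tags always
 * stay below NVME_AQ_BLKMQ_DEPTH (255); command ids from 255 upward are
 * driver-submitted AENs. */
static bool nvme_cqe_is_aen(struct nvme_queue *nvmeq,
		const struct nvme_completion *cqe)
{
	return nvmeq->qid == 0 && cqe->command_id >= NVME_AQ_BLKMQ_DEPTH;
}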
+ */ +struct nvme_dev { + struct list_head node; + struct nvme_queue **queues; + struct blk_mq_tag_set tagset; + struct blk_mq_tag_set admin_tagset; + u32 __iomem *dbs; + struct device *dev; + struct dma_pool *prp_page_pool; + struct dma_pool *prp_small_pool; + unsigned queue_count; + unsigned online_queues; + unsigned max_qid; + int q_depth; + u32 db_stride; + struct msix_entry *entry; + void __iomem *bar; + struct work_struct reset_work; + struct work_struct scan_work; + struct work_struct remove_work; + struct mutex shutdown_lock; + bool subsystem; + void __iomem *cmb; + dma_addr_t cmb_dma_addr; + u64 cmb_size; + u32 cmbsz; + unsigned long flags; + +#define NVME_CTRL_RESETTING 0 + + struct nvme_ctrl ctrl; + struct completion ioq_wait; }; +static inline struct nvme_dev *to_nvme_dev(struct nvme_ctrl *ctrl) +{ + return container_of(ctrl, struct nvme_dev, ctrl); +} + /* * An NVM Express queue. Each device has at least two (one for admin * commands and one for I/O commands). @@ -126,7 +154,24 @@ struct nvme_queue { u16 qid; u8 cq_phase; u8 cqe_seen; - struct async_cmd_info cmdinfo; +}; + +/* + * The nvme_iod describes the data in an I/O, including the list of PRP + * entries. You can't see it in this data structure because C doesn't let + * me express that. Use nvme_init_iod to ensure there's enough space + * allocated to store the PRP list. + */ +struct nvme_iod { + struct nvme_queue *nvmeq; + int aborted; + int npages; /* In the PRP list. 0 means small pool in use */ + int nents; /* Used in scatterlist */ + int length; /* Of data, in bytes */ + dma_addr_t first_dma; + struct scatterlist meta_sg; /* metadata requires single contiguous buffer */ + struct scatterlist *sg; + struct scatterlist inline_sg[0]; }; /* @@ -148,23 +193,11 @@ static inline void _nvme_check_size(void) BUILD_BUG_ON(sizeof(struct nvme_smart_log) != 512); } -typedef void (*nvme_completion_fn)(struct nvme_queue *, void *, - struct nvme_completion *); - -struct nvme_cmd_info { - nvme_completion_fn fn; - void *ctx; - int aborted; - struct nvme_queue *nvmeq; - struct nvme_iod iod[0]; -}; - /* * Max size of iod being embedded in the request payload */ #define NVME_INT_PAGES 2 -#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->page_size) -#define NVME_INT_MASK 0x01 +#define NVME_INT_BYTES(dev) (NVME_INT_PAGES * (dev)->ctrl.page_size) /* * Will slightly overestimate the number of pages needed. 
This is OK @@ -173,19 +206,22 @@ struct nvme_cmd_info { */ static int nvme_npages(unsigned size, struct nvme_dev *dev) { - unsigned nprps = DIV_ROUND_UP(size + dev->page_size, dev->page_size); + unsigned nprps = DIV_ROUND_UP(size + dev->ctrl.page_size, + dev->ctrl.page_size); return DIV_ROUND_UP(8 * nprps, PAGE_SIZE - 8); } -static unsigned int nvme_cmd_size(struct nvme_dev *dev) +static unsigned int nvme_iod_alloc_size(struct nvme_dev *dev, + unsigned int size, unsigned int nseg) { - unsigned int ret = sizeof(struct nvme_cmd_info); - - ret += sizeof(struct nvme_iod); - ret += sizeof(__le64 *) * nvme_npages(NVME_INT_BYTES(dev), dev); - ret += sizeof(struct scatterlist) * NVME_INT_PAGES; + return sizeof(__le64 *) * nvme_npages(size, dev) + + sizeof(struct scatterlist) * nseg; +} - return ret; +static unsigned int nvme_cmd_size(struct nvme_dev *dev) +{ + return sizeof(struct nvme_iod) + + nvme_iod_alloc_size(dev, NVME_INT_BYTES(dev), NVME_INT_PAGES); } static int nvme_admin_init_hctx(struct blk_mq_hw_ctx *hctx, void *data, @@ -215,11 +251,11 @@ static int nvme_admin_init_request(void *data, struct request *req, unsigned int numa_node) { struct nvme_dev *dev = data; - struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct nvme_queue *nvmeq = dev->queues[0]; BUG_ON(!nvmeq); - cmd->nvmeq = nvmeq; + iod->nvmeq = nvmeq; return 0; } @@ -242,148 +278,36 @@ static int nvme_init_request(void *data, struct request *req, unsigned int numa_node) { struct nvme_dev *dev = data; - struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct nvme_queue *nvmeq = dev->queues[hctx_idx + 1]; BUG_ON(!nvmeq); - cmd->nvmeq = nvmeq; + iod->nvmeq = nvmeq; return 0; } -static void nvme_set_info(struct nvme_cmd_info *cmd, void *ctx, - nvme_completion_fn handler) -{ - cmd->fn = handler; - cmd->ctx = ctx; - cmd->aborted = 0; - blk_mq_start_request(blk_mq_rq_from_pdu(cmd)); -} - -static void *iod_get_private(struct nvme_iod *iod) -{ - return (void *) (iod->private & ~0x1UL); -} - -/* - * If bit 0 is set, the iod is embedded in the request payload. 
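The request PDU is sized once at tagset setup so that small I/Os never need a per-command allocation; a worked example of the arithmetic, assuming 4 KiB pages on both the host and the controller:

/* Worked example (PAGE_SIZE == ctrl.page_size == 4096):
 *   NVME_INT_BYTES(dev) = 2 * 4096 = 8192
 *   nvme_npages(8192)   = DIV_ROUND_UP(8 * 3, 4096 - 8) = 1
 *   nvme_cmd_size(dev)  = sizeof(struct nvme_iod)
 *                       + 1 * sizeof(__le64 *)            (PRP list pages)
 *                       + 2 * sizeof(struct scatterlist)  (inline SG entries)
 * so any request with at most 2 segments and 8 KiB of data is served
 * entirely from the preallocated PDU, and nvme_init_iod() only falls
 * back to kmalloc() beyond that.
 */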
- */ -static bool iod_should_kfree(struct nvme_iod *iod) -{ - return (iod->private & NVME_INT_MASK) == 0; -} - -/* Special values must be less than 0x1000 */ -#define CMD_CTX_BASE ((void *)POISON_POINTER_DELTA) -#define CMD_CTX_CANCELLED (0x30C + CMD_CTX_BASE) -#define CMD_CTX_COMPLETED (0x310 + CMD_CTX_BASE) -#define CMD_CTX_INVALID (0x314 + CMD_CTX_BASE) - -static void special_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) -{ - if (ctx == CMD_CTX_CANCELLED) - return; - if (ctx == CMD_CTX_COMPLETED) { - dev_warn(nvmeq->q_dmadev, - "completed id %d twice on queue %d\n", - cqe->command_id, le16_to_cpup(&cqe->sq_id)); - return; - } - if (ctx == CMD_CTX_INVALID) { - dev_warn(nvmeq->q_dmadev, - "invalid id %d completed on queue %d\n", - cqe->command_id, le16_to_cpup(&cqe->sq_id)); - return; - } - dev_warn(nvmeq->q_dmadev, "Unknown special completion %p\n", ctx); -} - -static void *cancel_cmd_info(struct nvme_cmd_info *cmd, nvme_completion_fn *fn) -{ - void *ctx; - - if (fn) - *fn = cmd->fn; - ctx = cmd->ctx; - cmd->fn = special_completion; - cmd->ctx = CMD_CTX_CANCELLED; - return ctx; -} - -static void async_req_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) +static void nvme_complete_async_event(struct nvme_dev *dev, + struct nvme_completion *cqe) { - u32 result = le32_to_cpup(&cqe->result); - u16 status = le16_to_cpup(&cqe->status) >> 1; + u16 status = le16_to_cpu(cqe->status) >> 1; + u32 result = le32_to_cpu(cqe->result); if (status == NVME_SC_SUCCESS || status == NVME_SC_ABORT_REQ) - ++nvmeq->dev->event_limit; + ++dev->ctrl.event_limit; if (status != NVME_SC_SUCCESS) return; switch (result & 0xff07) { case NVME_AER_NOTICE_NS_CHANGED: - dev_info(nvmeq->q_dmadev, "rescanning\n"); - schedule_work(&nvmeq->dev->scan_work); + dev_info(dev->dev, "rescanning\n"); + queue_work(nvme_workq, &dev->scan_work); default: - dev_warn(nvmeq->q_dmadev, "async event result %08x\n", result); + dev_warn(dev->dev, "async event result %08x\n", result); } } -static void abort_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) -{ - struct request *req = ctx; - - u16 status = le16_to_cpup(&cqe->status) >> 1; - u32 result = le32_to_cpup(&cqe->result); - - blk_mq_free_request(req); - - dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result); - ++nvmeq->dev->abort_limit; -} - -static void async_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) -{ - struct async_cmd_info *cmdinfo = ctx; - cmdinfo->result = le32_to_cpup(&cqe->result); - cmdinfo->status = le16_to_cpup(&cqe->status) >> 1; - queue_kthread_work(cmdinfo->worker, &cmdinfo->work); - blk_mq_free_request(cmdinfo->req); -} - -static inline struct nvme_cmd_info *get_cmd_from_tag(struct nvme_queue *nvmeq, - unsigned int tag) -{ - struct request *req = blk_mq_tag_to_rq(*nvmeq->tags, tag); - - return blk_mq_rq_to_pdu(req); -} - -/* - * Called with local interrupts disabled and the q_lock held. May not sleep. 
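The 0xff07 mask in nvme_complete_async_event() keeps the two AER completion fields acted on here: the event type in bits 2:0 and the event information in bits 15:8 of the result dword (per the NVMe 1.2 Asynchronous Event Request completion layout). A hypothetical decoder making the fields explicit:

/* Hypothetical helper: split an AER result dword into its fields. */
static void nvme_aer_decode(u32 result, u8 *event_type, u8 *event_info)
{
	*event_type = result & 0x7;		/* error, SMART, notice, ... */
	*event_info = (result >> 8) & 0xff;	/* e.g. namespace changed */
}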
- */ -static void *nvme_finish_cmd(struct nvme_queue *nvmeq, int tag, - nvme_completion_fn *fn) -{ - struct nvme_cmd_info *cmd = get_cmd_from_tag(nvmeq, tag); - void *ctx; - if (tag >= nvmeq->q_depth) { - *fn = special_completion; - return CMD_CTX_INVALID; - } - if (fn) - *fn = cmd->fn; - ctx = cmd->ctx; - cmd->fn = special_completion; - cmd->ctx = CMD_CTX_COMPLETED; - return ctx; -} - /** - * nvme_submit_cmd() - Copy a command into a queue and ring the doorbell + * __nvme_submit_cmd() - Copy a command into a queue and ring the doorbell * @nvmeq: The queue to use * @cmd: The command to send * @@ -405,69 +329,44 @@ static void __nvme_submit_cmd(struct nvme_queue *nvmeq, nvmeq->sq_tail = tail; } -static void nvme_submit_cmd(struct nvme_queue *nvmeq, struct nvme_command *cmd) -{ - unsigned long flags; - spin_lock_irqsave(&nvmeq->q_lock, flags); - __nvme_submit_cmd(nvmeq, cmd); - spin_unlock_irqrestore(&nvmeq->q_lock, flags); -} - -static __le64 **iod_list(struct nvme_iod *iod) +static __le64 **iod_list(struct request *req) { - return ((void *)iod) + iod->offset; + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + return (__le64 **)(iod->sg + req->nr_phys_segments); } -static inline void iod_init(struct nvme_iod *iod, unsigned nbytes, - unsigned nseg, unsigned long private) +static int nvme_init_iod(struct request *rq, struct nvme_dev *dev) { - iod->private = private; - iod->offset = offsetof(struct nvme_iod, sg[nseg]); - iod->npages = -1; - iod->length = nbytes; - iod->nents = 0; -} - -static struct nvme_iod * -__nvme_alloc_iod(unsigned nseg, unsigned bytes, struct nvme_dev *dev, - unsigned long priv, gfp_t gfp) -{ - struct nvme_iod *iod = kmalloc(sizeof(struct nvme_iod) + - sizeof(__le64 *) * nvme_npages(bytes, dev) + - sizeof(struct scatterlist) * nseg, gfp); - - if (iod) - iod_init(iod, bytes, nseg, priv); - - return iod; -} - -static struct nvme_iod *nvme_alloc_iod(struct request *rq, struct nvme_dev *dev, - gfp_t gfp) -{ - unsigned size = !(rq->cmd_flags & REQ_DISCARD) ? 
blk_rq_bytes(rq) : - sizeof(struct nvme_dsm_range); - struct nvme_iod *iod; + struct nvme_iod *iod = blk_mq_rq_to_pdu(rq); + int nseg = rq->nr_phys_segments; + unsigned size; - if (rq->nr_phys_segments <= NVME_INT_PAGES && - size <= NVME_INT_BYTES(dev)) { - struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(rq); + if (rq->cmd_flags & REQ_DISCARD) + size = sizeof(struct nvme_dsm_range); + else + size = blk_rq_bytes(rq); - iod = cmd->iod; - iod_init(iod, size, rq->nr_phys_segments, - (unsigned long) rq | NVME_INT_MASK); - return iod; + if (nseg > NVME_INT_PAGES || size > NVME_INT_BYTES(dev)) { + iod->sg = kmalloc(nvme_iod_alloc_size(dev, size, nseg), GFP_ATOMIC); + if (!iod->sg) + return BLK_MQ_RQ_QUEUE_BUSY; + } else { + iod->sg = iod->inline_sg; } - return __nvme_alloc_iod(rq->nr_phys_segments, size, dev, - (unsigned long) rq, gfp); + iod->aborted = 0; + iod->npages = -1; + iod->nents = 0; + iod->length = size; + return 0; } -static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) +static void nvme_free_iod(struct nvme_dev *dev, struct request *req) { - const int last_prp = dev->page_size / 8 - 1; + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + const int last_prp = dev->ctrl.page_size / 8 - 1; int i; - __le64 **list = iod_list(iod); + __le64 **list = iod_list(req); dma_addr_t prp_dma = iod->first_dma; if (iod->npages == 0) @@ -479,20 +378,8 @@ static void nvme_free_iod(struct nvme_dev *dev, struct nvme_iod *iod) prp_dma = next_prp_dma; } - if (iod_should_kfree(iod)) - kfree(iod); -} - -static int nvme_error_status(u16 status) -{ - switch (status & 0x7ff) { - case NVME_SC_SUCCESS: - return 0; - case NVME_SC_CAP_EXCEEDED: - return -ENOSPC; - default: - return -EIO; - } + if (iod->sg != iod->inline_sg) + kfree(iod->sg); } #ifdef CONFIG_BLK_DEV_INTEGRITY @@ -549,27 +436,6 @@ static void nvme_dif_remap(struct request *req, } kunmap_atomic(pmap); } - -static void nvme_init_integrity(struct nvme_ns *ns) -{ - struct blk_integrity integrity; - - switch (ns->pi_type) { - case NVME_NS_DPS_PI_TYPE3: - integrity.profile = &t10_pi_type3_crc; - break; - case NVME_NS_DPS_PI_TYPE1: - case NVME_NS_DPS_PI_TYPE2: - integrity.profile = &t10_pi_type1_crc; - break; - default: - integrity.profile = NULL; - break; - } - integrity.tuple_size = ns->ms; - blk_integrity_register(ns->disk, &integrity); - blk_queue_max_integrity_segments(ns->queue, 1); -} #else /* CONFIG_BLK_DEV_INTEGRITY */ static void nvme_dif_remap(struct request *req, void (*dif_swap)(u32 p, u32 v, struct t10_pi_tuple *pi)) @@ -581,91 +447,27 @@ static void nvme_dif_prep(u32 p, u32 v, struct t10_pi_tuple *pi) static void nvme_dif_complete(u32 p, u32 v, struct t10_pi_tuple *pi) { } -static void nvme_init_integrity(struct nvme_ns *ns) -{ -} #endif -static void req_completion(struct nvme_queue *nvmeq, void *ctx, - struct nvme_completion *cqe) -{ - struct nvme_iod *iod = ctx; - struct request *req = iod_get_private(iod); - struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); - u16 status = le16_to_cpup(&cqe->status) >> 1; - bool requeue = false; - int error = 0; - - if (unlikely(status)) { - if (!(status & NVME_SC_DNR || blk_noretry_request(req)) - && (jiffies - req->start_time) < req->timeout) { - unsigned long flags; - - requeue = true; - blk_mq_requeue_request(req); - spin_lock_irqsave(req->q->queue_lock, flags); - if (!blk_queue_stopped(req->q)) - blk_mq_kick_requeue_list(req->q); - spin_unlock_irqrestore(req->q->queue_lock, flags); - goto release_iod; - } - - if (req->cmd_type == REQ_TYPE_DRV_PRIV) { - if (cmd_rq->ctx == 
CMD_CTX_CANCELLED) - error = -EINTR; - else - error = status; - } else { - error = nvme_error_status(status); - } - } - - if (req->cmd_type == REQ_TYPE_DRV_PRIV) { - u32 result = le32_to_cpup(&cqe->result); - req->special = (void *)(uintptr_t)result; - } - - if (cmd_rq->aborted) - dev_warn(nvmeq->dev->dev, - "completing aborted command with status:%04x\n", - error); - -release_iod: - if (iod->nents) { - dma_unmap_sg(nvmeq->dev->dev, iod->sg, iod->nents, - rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - if (blk_integrity_rq(req)) { - if (!rq_data_dir(req)) - nvme_dif_remap(req, nvme_dif_complete); - dma_unmap_sg(nvmeq->dev->dev, iod->meta_sg, 1, - rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE); - } - } - nvme_free_iod(nvmeq->dev, iod); - - if (likely(!requeue)) - blk_mq_complete_request(req, error); -} - -/* length is in bytes. gfp flags indicates whether we may sleep. */ -static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, - int total_len, gfp_t gfp) +static bool nvme_setup_prps(struct nvme_dev *dev, struct request *req, + int total_len) { + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); struct dma_pool *pool; int length = total_len; struct scatterlist *sg = iod->sg; int dma_len = sg_dma_len(sg); u64 dma_addr = sg_dma_address(sg); - u32 page_size = dev->page_size; + u32 page_size = dev->ctrl.page_size; int offset = dma_addr & (page_size - 1); __le64 *prp_list; - __le64 **list = iod_list(iod); + __le64 **list = iod_list(req); dma_addr_t prp_dma; int nprps, i; length -= (page_size - offset); if (length <= 0) - return total_len; + return true; dma_len -= (page_size - offset); if (dma_len) { @@ -678,7 +480,7 @@ static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, if (length <= page_size) { iod->first_dma = dma_addr; - return total_len; + return true; } nprps = DIV_ROUND_UP(length, page_size); @@ -690,11 +492,11 @@ static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, iod->npages = 1; } - prp_list = dma_pool_alloc(pool, gfp, &prp_dma); + prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); if (!prp_list) { iod->first_dma = dma_addr; iod->npages = -1; - return (total_len - length) + page_size; + return false; } list[0] = prp_list; iod->first_dma = prp_dma; @@ -702,9 +504,9 @@ static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, for (;;) { if (i == page_size >> 3) { __le64 *old_prp_list = prp_list; - prp_list = dma_pool_alloc(pool, gfp, &prp_dma); + prp_list = dma_pool_alloc(pool, GFP_ATOMIC, &prp_dma); if (!prp_list) - return total_len - length; + return false; list[iod->npages++] = prp_list; prp_list[0] = old_prp_list[i - 1]; old_prp_list[i - 1] = cpu_to_le64(prp_dma); @@ -724,115 +526,105 @@ static int nvme_setup_prps(struct nvme_dev *dev, struct nvme_iod *iod, dma_len = sg_dma_len(sg); } - return total_len; + return true; } -static void nvme_submit_priv(struct nvme_queue *nvmeq, struct request *req, - struct nvme_iod *iod) +static int nvme_map_data(struct nvme_dev *dev, struct request *req, + struct nvme_command *cmnd) { - struct nvme_command cmnd; + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct request_queue *q = req->q; + enum dma_data_direction dma_dir = rq_data_dir(req) ? 
+ DMA_TO_DEVICE : DMA_FROM_DEVICE; + int ret = BLK_MQ_RQ_QUEUE_ERROR; - memcpy(&cmnd, req->cmd, sizeof(cmnd)); - cmnd.rw.command_id = req->tag; - if (req->nr_phys_segments) { - cmnd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); - cmnd.rw.prp2 = cpu_to_le64(iod->first_dma); - } + sg_init_table(iod->sg, req->nr_phys_segments); + iod->nents = blk_rq_map_sg(q, req, iod->sg); + if (!iod->nents) + goto out; - __nvme_submit_cmd(nvmeq, &cmnd); -} + ret = BLK_MQ_RQ_QUEUE_BUSY; + if (!dma_map_sg(dev->dev, iod->sg, iod->nents, dma_dir)) + goto out; -/* - * We reuse the small pool to allocate the 16-byte range here as it is not - * worth having a special pool for these or additional cases to handle freeing - * the iod. - */ -static void nvme_submit_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, - struct request *req, struct nvme_iod *iod) -{ - struct nvme_dsm_range *range = - (struct nvme_dsm_range *)iod_list(iod)[0]; - struct nvme_command cmnd; + if (!nvme_setup_prps(dev, req, blk_rq_bytes(req))) + goto out_unmap; - range->cattr = cpu_to_le32(0); - range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift); - range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + ret = BLK_MQ_RQ_QUEUE_ERROR; + if (blk_integrity_rq(req)) { + if (blk_rq_count_integrity_sg(q, req->bio) != 1) + goto out_unmap; - memset(&cmnd, 0, sizeof(cmnd)); - cmnd.dsm.opcode = nvme_cmd_dsm; - cmnd.dsm.command_id = req->tag; - cmnd.dsm.nsid = cpu_to_le32(ns->ns_id); - cmnd.dsm.prp1 = cpu_to_le64(iod->first_dma); - cmnd.dsm.nr = 0; - cmnd.dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); + sg_init_table(&iod->meta_sg, 1); + if (blk_rq_map_integrity_sg(q, req->bio, &iod->meta_sg) != 1) + goto out_unmap; - __nvme_submit_cmd(nvmeq, &cmnd); -} + if (rq_data_dir(req)) + nvme_dif_remap(req, nvme_dif_prep); -static void nvme_submit_flush(struct nvme_queue *nvmeq, struct nvme_ns *ns, - int cmdid) -{ - struct nvme_command cmnd; + if (!dma_map_sg(dev->dev, &iod->meta_sg, 1, dma_dir)) + goto out_unmap; + } - memset(&cmnd, 0, sizeof(cmnd)); - cmnd.common.opcode = nvme_cmd_flush; - cmnd.common.command_id = cmdid; - cmnd.common.nsid = cpu_to_le32(ns->ns_id); + cmnd->rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); + cmnd->rw.prp2 = cpu_to_le64(iod->first_dma); + if (blk_integrity_rq(req)) + cmnd->rw.metadata = cpu_to_le64(sg_dma_address(&iod->meta_sg)); + return BLK_MQ_RQ_QUEUE_OK; - __nvme_submit_cmd(nvmeq, &cmnd); +out_unmap: + dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); +out: + return ret; } -static int nvme_submit_iod(struct nvme_queue *nvmeq, struct nvme_iod *iod, - struct nvme_ns *ns) +static void nvme_unmap_data(struct nvme_dev *dev, struct request *req) { - struct request *req = iod_get_private(iod); - struct nvme_command cmnd; - u16 control = 0; - u32 dsmgmt = 0; - - if (req->cmd_flags & REQ_FUA) - control |= NVME_RW_FUA; - if (req->cmd_flags & (REQ_FAILFAST_DEV | REQ_RAHEAD)) - control |= NVME_RW_LR; - - if (req->cmd_flags & REQ_RAHEAD) - dsmgmt |= NVME_RW_DSM_FREQ_PREFETCH; - - memset(&cmnd, 0, sizeof(cmnd)); - cmnd.rw.opcode = (rq_data_dir(req) ? 
nvme_cmd_write : nvme_cmd_read); - cmnd.rw.command_id = req->tag; - cmnd.rw.nsid = cpu_to_le32(ns->ns_id); - cmnd.rw.prp1 = cpu_to_le64(sg_dma_address(iod->sg)); - cmnd.rw.prp2 = cpu_to_le64(iod->first_dma); - cmnd.rw.slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); - cmnd.rw.length = cpu_to_le16((blk_rq_bytes(req) >> ns->lba_shift) - 1); - - if (ns->ms) { - switch (ns->pi_type) { - case NVME_NS_DPS_PI_TYPE3: - control |= NVME_RW_PRINFO_PRCHK_GUARD; - break; - case NVME_NS_DPS_PI_TYPE1: - case NVME_NS_DPS_PI_TYPE2: - control |= NVME_RW_PRINFO_PRCHK_GUARD | - NVME_RW_PRINFO_PRCHK_REF; - cmnd.rw.reftag = cpu_to_le32( - nvme_block_nr(ns, blk_rq_pos(req))); - break; + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + enum dma_data_direction dma_dir = rq_data_dir(req) ? + DMA_TO_DEVICE : DMA_FROM_DEVICE; + + if (iod->nents) { + dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); + if (blk_integrity_rq(req)) { + if (!rq_data_dir(req)) + nvme_dif_remap(req, nvme_dif_complete); + dma_unmap_sg(dev->dev, &iod->meta_sg, 1, dma_dir); } - if (blk_integrity_rq(req)) - cmnd.rw.metadata = - cpu_to_le64(sg_dma_address(iod->meta_sg)); - else - control |= NVME_RW_PRINFO_PRACT; } - cmnd.rw.control = cpu_to_le16(control); - cmnd.rw.dsmgmt = cpu_to_le32(dsmgmt); + nvme_free_iod(dev, req); +} - __nvme_submit_cmd(nvmeq, &cmnd); +/* + * We reuse the small pool to allocate the 16-byte range here as it is not + * worth having a special pool for these or additional cases to handle freeing + * the iod. + */ +static int nvme_setup_discard(struct nvme_queue *nvmeq, struct nvme_ns *ns, + struct request *req, struct nvme_command *cmnd) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_dsm_range *range; - return 0; + range = dma_pool_alloc(nvmeq->dev->prp_small_pool, GFP_ATOMIC, + &iod->first_dma); + if (!range) + return BLK_MQ_RQ_QUEUE_BUSY; + iod_list(req)[0] = (__le64 *)range; + iod->npages = 0; + + range->cattr = cpu_to_le32(0); + range->nlb = cpu_to_le32(blk_rq_bytes(req) >> ns->lba_shift); + range->slba = cpu_to_le64(nvme_block_nr(ns, blk_rq_pos(req))); + + memset(cmnd, 0, sizeof(*cmnd)); + cmnd->dsm.opcode = nvme_cmd_dsm; + cmnd->dsm.nsid = cpu_to_le32(ns->ns_id); + cmnd->dsm.prp1 = cpu_to_le64(iod->first_dma); + cmnd->dsm.nr = 0; + cmnd->dsm.attributes = cpu_to_le32(NVME_DSMGMT_AD); + return BLK_MQ_RQ_QUEUE_OK; } /* @@ -845,9 +637,8 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, struct nvme_queue *nvmeq = hctx->driver_data; struct nvme_dev *dev = nvmeq->dev; struct request *req = bd->rq; - struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); - struct nvme_iod *iod; - enum dma_data_direction dma_dir; + struct nvme_command cmnd; + int ret = BLK_MQ_RQ_QUEUE_OK; /* * If formatted with metadata, require the block layer provide a buffer @@ -857,91 +648,72 @@ static int nvme_queue_rq(struct blk_mq_hw_ctx *hctx, if (ns && ns->ms && !blk_integrity_rq(req)) { if (!(ns->pi_type && ns->ms == 8) && req->cmd_type != REQ_TYPE_DRV_PRIV) { - blk_mq_complete_request(req, -EFAULT); + blk_mq_end_request(req, -EFAULT); return BLK_MQ_RQ_QUEUE_OK; } } - iod = nvme_alloc_iod(req, dev, GFP_ATOMIC); - if (!iod) - return BLK_MQ_RQ_QUEUE_BUSY; + ret = nvme_init_iod(req, dev); + if (ret) + return ret; if (req->cmd_flags & REQ_DISCARD) { - void *range; - /* - * We reuse the small pool to allocate the 16-byte range here - * as it is not worth having a special pool for these or - * additional cases to handle freeing the iod. 
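A single discard turns into exactly one 16-byte Dataset Management range descriptor, which is why the 256-byte blocks of the existing small PRP pool can be borrowed for it. For reference, the descriptor layout from linux/nvme.h (reproduced here purely for illustration):

/* One DSM range descriptor: 4 + 4 + 8 = 16 bytes. */
struct nvme_dsm_range {
	__le32	cattr;	/* context attributes */
	__le32	nlb;	/* number of logical blocks */
	__le64	slba;	/* starting LBA */
};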
- */ - range = dma_pool_alloc(dev->prp_small_pool, GFP_ATOMIC, - &iod->first_dma); - if (!range) - goto retry_cmd; - iod_list(iod)[0] = (__le64 *)range; - iod->npages = 0; - } else if (req->nr_phys_segments) { - dma_dir = rq_data_dir(req) ? DMA_TO_DEVICE : DMA_FROM_DEVICE; + ret = nvme_setup_discard(nvmeq, ns, req, &cmnd); + } else { + if (req->cmd_type == REQ_TYPE_DRV_PRIV) + memcpy(&cmnd, req->cmd, sizeof(cmnd)); + else if (req->cmd_flags & REQ_FLUSH) + nvme_setup_flush(ns, &cmnd); + else + nvme_setup_rw(ns, req, &cmnd); - sg_init_table(iod->sg, req->nr_phys_segments); - iod->nents = blk_rq_map_sg(req->q, req, iod->sg); - if (!iod->nents) - goto error_cmd; + if (req->nr_phys_segments) + ret = nvme_map_data(dev, req, &cmnd); + } - if (!dma_map_sg(nvmeq->q_dmadev, iod->sg, iod->nents, dma_dir)) - goto retry_cmd; + if (ret) + goto out; - if (blk_rq_bytes(req) != - nvme_setup_prps(dev, iod, blk_rq_bytes(req), GFP_ATOMIC)) { - dma_unmap_sg(dev->dev, iod->sg, iod->nents, dma_dir); - goto retry_cmd; - } - if (blk_integrity_rq(req)) { - if (blk_rq_count_integrity_sg(req->q, req->bio) != 1) { - dma_unmap_sg(dev->dev, iod->sg, iod->nents, - dma_dir); - goto error_cmd; - } + cmnd.common.command_id = req->tag; + blk_mq_start_request(req); - sg_init_table(iod->meta_sg, 1); - if (blk_rq_map_integrity_sg( - req->q, req->bio, iod->meta_sg) != 1) { - dma_unmap_sg(dev->dev, iod->sg, iod->nents, - dma_dir); - goto error_cmd; - } + spin_lock_irq(&nvmeq->q_lock); + __nvme_submit_cmd(nvmeq, &cmnd); + nvme_process_cq(nvmeq); + spin_unlock_irq(&nvmeq->q_lock); + return BLK_MQ_RQ_QUEUE_OK; +out: + nvme_free_iod(dev, req); + return ret; +} - if (rq_data_dir(req)) - nvme_dif_remap(req, nvme_dif_prep); +static void nvme_complete_rq(struct request *req) +{ + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_dev *dev = iod->nvmeq->dev; + int error = 0; - if (!dma_map_sg(nvmeq->q_dmadev, iod->meta_sg, 1, dma_dir)) { - dma_unmap_sg(dev->dev, iod->sg, iod->nents, - dma_dir); - goto error_cmd; - } + nvme_unmap_data(dev, req); + + if (unlikely(req->errors)) { + if (nvme_req_needs_retry(req, req->errors)) { + nvme_requeue_req(req); + return; } - } - nvme_set_info(cmd, iod, req_completion); - spin_lock_irq(&nvmeq->q_lock); - if (req->cmd_type == REQ_TYPE_DRV_PRIV) - nvme_submit_priv(nvmeq, req, iod); - else if (req->cmd_flags & REQ_DISCARD) - nvme_submit_discard(nvmeq, ns, req, iod); - else if (req->cmd_flags & REQ_FLUSH) - nvme_submit_flush(nvmeq, ns, req->tag); - else - nvme_submit_iod(nvmeq, iod, ns); + if (req->cmd_type == REQ_TYPE_DRV_PRIV) + error = req->errors; + else + error = nvme_error_status(req->errors); + } - nvme_process_cq(nvmeq); - spin_unlock_irq(&nvmeq->q_lock); - return BLK_MQ_RQ_QUEUE_OK; + if (unlikely(iod->aborted)) { + dev_warn(dev->dev, + "completing aborted command with status: %04x\n", + req->errors); + } - error_cmd: - nvme_free_iod(dev, iod); - return BLK_MQ_RQ_QUEUE_ERROR; - retry_cmd: - nvme_free_iod(dev, iod); - return BLK_MQ_RQ_QUEUE_BUSY; + blk_mq_end_request(req, error); } static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) @@ -952,20 +724,47 @@ static void __nvme_process_cq(struct nvme_queue *nvmeq, unsigned int *tag) phase = nvmeq->cq_phase; for (;;) { - void *ctx; - nvme_completion_fn fn; struct nvme_completion cqe = nvmeq->cqes[head]; - if ((le16_to_cpu(cqe.status) & 1) != phase) + u16 status = le16_to_cpu(cqe.status); + struct request *req; + + if ((status & 1) != phase) break; nvmeq->sq_head = le16_to_cpu(cqe.sq_head); if (++head == nvmeq->q_depth) { 
head = 0; phase = !phase; } + if (tag && *tag == cqe.command_id) *tag = -1; - ctx = nvme_finish_cmd(nvmeq, cqe.command_id, &fn); - fn(nvmeq, ctx, &cqe); + + if (unlikely(cqe.command_id >= nvmeq->q_depth)) { + dev_warn(nvmeq->q_dmadev, + "invalid id %d completed on queue %d\n", + cqe.command_id, le16_to_cpu(cqe.sq_id)); + continue; + } + + /* + * AEN requests are special as they don't time out and can + * survive any kind of queue freeze and often don't respond to + * aborts. We don't even bother to allocate a struct request + * for them but rather special case them here. + */ + if (unlikely(nvmeq->qid == 0 && + cqe.command_id >= NVME_AQ_BLKMQ_DEPTH)) { + nvme_complete_async_event(nvmeq->dev, &cqe); + continue; + } + + req = blk_mq_tag_to_rq(*nvmeq->tags, cqe.command_id); + if (req->cmd_type == REQ_TYPE_DRV_PRIV) { + u32 result = le32_to_cpu(cqe.result); + req->special = (void *)(uintptr_t)result; + } + blk_mq_complete_request(req, status >> 1); + } /* If the controller ignores the cq head doorbell and continuously @@ -1028,112 +827,15 @@ static int nvme_poll(struct blk_mq_hw_ctx *hctx, unsigned int tag) return 0; } -/* - * Returns 0 on success. If the result is negative, it's a Linux error code; - * if the result is positive, it's an NVM Express status code - */ -int __nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, - void *buffer, void __user *ubuffer, unsigned bufflen, - u32 *result, unsigned timeout) -{ - bool write = cmd->common.opcode & 1; - struct bio *bio = NULL; - struct request *req; - int ret; - - req = blk_mq_alloc_request(q, write, 0); - if (IS_ERR(req)) - return PTR_ERR(req); - - req->cmd_type = REQ_TYPE_DRV_PRIV; - req->cmd_flags |= REQ_FAILFAST_DRIVER; - req->__data_len = 0; - req->__sector = (sector_t) -1; - req->bio = req->biotail = NULL; - - req->timeout = timeout ? 
timeout : ADMIN_TIMEOUT; - - req->cmd = (unsigned char *)cmd; - req->cmd_len = sizeof(struct nvme_command); - req->special = (void *)0; - - if (buffer && bufflen) { - ret = blk_rq_map_kern(q, req, buffer, bufflen, - __GFP_DIRECT_RECLAIM); - if (ret) - goto out; - } else if (ubuffer && bufflen) { - ret = blk_rq_map_user(q, req, NULL, ubuffer, bufflen, - __GFP_DIRECT_RECLAIM); - if (ret) - goto out; - bio = req->bio; - } - - blk_execute_rq(req->q, NULL, req, 0); - if (bio) - blk_rq_unmap_user(bio); - if (result) - *result = (u32)(uintptr_t)req->special; - ret = req->errors; - out: - blk_mq_free_request(req); - return ret; -} - -int nvme_submit_sync_cmd(struct request_queue *q, struct nvme_command *cmd, - void *buffer, unsigned bufflen) -{ - return __nvme_submit_sync_cmd(q, cmd, buffer, NULL, bufflen, NULL, 0); -} - -static int nvme_submit_async_admin_req(struct nvme_dev *dev) +static void nvme_submit_async_event(struct nvme_dev *dev) { - struct nvme_queue *nvmeq = dev->queues[0]; struct nvme_command c; - struct nvme_cmd_info *cmd_info; - struct request *req; - - req = blk_mq_alloc_request(dev->admin_q, WRITE, - BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED); - if (IS_ERR(req)) - return PTR_ERR(req); - - req->cmd_flags |= REQ_NO_TIMEOUT; - cmd_info = blk_mq_rq_to_pdu(req); - nvme_set_info(cmd_info, NULL, async_req_completion); memset(&c, 0, sizeof(c)); c.common.opcode = nvme_admin_async_event; - c.common.command_id = req->tag; - - blk_mq_free_request(req); - __nvme_submit_cmd(nvmeq, &c); - return 0; -} - -static int nvme_submit_admin_async_cmd(struct nvme_dev *dev, - struct nvme_command *cmd, - struct async_cmd_info *cmdinfo, unsigned timeout) -{ - struct nvme_queue *nvmeq = dev->queues[0]; - struct request *req; - struct nvme_cmd_info *cmd_rq; - - req = blk_mq_alloc_request(dev->admin_q, WRITE, 0); - if (IS_ERR(req)) - return PTR_ERR(req); - - req->timeout = timeout; - cmd_rq = blk_mq_rq_to_pdu(req); - cmdinfo->req = req; - nvme_set_info(cmd_rq, cmdinfo, async_completion); - cmdinfo->status = -EINTR; + c.common.command_id = NVME_AQ_BLKMQ_DEPTH + --dev->ctrl.event_limit; - cmd->common.command_id = req->tag; - - nvme_submit_cmd(nvmeq, cmd); - return 0; + __nvme_submit_cmd(dev->queues[0], &c); } static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) @@ -1144,7 +846,7 @@ static int adapter_delete_queue(struct nvme_dev *dev, u8 opcode, u16 id) c.delete_queue.opcode = opcode; c.delete_queue.qid = cpu_to_le16(id); - return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0); + return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); } static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, @@ -1165,7 +867,7 @@ static int adapter_alloc_cq(struct nvme_dev *dev, u16 qid, c.create_cq.cq_flags = cpu_to_le16(flags); c.create_cq.irq_vector = cpu_to_le16(nvmeq->cq_vector); - return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0); + return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); } static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, @@ -1186,7 +888,7 @@ static int adapter_alloc_sq(struct nvme_dev *dev, u16 qid, c.create_sq.sq_flags = cpu_to_le16(flags); c.create_sq.cqid = cpu_to_le16(qid); - return nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0); + return nvme_submit_sync_cmd(dev->ctrl.admin_q, &c, NULL, 0); } static int adapter_delete_cq(struct nvme_dev *dev, u16 cqid) @@ -1199,195 +901,111 @@ static int adapter_delete_sq(struct nvme_dev *dev, u16 sqid) return adapter_delete_queue(dev, nvme_admin_delete_sq, sqid); } -int nvme_identify_ctrl(struct nvme_dev *dev, struct 
nvme_id_ctrl **id) -{ - struct nvme_command c = { }; - int error; - - /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ - c.identify.opcode = nvme_admin_identify; - c.identify.cns = cpu_to_le32(1); - - *id = kmalloc(sizeof(struct nvme_id_ctrl), GFP_KERNEL); - if (!*id) - return -ENOMEM; - - error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, - sizeof(struct nvme_id_ctrl)); - if (error) - kfree(*id); - return error; -} - -int nvme_identify_ns(struct nvme_dev *dev, unsigned nsid, - struct nvme_id_ns **id) -{ - struct nvme_command c = { }; - int error; - - /* gcc-4.4.4 (at least) has issues with initializers and anon unions */ - c.identify.opcode = nvme_admin_identify, - c.identify.nsid = cpu_to_le32(nsid), - - *id = kmalloc(sizeof(struct nvme_id_ns), GFP_KERNEL); - if (!*id) - return -ENOMEM; - - error = nvme_submit_sync_cmd(dev->admin_q, &c, *id, - sizeof(struct nvme_id_ns)); - if (error) - kfree(*id); - return error; -} - -int nvme_get_features(struct nvme_dev *dev, unsigned fid, unsigned nsid, - dma_addr_t dma_addr, u32 *result) -{ - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.features.opcode = nvme_admin_get_features; - c.features.nsid = cpu_to_le32(nsid); - c.features.prp1 = cpu_to_le64(dma_addr); - c.features.fid = cpu_to_le32(fid); - - return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0, - result, 0); -} - -int nvme_set_features(struct nvme_dev *dev, unsigned fid, unsigned dword11, - dma_addr_t dma_addr, u32 *result) -{ - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.features.opcode = nvme_admin_set_features; - c.features.prp1 = cpu_to_le64(dma_addr); - c.features.fid = cpu_to_le32(fid); - c.features.dword11 = cpu_to_le32(dword11); - - return __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, NULL, 0, - result, 0); -} - -int nvme_get_log_page(struct nvme_dev *dev, struct nvme_smart_log **log) +static void abort_endio(struct request *req, int error) { - struct nvme_command c = { }; - int error; - - c.common.opcode = nvme_admin_get_log_page, - c.common.nsid = cpu_to_le32(0xFFFFFFFF), - c.common.cdw10[0] = cpu_to_le32( - (((sizeof(struct nvme_smart_log) / 4) - 1) << 16) | - NVME_LOG_SMART), + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = iod->nvmeq; + u32 result = (u32)(uintptr_t)req->special; + u16 status = req->errors; - *log = kmalloc(sizeof(struct nvme_smart_log), GFP_KERNEL); - if (!*log) - return -ENOMEM; + dev_warn(nvmeq->q_dmadev, "Abort status:%x result:%x", status, result); + atomic_inc(&nvmeq->dev->ctrl.abort_limit); - error = nvme_submit_sync_cmd(dev->admin_q, &c, *log, - sizeof(struct nvme_smart_log)); - if (error) - kfree(*log); - return error; + blk_mq_free_request(req); } -/** - * nvme_abort_req - Attempt aborting a request - * - * Schedule controller reset if the command was already aborted once before and - * still hasn't been returned to the driver, or if this is the admin queue. 
- */ -static void nvme_abort_req(struct request *req) +static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) { - struct nvme_cmd_info *cmd_rq = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = cmd_rq->nvmeq; + struct nvme_iod *iod = blk_mq_rq_to_pdu(req); + struct nvme_queue *nvmeq = iod->nvmeq; struct nvme_dev *dev = nvmeq->dev; struct request *abort_req; - struct nvme_cmd_info *abort_cmd; struct nvme_command cmd; - if (!nvmeq->qid || cmd_rq->aborted) { - spin_lock(&dev_list_lock); - if (!__nvme_reset(dev)) { - dev_warn(dev->dev, - "I/O %d QID %d timeout, reset controller\n", - req->tag, nvmeq->qid); - } - spin_unlock(&dev_list_lock); - return; + /* + * Shutdown immediately if controller times out while starting. The + * reset work will see the pci device disabled when it gets the forced + * cancellation error. All outstanding requests are completed on + * shutdown, so we return BLK_EH_HANDLED. + */ + if (test_bit(NVME_CTRL_RESETTING, &dev->flags)) { + dev_warn(dev->dev, + "I/O %d QID %d timeout, disable controller\n", + req->tag, nvmeq->qid); + nvme_dev_disable(dev, false); + req->errors = NVME_SC_CANCELLED; + return BLK_EH_HANDLED; } - if (!dev->abort_limit) - return; + /* + * Shutdown the controller immediately and schedule a reset if the + * command was already aborted once before and still hasn't been + * returned to the driver, or if this is the admin queue. + */ + if (!nvmeq->qid || iod->aborted) { + dev_warn(dev->dev, + "I/O %d QID %d timeout, reset controller\n", + req->tag, nvmeq->qid); + nvme_dev_disable(dev, false); + queue_work(nvme_workq, &dev->reset_work); - abort_req = blk_mq_alloc_request(dev->admin_q, WRITE, - BLK_MQ_REQ_NOWAIT); - if (IS_ERR(abort_req)) - return; + /* + * Mark the request as handled, since the inline shutdown + * forces all outstanding requests to complete. + */ + req->errors = NVME_SC_CANCELLED; + return BLK_EH_HANDLED; + } - abort_cmd = blk_mq_rq_to_pdu(abort_req); - nvme_set_info(abort_cmd, abort_req, abort_completion); + iod->aborted = 1; + + if (atomic_dec_return(&dev->ctrl.abort_limit) < 0) { + atomic_inc(&dev->ctrl.abort_limit); + return BLK_EH_RESET_TIMER; + } memset(&cmd, 0, sizeof(cmd)); cmd.abort.opcode = nvme_admin_abort_cmd; cmd.abort.cid = req->tag; cmd.abort.sqid = cpu_to_le16(nvmeq->qid); - cmd.abort.command_id = abort_req->tag; - --dev->abort_limit; - cmd_rq->aborted = 1; + dev_warn(nvmeq->q_dmadev, "I/O %d QID %d timeout, aborting\n", + req->tag, nvmeq->qid); - dev_warn(nvmeq->q_dmadev, "Aborting I/O %d QID %d\n", req->tag, - nvmeq->qid); - nvme_submit_cmd(dev->queues[0], &cmd); + abort_req = nvme_alloc_request(dev->ctrl.admin_q, &cmd, + BLK_MQ_REQ_NOWAIT); + if (IS_ERR(abort_req)) { + atomic_inc(&dev->ctrl.abort_limit); + return BLK_EH_RESET_TIMER; + } + + abort_req->timeout = ADMIN_TIMEOUT; + abort_req->end_io_data = NULL; + blk_execute_rq_nowait(abort_req->q, NULL, abort_req, 0, abort_endio); + + /* + * The aborted req will be completed on receiving the abort req. + * We enable the timer again. If hit twice, it'll cause a device reset, + * as the device then is in a faulty state. 
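Taken together, the rewritten nvme_timeout() implements a small escalation ladder; summarized as a sketch:

/*
 * Timeout escalation, in order of checking:
 *   controller already resetting      -> disable device, complete with
 *                                        NVME_SC_CANCELLED (BLK_EH_HANDLED)
 *   admin command, or an I/O that was -> disable device, queue reset_work,
 *   aborted once before                  complete with NVME_SC_CANCELLED
 *   first expiry of a normal I/O with -> submit an Abort command, re-arm
 *   abort budget left                    the timer (BLK_EH_RESET_TIMER)
 *   abort budget exhausted            -> just re-arm the timer
 */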
+ */ + return BLK_EH_RESET_TIMER; } static void nvme_cancel_queue_ios(struct request *req, void *data, bool reserved) { struct nvme_queue *nvmeq = data; - void *ctx; - nvme_completion_fn fn; - struct nvme_cmd_info *cmd; - struct nvme_completion cqe; + int status; if (!blk_mq_request_started(req)) return; - cmd = blk_mq_rq_to_pdu(req); - - if (cmd->ctx == CMD_CTX_CANCELLED) - return; + dev_warn(nvmeq->q_dmadev, + "Cancelling I/O %d QID %d\n", req->tag, nvmeq->qid); + status = NVME_SC_ABORT_REQ; if (blk_queue_dying(req->q)) - cqe.status = cpu_to_le16((NVME_SC_ABORT_REQ | NVME_SC_DNR) << 1); - else - cqe.status = cpu_to_le16(NVME_SC_ABORT_REQ << 1); - - - dev_warn(nvmeq->q_dmadev, "Cancelling I/O %d QID %d\n", - req->tag, nvmeq->qid); - ctx = cancel_cmd_info(cmd, &fn); - fn(nvmeq, ctx, &cqe); -} - -static enum blk_eh_timer_return nvme_timeout(struct request *req, bool reserved) -{ - struct nvme_cmd_info *cmd = blk_mq_rq_to_pdu(req); - struct nvme_queue *nvmeq = cmd->nvmeq; - - dev_warn(nvmeq->q_dmadev, "Timeout I/O %d QID %d\n", req->tag, - nvmeq->qid); - spin_lock_irq(&nvmeq->q_lock); - nvme_abort_req(req); - spin_unlock_irq(&nvmeq->q_lock); - - /* - * The aborted req will be completed on receiving the abort req. - * We enable the timer again. If hit twice, it'll cause a device reset, - * as the device then is in a faulty state. - */ - return BLK_EH_RESET_TIMER; + status |= NVME_SC_DNR; + blk_mq_complete_request(req, status); } static void nvme_free_queue(struct nvme_queue *nvmeq) @@ -1430,8 +1048,8 @@ static int nvme_suspend_queue(struct nvme_queue *nvmeq) nvmeq->cq_vector = -1; spin_unlock_irq(&nvmeq->q_lock); - if (!nvmeq->qid && nvmeq->dev->admin_q) - blk_mq_freeze_queue_start(nvmeq->dev->admin_q); + if (!nvmeq->qid && nvmeq->dev->ctrl.admin_q) + blk_mq_stop_hw_queues(nvmeq->dev->ctrl.admin_q); irq_set_affinity_hint(vector, NULL); free_irq(vector, nvmeq); @@ -1447,21 +1065,20 @@ static void nvme_clear_queue(struct nvme_queue *nvmeq) spin_unlock_irq(&nvmeq->q_lock); } -static void nvme_disable_queue(struct nvme_dev *dev, int qid) +static void nvme_disable_admin_queue(struct nvme_dev *dev, bool shutdown) { - struct nvme_queue *nvmeq = dev->queues[qid]; + struct nvme_queue *nvmeq = dev->queues[0]; if (!nvmeq) return; if (nvme_suspend_queue(nvmeq)) return; - /* Don't tell the adapter to delete the admin queue. - * Don't tell a removed adapter to delete IO queues. 
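The shutdown/disable split in nvme_disable_admin_queue() corresponds to two different controller configuration transitions; a sketch of the CC register manipulation both paths boil down to (using the NVME_CC_* definitions from linux/nvme.h; the subsequent CSTS polling is omitted):

/* Sketch: a shutdown raises CC.SHN and then waits for CSTS.SHST to report
 * completion, while a plain disable clears CC.EN and waits for CSTS.RDY
 * to drop. */
static u32 nvme_cc_for_teardown(u32 cc, bool shutdown)
{
	cc &= ~NVME_CC_SHN_MASK;
	if (shutdown)
		cc |= NVME_CC_SHN_NORMAL;
	else
		cc &= ~NVME_CC_ENABLE;
	return cc;
}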
*/ - if (qid && readl(&dev->bar->csts) != -1) { - adapter_delete_sq(dev, qid); - adapter_delete_cq(dev, qid); - } + if (shutdown) + nvme_shutdown_ctrl(&dev->ctrl); + else + nvme_disable_ctrl(&dev->ctrl, lo_hi_readq( + dev->bar + NVME_REG_CAP)); spin_lock_irq(&nvmeq->q_lock); nvme_process_cq(nvmeq); @@ -1472,11 +1089,12 @@ static int nvme_cmb_qdepth(struct nvme_dev *dev, int nr_io_queues, int entry_size) { int q_depth = dev->q_depth; - unsigned q_size_aligned = roundup(q_depth * entry_size, dev->page_size); + unsigned q_size_aligned = roundup(q_depth * entry_size, + dev->ctrl.page_size); if (q_size_aligned * nr_io_queues > dev->cmb_size) { u64 mem_per_q = div_u64(dev->cmb_size, nr_io_queues); - mem_per_q = round_down(mem_per_q, dev->page_size); + mem_per_q = round_down(mem_per_q, dev->ctrl.page_size); q_depth = div_u64(mem_per_q, entry_size); /* @@ -1495,8 +1113,8 @@ static int nvme_alloc_sq_cmds(struct nvme_dev *dev, struct nvme_queue *nvmeq, int qid, int depth) { if (qid && dev->cmb && use_cmb_sqes && NVME_CMB_SQS(dev->cmbsz)) { - unsigned offset = (qid - 1) * - roundup(SQ_SIZE(depth), dev->page_size); + unsigned offset = (qid - 1) * roundup(SQ_SIZE(depth), + dev->ctrl.page_size); nvmeq->sq_dma_addr = dev->cmb_dma_addr + offset; nvmeq->sq_cmds_io = dev->cmb + offset; } else { @@ -1527,7 +1145,7 @@ static struct nvme_queue *nvme_alloc_queue(struct nvme_dev *dev, int qid, nvmeq->q_dmadev = dev->dev; nvmeq->dev = dev; snprintf(nvmeq->irqname, sizeof(nvmeq->irqname), "nvme%dq%d", - dev->instance, qid); + dev->ctrl.instance, qid); spin_lock_init(&nvmeq->q_lock); nvmeq->cq_head = 0; nvmeq->cq_phase = 1; @@ -1604,79 +1222,9 @@ static int nvme_create_queue(struct nvme_queue *nvmeq, int qid) return result; } -static int nvme_wait_ready(struct nvme_dev *dev, u64 cap, bool enabled) -{ - unsigned long timeout; - u32 bit = enabled ? NVME_CSTS_RDY : 0; - - timeout = ((NVME_CAP_TIMEOUT(cap) + 1) * HZ / 2) + jiffies; - - while ((readl(&dev->bar->csts) & NVME_CSTS_RDY) != bit) { - msleep(100); - if (fatal_signal_pending(current)) - return -EINTR; - if (time_after(jiffies, timeout)) { - dev_err(dev->dev, - "Device not ready; aborting %s\n", enabled ? - "initialisation" : "reset"); - return -ENODEV; - } - } - - return 0; -} - -/* - * If the device has been passed off to us in an enabled state, just clear - * the enabled bit. The spec says we should set the 'shutdown notification - * bits', but doing so may cause the device to complete commands to the - * admin queue ... and we don't know what memory that might be pointing at! 
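nvme_cmb_qdepth() only shrinks queues when the CMB cannot hold all submission queues at the current depth; a worked example with hypothetical numbers:

/* Worked example (hypothetical controller, 4 KiB pages, 64-byte SQ entries,
 * q_depth = 1024, 16 I/O queues):
 *   q_size_aligned = roundup(1024 * 64, 4096) = 64 KiB
 *   64 KiB * 16 = 1 MiB <= a 2 MiB CMB       -> depth stays 1024
 * With only a 512 KiB CMB instead:
 *   mem_per_q = 512 KiB / 16 = 32 KiB (already page aligned)
 *   q_depth   = 32 KiB / 64  = 512 entries per queue
 */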
- */ -static int nvme_disable_ctrl(struct nvme_dev *dev, u64 cap) -{ - dev->ctrl_config &= ~NVME_CC_SHN_MASK; - dev->ctrl_config &= ~NVME_CC_ENABLE; - writel(dev->ctrl_config, &dev->bar->cc); - - return nvme_wait_ready(dev, cap, false); -} - -static int nvme_enable_ctrl(struct nvme_dev *dev, u64 cap) -{ - dev->ctrl_config &= ~NVME_CC_SHN_MASK; - dev->ctrl_config |= NVME_CC_ENABLE; - writel(dev->ctrl_config, &dev->bar->cc); - - return nvme_wait_ready(dev, cap, true); -} - -static int nvme_shutdown_ctrl(struct nvme_dev *dev) -{ - unsigned long timeout; - - dev->ctrl_config &= ~NVME_CC_SHN_MASK; - dev->ctrl_config |= NVME_CC_SHN_NORMAL; - - writel(dev->ctrl_config, &dev->bar->cc); - - timeout = SHUTDOWN_TIMEOUT + jiffies; - while ((readl(&dev->bar->csts) & NVME_CSTS_SHST_MASK) != - NVME_CSTS_SHST_CMPLT) { - msleep(100); - if (fatal_signal_pending(current)) - return -EINTR; - if (time_after(jiffies, timeout)) { - dev_err(dev->dev, - "Device shutdown incomplete; abort shutdown\n"); - return -ENODEV; - } - } - - return 0; -} - static struct blk_mq_ops nvme_mq_admin_ops = { .queue_rq = nvme_queue_rq, + .complete = nvme_complete_rq, .map_queue = blk_mq_map_queue, .init_hctx = nvme_admin_init_hctx, .exit_hctx = nvme_admin_exit_hctx, @@ -1686,6 +1234,7 @@ static struct blk_mq_ops nvme_mq_admin_ops = { static struct blk_mq_ops nvme_mq_ops = { .queue_rq = nvme_queue_rq, + .complete = nvme_complete_rq, .map_queue = blk_mq_map_queue, .init_hctx = nvme_init_hctx, .init_request = nvme_init_request, @@ -1695,19 +1244,23 @@ static struct blk_mq_ops nvme_mq_ops = { static void nvme_dev_remove_admin(struct nvme_dev *dev) { - if (dev->admin_q && !blk_queue_dying(dev->admin_q)) { - blk_cleanup_queue(dev->admin_q); + if (dev->ctrl.admin_q && !blk_queue_dying(dev->ctrl.admin_q)) { + blk_cleanup_queue(dev->ctrl.admin_q); blk_mq_free_tag_set(&dev->admin_tagset); } } static int nvme_alloc_admin_tags(struct nvme_dev *dev) { - if (!dev->admin_q) { + if (!dev->ctrl.admin_q) { dev->admin_tagset.ops = &nvme_mq_admin_ops; dev->admin_tagset.nr_hw_queues = 1; - dev->admin_tagset.queue_depth = NVME_AQ_DEPTH - 1; - dev->admin_tagset.reserved_tags = 1; + + /* + * Subtract one to leave an empty queue entry for 'Full Queue' + * condition. See NVM-Express 1.2 specification, section 4.1.2. 
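The "Full Queue" rule referenced in the comment above is the classic one-slot-reserved ring convention: head == tail means empty, so a queue of N entries can only ever hold N - 1 outstanding commands, and the blk-mq depth must be one less than the hardware queue depth. A self-contained illustration:

	/* Demo only: NVMe-style ring occupancy with one reserved slot. */
	#include <stdbool.h>
	#include <stdio.h>

	#define QUEUE_SIZE 32			/* example hardware depth */

	static bool sq_empty(unsigned head, unsigned tail)
	{
		return head == tail;
	}

	static bool sq_full(unsigned head, unsigned tail)
	{
		return (tail + 1) % QUEUE_SIZE == head;
	}

	int main(void)
	{
		/* with head pinned at 0, only QUEUE_SIZE - 1 entries fit */
		unsigned head = 0, tail = 0, submitted = 0;

		while (!sq_full(head, tail)) {
			tail = (tail + 1) % QUEUE_SIZE;
			submitted++;
		}
		printf("slots=%d usable=%u empty=%d\n", QUEUE_SIZE, submitted,
		       sq_empty(head, tail));
		return 0;
	}
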
+ */ + dev->admin_tagset.queue_depth = NVME_AQ_BLKMQ_DEPTH - 1; dev->admin_tagset.timeout = ADMIN_TIMEOUT; dev->admin_tagset.numa_node = dev_to_node(dev->dev); dev->admin_tagset.cmd_size = nvme_cmd_size(dev); @@ -1716,18 +1269,18 @@ static int nvme_alloc_admin_tags(struct nvme_dev *dev) if (blk_mq_alloc_tag_set(&dev->admin_tagset)) return -ENOMEM; - dev->admin_q = blk_mq_init_queue(&dev->admin_tagset); - if (IS_ERR(dev->admin_q)) { + dev->ctrl.admin_q = blk_mq_init_queue(&dev->admin_tagset); + if (IS_ERR(dev->ctrl.admin_q)) { blk_mq_free_tag_set(&dev->admin_tagset); return -ENOMEM; } - if (!blk_get_queue(dev->admin_q)) { + if (!blk_get_queue(dev->ctrl.admin_q)) { nvme_dev_remove_admin(dev); - dev->admin_q = NULL; + dev->ctrl.admin_q = NULL; return -ENODEV; } } else - blk_mq_unfreeze_queue(dev->admin_q); + blk_mq_start_stopped_hw_queues(dev->ctrl.admin_q, true); return 0; } @@ -1736,31 +1289,17 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) { int result; u32 aqa; - u64 cap = lo_hi_readq(&dev->bar->cap); + u64 cap = lo_hi_readq(dev->bar + NVME_REG_CAP); struct nvme_queue *nvmeq; - /* - * default to a 4K page size, with the intention to update this - * path in the future to accommodate architectures with differing - * kernel and IO page sizes. - */ - unsigned page_shift = 12; - unsigned dev_page_min = NVME_CAP_MPSMIN(cap) + 12; - - if (page_shift < dev_page_min) { - dev_err(dev->dev, - "Minimum device page size (%u) too large for " - "host (%u)\n", 1 << dev_page_min, - 1 << page_shift); - return -ENODEV; - } - dev->subsystem = readl(&dev->bar->vs) >= NVME_VS(1, 1) ? + dev->subsystem = readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 1) ? NVME_CAP_NSSRC(cap) : 0; - if (dev->subsystem && (readl(&dev->bar->csts) & NVME_CSTS_NSSRO)) - writel(NVME_CSTS_NSSRO, &dev->bar->csts); + if (dev->subsystem && + (readl(dev->bar + NVME_REG_CSTS) & NVME_CSTS_NSSRO)) + writel(NVME_CSTS_NSSRO, dev->bar + NVME_REG_CSTS); - result = nvme_disable_ctrl(dev, cap); + result = nvme_disable_ctrl(&dev->ctrl, cap); if (result < 0) return result; @@ -1774,18 +1313,11 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) aqa = nvmeq->q_depth - 1; aqa |= aqa << 16; - dev->page_size = 1 << page_shift; - - dev->ctrl_config = NVME_CC_CSS_NVM; - dev->ctrl_config |= (page_shift - 12) << NVME_CC_MPS_SHIFT; - dev->ctrl_config |= NVME_CC_ARB_RR | NVME_CC_SHN_NONE; - dev->ctrl_config |= NVME_CC_IOSQES | NVME_CC_IOCQES; - - writel(aqa, &dev->bar->aqa); - lo_hi_writeq(nvmeq->sq_dma_addr, &dev->bar->asq); - lo_hi_writeq(nvmeq->cq_dma_addr, &dev->bar->acq); + writel(aqa, dev->bar + NVME_REG_AQA); + lo_hi_writeq(nvmeq->sq_dma_addr, dev->bar + NVME_REG_ASQ); + lo_hi_writeq(nvmeq->cq_dma_addr, dev->bar + NVME_REG_ACQ); - result = nvme_enable_ctrl(dev, cap); + result = nvme_enable_ctrl(&dev->ctrl, cap); if (result) goto free_nvmeq; @@ -1803,406 +1335,6 @@ static int nvme_configure_admin_queue(struct nvme_dev *dev) return result; } -static int nvme_submit_io(struct nvme_ns *ns, struct nvme_user_io __user *uio) -{ - struct nvme_dev *dev = ns->dev; - struct nvme_user_io io; - struct nvme_command c; - unsigned length, meta_len; - int status, write; - dma_addr_t meta_dma = 0; - void *meta = NULL; - void __user *metadata; - - if (copy_from_user(&io, uio, sizeof(io))) - return -EFAULT; - - switch (io.opcode) { - case nvme_cmd_write: - case nvme_cmd_read: - case nvme_cmd_compare: - break; - default: - return -EINVAL; - } - - length = (io.nblocks + 1) << ns->lba_shift; - meta_len = (io.nblocks + 1) * ns->ms; - metadata = (void 
__user *)(uintptr_t)io.metadata; - write = io.opcode & 1; - - if (ns->ext) { - length += meta_len; - meta_len = 0; - } - if (meta_len) { - if (((io.metadata & 3) || !io.metadata) && !ns->ext) - return -EINVAL; - - meta = dma_alloc_coherent(dev->dev, meta_len, - &meta_dma, GFP_KERNEL); - - if (!meta) { - status = -ENOMEM; - goto unmap; - } - if (write) { - if (copy_from_user(meta, metadata, meta_len)) { - status = -EFAULT; - goto unmap; - } - } - } - - memset(&c, 0, sizeof(c)); - c.rw.opcode = io.opcode; - c.rw.flags = io.flags; - c.rw.nsid = cpu_to_le32(ns->ns_id); - c.rw.slba = cpu_to_le64(io.slba); - c.rw.length = cpu_to_le16(io.nblocks); - c.rw.control = cpu_to_le16(io.control); - c.rw.dsmgmt = cpu_to_le32(io.dsmgmt); - c.rw.reftag = cpu_to_le32(io.reftag); - c.rw.apptag = cpu_to_le16(io.apptag); - c.rw.appmask = cpu_to_le16(io.appmask); - c.rw.metadata = cpu_to_le64(meta_dma); - - status = __nvme_submit_sync_cmd(ns->queue, &c, NULL, - (void __user *)(uintptr_t)io.addr, length, NULL, 0); - unmap: - if (meta) { - if (status == NVME_SC_SUCCESS && !write) { - if (copy_to_user(metadata, meta, meta_len)) - status = -EFAULT; - } - dma_free_coherent(dev->dev, meta_len, meta, meta_dma); - } - return status; -} - -static int nvme_user_cmd(struct nvme_dev *dev, struct nvme_ns *ns, - struct nvme_passthru_cmd __user *ucmd) -{ - struct nvme_passthru_cmd cmd; - struct nvme_command c; - unsigned timeout = 0; - int status; - - if (!capable(CAP_SYS_ADMIN)) - return -EACCES; - if (copy_from_user(&cmd, ucmd, sizeof(cmd))) - return -EFAULT; - - memset(&c, 0, sizeof(c)); - c.common.opcode = cmd.opcode; - c.common.flags = cmd.flags; - c.common.nsid = cpu_to_le32(cmd.nsid); - c.common.cdw2[0] = cpu_to_le32(cmd.cdw2); - c.common.cdw2[1] = cpu_to_le32(cmd.cdw3); - c.common.cdw10[0] = cpu_to_le32(cmd.cdw10); - c.common.cdw10[1] = cpu_to_le32(cmd.cdw11); - c.common.cdw10[2] = cpu_to_le32(cmd.cdw12); - c.common.cdw10[3] = cpu_to_le32(cmd.cdw13); - c.common.cdw10[4] = cpu_to_le32(cmd.cdw14); - c.common.cdw10[5] = cpu_to_le32(cmd.cdw15); - - if (cmd.timeout_ms) - timeout = msecs_to_jiffies(cmd.timeout_ms); - - status = __nvme_submit_sync_cmd(ns ? 
ns->queue : dev->admin_q, &c, - NULL, (void __user *)(uintptr_t)cmd.addr, cmd.data_len, - &cmd.result, timeout); - if (status >= 0) { - if (put_user(cmd.result, &ucmd->result)) - return -EFAULT; - } - - return status; -} - -static int nvme_subsys_reset(struct nvme_dev *dev) -{ - if (!dev->subsystem) - return -ENOTTY; - - writel(0x4E564D65, &dev->bar->nssr); /* "NVMe" */ - return 0; -} - -static int nvme_ioctl(struct block_device *bdev, fmode_t mode, unsigned int cmd, - unsigned long arg) -{ - struct nvme_ns *ns = bdev->bd_disk->private_data; - - switch (cmd) { - case NVME_IOCTL_ID: - force_successful_syscall_return(); - return ns->ns_id; - case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(ns->dev, NULL, (void __user *)arg); - case NVME_IOCTL_IO_CMD: - return nvme_user_cmd(ns->dev, ns, (void __user *)arg); - case NVME_IOCTL_SUBMIT_IO: - return nvme_submit_io(ns, (void __user *)arg); - case SG_GET_VERSION_NUM: - return nvme_sg_get_version_num((void __user *)arg); - case SG_IO: - return nvme_sg_io(ns, (void __user *)arg); - default: - return -ENOTTY; - } -} - -#ifdef CONFIG_COMPAT -static int nvme_compat_ioctl(struct block_device *bdev, fmode_t mode, - unsigned int cmd, unsigned long arg) -{ - switch (cmd) { - case SG_IO: - return -ENOIOCTLCMD; - } - return nvme_ioctl(bdev, mode, cmd, arg); -} -#else -#define nvme_compat_ioctl NULL -#endif - -static void nvme_free_dev(struct kref *kref); -static void nvme_free_ns(struct kref *kref) -{ - struct nvme_ns *ns = container_of(kref, struct nvme_ns, kref); - - if (ns->type == NVME_NS_LIGHTNVM) - nvme_nvm_unregister(ns->queue, ns->disk->disk_name); - - spin_lock(&dev_list_lock); - ns->disk->private_data = NULL; - spin_unlock(&dev_list_lock); - - kref_put(&ns->dev->kref, nvme_free_dev); - put_disk(ns->disk); - kfree(ns); -} - -static int nvme_open(struct block_device *bdev, fmode_t mode) -{ - int ret = 0; - struct nvme_ns *ns; - - spin_lock(&dev_list_lock); - ns = bdev->bd_disk->private_data; - if (!ns) - ret = -ENXIO; - else if (!kref_get_unless_zero(&ns->kref)) - ret = -ENXIO; - spin_unlock(&dev_list_lock); - - return ret; -} - -static void nvme_release(struct gendisk *disk, fmode_t mode) -{ - struct nvme_ns *ns = disk->private_data; - kref_put(&ns->kref, nvme_free_ns); -} - -static int nvme_getgeo(struct block_device *bd, struct hd_geometry *geo) -{ - /* some standard values */ - geo->heads = 1 << 6; - geo->sectors = 1 << 5; - geo->cylinders = get_capacity(bd->bd_disk) >> 11; - return 0; -} - -static void nvme_config_discard(struct nvme_ns *ns) -{ - u32 logical_block_size = queue_logical_block_size(ns->queue); - ns->queue->limits.discard_zeroes_data = 0; - ns->queue->limits.discard_alignment = logical_block_size; - ns->queue->limits.discard_granularity = logical_block_size; - blk_queue_max_discard_sectors(ns->queue, 0xffffffff); - queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, ns->queue); -} - -static int nvme_revalidate_disk(struct gendisk *disk) -{ - struct nvme_ns *ns = disk->private_data; - struct nvme_dev *dev = ns->dev; - struct nvme_id_ns *id; - u8 lbaf, pi_type; - u16 old_ms; - unsigned short bs; - - if (nvme_identify_ns(dev, ns->ns_id, &id)) { - dev_warn(dev->dev, "%s: Identify failure nvme%dn%d\n", __func__, - dev->instance, ns->ns_id); - return -ENODEV; - } - if (id->ncap == 0) { - kfree(id); - return -ENODEV; - } - - if (nvme_nvm_ns_supported(ns, id) && ns->type != NVME_NS_LIGHTNVM) { - if (nvme_nvm_register(ns->queue, disk->disk_name)) { - dev_warn(dev->dev, - "%s: LightNVM init failure\n", __func__); - kfree(id); - return -ENODEV; - } - 
ns->type = NVME_NS_LIGHTNVM; - } - - old_ms = ns->ms; - lbaf = id->flbas & NVME_NS_FLBAS_LBA_MASK; - ns->lba_shift = id->lbaf[lbaf].ds; - ns->ms = le16_to_cpu(id->lbaf[lbaf].ms); - ns->ext = ns->ms && (id->flbas & NVME_NS_FLBAS_META_EXT); - - /* - * If identify namespace failed, use default 512 byte block size so - * block layer can use before failing read/write for 0 capacity. - */ - if (ns->lba_shift == 0) - ns->lba_shift = 9; - bs = 1 << ns->lba_shift; - - /* XXX: PI implementation requires metadata equal t10 pi tuple size */ - pi_type = ns->ms == sizeof(struct t10_pi_tuple) ? - id->dps & NVME_NS_DPS_PI_MASK : 0; - - blk_mq_freeze_queue(disk->queue); - if (blk_get_integrity(disk) && (ns->pi_type != pi_type || - ns->ms != old_ms || - bs != queue_logical_block_size(disk->queue) || - (ns->ms && ns->ext))) - blk_integrity_unregister(disk); - - ns->pi_type = pi_type; - blk_queue_logical_block_size(ns->queue, bs); - - if (ns->ms && !ns->ext) - nvme_init_integrity(ns); - - if ((ns->ms && !(ns->ms == 8 && ns->pi_type) && - !blk_get_integrity(disk)) || - ns->type == NVME_NS_LIGHTNVM) - set_capacity(disk, 0); - else - set_capacity(disk, le64_to_cpup(&id->nsze) << (ns->lba_shift - 9)); - - if (dev->oncs & NVME_CTRL_ONCS_DSM) - nvme_config_discard(ns); - blk_mq_unfreeze_queue(disk->queue); - - kfree(id); - return 0; -} - -static char nvme_pr_type(enum pr_type type) -{ - switch (type) { - case PR_WRITE_EXCLUSIVE: - return 1; - case PR_EXCLUSIVE_ACCESS: - return 2; - case PR_WRITE_EXCLUSIVE_REG_ONLY: - return 3; - case PR_EXCLUSIVE_ACCESS_REG_ONLY: - return 4; - case PR_WRITE_EXCLUSIVE_ALL_REGS: - return 5; - case PR_EXCLUSIVE_ACCESS_ALL_REGS: - return 6; - default: - return 0; - } -}; - -static int nvme_pr_command(struct block_device *bdev, u32 cdw10, - u64 key, u64 sa_key, u8 op) -{ - struct nvme_ns *ns = bdev->bd_disk->private_data; - struct nvme_command c; - u8 data[16] = { 0, }; - - put_unaligned_le64(key, &data[0]); - put_unaligned_le64(sa_key, &data[8]); - - memset(&c, 0, sizeof(c)); - c.common.opcode = op; - c.common.nsid = cpu_to_le32(ns->ns_id); - c.common.cdw10[0] = cpu_to_le32(cdw10); - - return nvme_submit_sync_cmd(ns->queue, &c, data, 16); -} - -static int nvme_pr_register(struct block_device *bdev, u64 old, - u64 new, unsigned flags) -{ - u32 cdw10; - - if (flags & ~PR_FL_IGNORE_KEY) - return -EOPNOTSUPP; - - cdw10 = old ? 2 : 0; - cdw10 |= (flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0; - cdw10 |= (1 << 30) | (1 << 31); /* PTPL=1 */ - return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_register); -} - -static int nvme_pr_reserve(struct block_device *bdev, u64 key, - enum pr_type type, unsigned flags) -{ - u32 cdw10; - - if (flags & ~PR_FL_IGNORE_KEY) - return -EOPNOTSUPP; - - cdw10 = nvme_pr_type(type) << 8; - cdw10 |= ((flags & PR_FL_IGNORE_KEY) ? 1 << 3 : 0); - return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_acquire); -} - -static int nvme_pr_preempt(struct block_device *bdev, u64 old, u64 new, - enum pr_type type, bool abort) -{ - u32 cdw10 = nvme_pr_type(type) << 8 | abort ? 2 : 1; - return nvme_pr_command(bdev, cdw10, old, new, nvme_cmd_resv_acquire); -} - -static int nvme_pr_clear(struct block_device *bdev, u64 key) -{ - u32 cdw10 = 1 | (key ? 1 << 3 : 0); - return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_register); -} - -static int nvme_pr_release(struct block_device *bdev, u64 key, enum pr_type type) -{ - u32 cdw10 = nvme_pr_type(type) << 8 | key ? 
1 << 3 : 0; - return nvme_pr_command(bdev, cdw10, key, 0, nvme_cmd_resv_release); -} - -static const struct pr_ops nvme_pr_ops = { - .pr_register = nvme_pr_register, - .pr_reserve = nvme_pr_reserve, - .pr_release = nvme_pr_release, - .pr_preempt = nvme_pr_preempt, - .pr_clear = nvme_pr_clear, -}; - -static const struct block_device_operations nvme_fops = { - .owner = THIS_MODULE, - .ioctl = nvme_ioctl, - .compat_ioctl = nvme_compat_ioctl, - .open = nvme_open, - .release = nvme_release, - .getgeo = nvme_getgeo, - .revalidate_disk= nvme_revalidate_disk, - .pr_ops = &nvme_pr_ops, -}; - static int nvme_kthread(void *data) { struct nvme_dev *dev, *next; @@ -2212,14 +1344,20 @@ static int nvme_kthread(void *data) spin_lock(&dev_list_lock); list_for_each_entry_safe(dev, next, &dev_list, node) { int i; - u32 csts = readl(&dev->bar->csts); + u32 csts = readl(dev->bar + NVME_REG_CSTS); + + /* + * Skip controllers currently under reset. + */ + if (work_pending(&dev->reset_work) || work_busy(&dev->reset_work)) + continue; if ((dev->subsystem && (csts & NVME_CSTS_NSSRO)) || csts & NVME_CSTS_CFS) { - if (!__nvme_reset(dev)) { + if (queue_work(nvme_workq, &dev->reset_work)) { dev_warn(dev->dev, "Failed status: %x, reset controller\n", - readl(&dev->bar->csts)); + readl(dev->bar + NVME_REG_CSTS)); } continue; } @@ -2230,11 +1368,8 @@ static int nvme_kthread(void *data) spin_lock_irq(&nvmeq->q_lock); nvme_process_cq(nvmeq); - while ((i == 0) && (dev->event_limit > 0)) { - if (nvme_submit_async_admin_req(dev)) - break; - dev->event_limit--; - } + while (i == 0 && dev->ctrl.event_limit > 0) + nvme_submit_async_event(dev); spin_unlock_irq(&nvmeq->q_lock); } } @@ -2244,127 +1379,33 @@ static int nvme_kthread(void *data) return 0; } -static void nvme_alloc_ns(struct nvme_dev *dev, unsigned nsid) -{ - struct nvme_ns *ns; - struct gendisk *disk; - int node = dev_to_node(dev->dev); - - ns = kzalloc_node(sizeof(*ns), GFP_KERNEL, node); - if (!ns) - return; - - ns->queue = blk_mq_init_queue(&dev->tagset); - if (IS_ERR(ns->queue)) - goto out_free_ns; - queue_flag_set_unlocked(QUEUE_FLAG_NOMERGES, ns->queue); - queue_flag_set_unlocked(QUEUE_FLAG_NONROT, ns->queue); - ns->dev = dev; - ns->queue->queuedata = ns; - - disk = alloc_disk_node(0, node); - if (!disk) - goto out_free_queue; - - kref_init(&ns->kref); - ns->ns_id = nsid; - ns->disk = disk; - ns->lba_shift = 9; /* set to a default value for 512 until disk is validated */ - list_add_tail(&ns->list, &dev->namespaces); - - blk_queue_logical_block_size(ns->queue, 1 << ns->lba_shift); - if (dev->max_hw_sectors) { - blk_queue_max_hw_sectors(ns->queue, dev->max_hw_sectors); - blk_queue_max_segments(ns->queue, - (dev->max_hw_sectors / (dev->page_size >> 9)) + 1); - } - if (dev->stripe_size) - blk_queue_chunk_sectors(ns->queue, dev->stripe_size >> 9); - if (dev->vwc & NVME_CTRL_VWC_PRESENT) - blk_queue_flush(ns->queue, REQ_FLUSH | REQ_FUA); - blk_queue_virt_boundary(ns->queue, dev->page_size - 1); - - disk->major = nvme_major; - disk->first_minor = 0; - disk->fops = &nvme_fops; - disk->private_data = ns; - disk->queue = ns->queue; - disk->driverfs_dev = dev->device; - disk->flags = GENHD_FL_EXT_DEVT; - sprintf(disk->disk_name, "nvme%dn%d", dev->instance, nsid); - - /* - * Initialize capacity to 0 until we establish the namespace format and - * setup integrity extensions if necessary. The revalidate_disk after - * add_disk allows the driver to register with integrity if the format - * requires it. 
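One thing a reviewer may want to flag in the removed code a little above: nvme_pr_release() and nvme_pr_preempt() build cdw10 as nvme_pr_type(type) << 8 | cond ? a : b without parentheses. Since the conditional operator binds more loosely than '|', the shifted reservation type ends up inside the condition and is dropped from the result, whereas nvme_pr_clear() parenthesizes the same pattern correctly. A self-contained demonstration of the difference:

	/* Demo only: '?:' has lower precedence than '|' in C. */
	#include <stdio.h>

	int main(void)
	{
		unsigned type = 3, key = 1;

		/* parses as ((type << 8) | key) ? (1 << 3) : 0 */
		unsigned as_written = type << 8 | key ? 1 << 3 : 0;
		/* presumably the intended value, keeping the type field */
		unsigned intended = (type << 8) | (key ? 1 << 3 : 0);

		printf("as written: 0x%x, intended: 0x%x\n",
		       as_written, intended);
		return 0;
	}
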
- */ - set_capacity(disk, 0); - if (nvme_revalidate_disk(ns->disk)) - goto out_free_disk; - - kref_get(&dev->kref); - if (ns->type != NVME_NS_LIGHTNVM) { - add_disk(ns->disk); - if (ns->ms) { - struct block_device *bd = bdget_disk(ns->disk, 0); - if (!bd) - return; - if (blkdev_get(bd, FMODE_READ, NULL)) { - bdput(bd); - return; - } - blkdev_reread_part(bd); - blkdev_put(bd, FMODE_READ); - } - } - return; - out_free_disk: - kfree(disk); - list_del(&ns->list); - out_free_queue: - blk_cleanup_queue(ns->queue); - out_free_ns: - kfree(ns); -} - -/* - * Create I/O queues. Failing to create an I/O queue is not an issue, - * we can continue with less than the desired amount of queues, and - * even a controller without I/O queues can still be used to issue - * admin commands. This might be useful to upgrade a buggy firmware - * for example. - */ -static void nvme_create_io_queues(struct nvme_dev *dev) +static int nvme_create_io_queues(struct nvme_dev *dev) { unsigned i; + int ret = 0; - for (i = dev->queue_count; i <= dev->max_qid; i++) - if (!nvme_alloc_queue(dev, i, dev->q_depth)) + for (i = dev->queue_count; i <= dev->max_qid; i++) { + if (!nvme_alloc_queue(dev, i, dev->q_depth)) { + ret = -ENOMEM; break; + } + } - for (i = dev->online_queues; i <= dev->queue_count - 1; i++) - if (nvme_create_queue(dev->queues[i], i)) { + for (i = dev->online_queues; i <= dev->queue_count - 1; i++) { + ret = nvme_create_queue(dev->queues[i], i); + if (ret) { nvme_free_queues(dev, i); break; } -} - -static int set_queue_count(struct nvme_dev *dev, int count) -{ - int status; - u32 result; - u32 q_count = (count - 1) | ((count - 1) << 16); - - status = nvme_set_features(dev, NVME_FEAT_NUM_QUEUES, q_count, 0, - &result); - if (status < 0) - return status; - if (status > 0) { - dev_err(dev->dev, "Could not set queue count (%d)\n", status); - return 0; } - return min(result & 0xffff, result >> 16) + 1; + } + + /* + * Ignore failing Create SQ/CQ commands, we can continue with less + * than the desired amount of queues, and even a controller without + * I/O queues can still be used to issue admin commands. This might + * be useful to upgrade a buggy firmware for example. + */ + return ret >= 0 ? 0 : ret; } static void __iomem *nvme_map_cmb(struct nvme_dev *dev) @@ -2379,11 +1420,11 @@ static void __iomem *nvme_map_cmb(struct nvme_dev *dev) if (!use_cmb_sqes) return NULL; - dev->cmbsz = readl(&dev->bar->cmbsz); + dev->cmbsz = readl(dev->bar + NVME_REG_CMBSZ); if (!(NVME_CMB_SZ(dev->cmbsz))) return NULL; - cmbloc = readl(&dev->bar->cmbloc); + cmbloc = readl(dev->bar + NVME_REG_CMBLOC); szu = (u64)1 << (12 + 4 * NVME_CMB_SZU(dev->cmbsz)); size = szu * NVME_CMB_SZ(dev->cmbsz); @@ -2431,11 +1472,20 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) int result, i, vecs, nr_io_queues, size; nr_io_queues = num_possible_cpus(); - result = set_queue_count(dev, nr_io_queues); - if (result <= 0) + result = nvme_set_queue_count(&dev->ctrl, &nr_io_queues); + if (result < 0) return result; - if (result < nr_io_queues) - nr_io_queues = result; + + /* + * Degraded controllers might return an error when setting the queue + * count. We still want to be able to bring them online and offer + * access to the admin queue, as that might be the only way to fix them up. 
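The removed set_queue_count() above (and the nvme_set_queue_count() that replaces it in the core) negotiates queue counts through the Number of Queues feature, whose fields are 0's based in both directions. A self-contained demo of the arithmetic, with a made-up controller reply:

	/* Demo only: Number of Queues feature, requested vs. granted. */
	#include <stdio.h>

	static unsigned nvme_queues_granted(unsigned requested, unsigned result)
	{
		unsigned sq = (result & 0xffff) + 1;	/* SQs allowed */
		unsigned cq = (result >> 16) + 1;	/* CQs allowed */
		unsigned granted = sq < cq ? sq : cq;

		return granted < requested ? granted : requested;
	}

	int main(void)
	{
		unsigned requested = 8;
		unsigned cdw11 = (requested - 1) | ((requested - 1) << 16);
		unsigned result = 2 | (4 << 16);  /* assumed: 3 SQs, 5 CQs */

		printf("cdw11=%#x -> %u usable I/O queues\n", cdw11,
		       nvme_queues_granted(requested, result));
		return 0;
	}
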
+ */ + if (result > 0) { + dev_err(dev->dev, "Could not set queue count (%d)\n", result); + nr_io_queues = 0; + result = 0; + } if (dev->cmb && NVME_CMB_SQS(dev->cmbsz)) { result = nvme_cmb_qdepth(dev, nr_io_queues, @@ -2457,7 +1507,7 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) return -ENOMEM; size = db_bar_size(dev, nr_io_queues); } while (1); - dev->dbs = ((void __iomem *)dev->bar) + 4096; + dev->dbs = dev->bar + 4096; adminq->q_db = dev->dbs; } @@ -2501,115 +1551,115 @@ static int nvme_setup_io_queues(struct nvme_dev *dev) /* Free previously allocated queues that are no longer usable */ nvme_free_queues(dev, nr_io_queues + 1); - nvme_create_io_queues(dev); - - return 0; + return nvme_create_io_queues(dev); free_queues: nvme_free_queues(dev, 1); return result; } -static int ns_cmp(void *priv, struct list_head *a, struct list_head *b) +static void nvme_set_irq_hints(struct nvme_dev *dev) { - struct nvme_ns *nsa = container_of(a, struct nvme_ns, list); - struct nvme_ns *nsb = container_of(b, struct nvme_ns, list); + struct nvme_queue *nvmeq; + int i; - return nsa->ns_id - nsb->ns_id; -} + for (i = 0; i < dev->online_queues; i++) { + nvmeq = dev->queues[i]; -static struct nvme_ns *nvme_find_ns(struct nvme_dev *dev, unsigned nsid) -{ - struct nvme_ns *ns; + if (!nvmeq->tags || !(*nvmeq->tags)) + continue; - list_for_each_entry(ns, &dev->namespaces, list) { - if (ns->ns_id == nsid) - return ns; - if (ns->ns_id > nsid) - break; + irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector, + blk_mq_tags_cpumask(*nvmeq->tags)); } - return NULL; } -static inline bool nvme_io_incapable(struct nvme_dev *dev) +static void nvme_dev_scan(struct work_struct *work) { - return (!dev->bar || readl(&dev->bar->csts) & NVME_CSTS_CFS || - dev->online_queues < 2); + struct nvme_dev *dev = container_of(work, struct nvme_dev, scan_work); + + if (!dev->tagset.tags) + return; + nvme_scan_namespaces(&dev->ctrl); + nvme_set_irq_hints(dev); } -static void nvme_ns_remove(struct nvme_ns *ns) +static void nvme_del_queue_end(struct request *req, int error) { - bool kill = nvme_io_incapable(ns->dev) && !blk_queue_dying(ns->queue); + struct nvme_queue *nvmeq = req->end_io_data; - if (kill) { - blk_set_queue_dying(ns->queue); - - /* - * The controller was shutdown first if we got here through - * device removal. The shutdown may requeue outstanding - * requests. These need to be aborted immediately so - * del_gendisk doesn't block indefinitely for their completion. 
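For reference, the CMBSZ decoding in the nvme_map_cmb() hunk a little further up: the register reports a size in units that start at 4 KiB and grow by a factor of 16 per SZU step. A self-contained demo, with the field positions assumed to match the driver's NVME_CMB_SZ/NVME_CMB_SZU macros:

	#include <stdint.h>
	#include <stdio.h>

	/* assumed CMBSZ layout: SZ in bits 31:12, SZU in bits 11:8 */
	#define CMB_SZ(cmbsz)	(((cmbsz) >> 12) & 0xfffff)
	#define CMB_SZU(cmbsz)	(((cmbsz) >> 8) & 0xf)

	int main(void)
	{
		uint32_t cmbsz = (16 << 12) | (0 << 8);	/* 16 units of 4K */
		uint64_t szu = (uint64_t)1 << (12 + 4 * CMB_SZU(cmbsz));
		uint64_t size = szu * CMB_SZ(cmbsz);

		printf("CMB: unit=%llu bytes, total=%llu bytes\n",
		       (unsigned long long)szu, (unsigned long long)size);
		return 0;
	}
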
- */ - blk_mq_abort_requeue_list(ns->queue); - } - if (ns->disk->flags & GENHD_FL_UP) - del_gendisk(ns->disk); - if (kill || !blk_queue_dying(ns->queue)) { - blk_mq_abort_requeue_list(ns->queue); - blk_cleanup_queue(ns->queue); - } - list_del_init(&ns->list); - kref_put(&ns->kref, nvme_free_ns); + blk_mq_free_request(req); + complete(&nvmeq->dev->ioq_wait); } -static void nvme_scan_namespaces(struct nvme_dev *dev, unsigned nn) +static void nvme_del_cq_end(struct request *req, int error) { - struct nvme_ns *ns, *next; - unsigned i; + struct nvme_queue *nvmeq = req->end_io_data; - for (i = 1; i <= nn; i++) { - ns = nvme_find_ns(dev, i); - if (ns) { - if (revalidate_disk(ns->disk)) - nvme_ns_remove(ns); - } else - nvme_alloc_ns(dev, i); - } - list_for_each_entry_safe(ns, next, &dev->namespaces, list) { - if (ns->ns_id > nn) - nvme_ns_remove(ns); + if (!error) { + unsigned long flags; + + spin_lock_irqsave(&nvmeq->q_lock, flags); + nvme_process_cq(nvmeq); + spin_unlock_irqrestore(&nvmeq->q_lock, flags); } - list_sort(NULL, &dev->namespaces, ns_cmp); + + nvme_del_queue_end(req, error); } -static void nvme_set_irq_hints(struct nvme_dev *dev) +static int nvme_delete_queue(struct nvme_queue *nvmeq, u8 opcode) { - struct nvme_queue *nvmeq; - int i; + struct request_queue *q = nvmeq->dev->ctrl.admin_q; + struct request *req; + struct nvme_command cmd; - for (i = 0; i < dev->online_queues; i++) { - nvmeq = dev->queues[i]; + memset(&cmd, 0, sizeof(cmd)); + cmd.delete_queue.opcode = opcode; + cmd.delete_queue.qid = cpu_to_le16(nvmeq->qid); - if (!nvmeq->tags || !(*nvmeq->tags)) - continue; + req = nvme_alloc_request(q, &cmd, BLK_MQ_REQ_NOWAIT); + if (IS_ERR(req)) + return PTR_ERR(req); - irq_set_affinity_hint(dev->entry[nvmeq->cq_vector].vector, - blk_mq_tags_cpumask(*nvmeq->tags)); - } + req->timeout = ADMIN_TIMEOUT; + req->end_io_data = nvmeq; + + blk_execute_rq_nowait(q, NULL, req, false, + opcode == nvme_admin_delete_cq ? 
+ nvme_del_cq_end : nvme_del_queue_end); + return 0; } -static void nvme_dev_scan(struct work_struct *work) +static void nvme_disable_io_queues(struct nvme_dev *dev) { - struct nvme_dev *dev = container_of(work, struct nvme_dev, scan_work); - struct nvme_id_ctrl *ctrl; + int pass; + unsigned long timeout; + u8 opcode = nvme_admin_delete_sq; - if (!dev->tagset.tags) - return; - if (nvme_identify_ctrl(dev, &ctrl)) - return; - nvme_scan_namespaces(dev, le32_to_cpup(&ctrl->nn)); - kfree(ctrl); - nvme_set_irq_hints(dev); + for (pass = 0; pass < 2; pass++) { + int sent = 0, i = dev->queue_count - 1; + + reinit_completion(&dev->ioq_wait); + retry: + timeout = ADMIN_TIMEOUT; + for (; i > 0; i--) { + struct nvme_queue *nvmeq = dev->queues[i]; + + if (!pass) + nvme_suspend_queue(nvmeq); + if (nvme_delete_queue(nvmeq, opcode)) + break; + ++sent; + } + while (sent--) { + timeout = wait_for_completion_io_timeout(&dev->ioq_wait, timeout); + if (timeout == 0) + return; + if (i) + goto retry; + } + opcode = nvme_admin_delete_cq; + } } /* @@ -2620,42 +1670,7 @@ static void nvme_dev_scan(struct work_struct *work) */ static int nvme_dev_add(struct nvme_dev *dev) { - struct pci_dev *pdev = to_pci_dev(dev->dev); - int res; - struct nvme_id_ctrl *ctrl; - int shift = NVME_CAP_MPSMIN(lo_hi_readq(&dev->bar->cap)) + 12; - - res = nvme_identify_ctrl(dev, &ctrl); - if (res) { - dev_err(dev->dev, "Identify Controller failed (%d)\n", res); - return -EIO; - } - - dev->oncs = le16_to_cpup(&ctrl->oncs); - dev->abort_limit = ctrl->acl + 1; - dev->vwc = ctrl->vwc; - memcpy(dev->serial, ctrl->sn, sizeof(ctrl->sn)); - memcpy(dev->model, ctrl->mn, sizeof(ctrl->mn)); - memcpy(dev->firmware_rev, ctrl->fr, sizeof(ctrl->fr)); - if (ctrl->mdts) - dev->max_hw_sectors = 1 << (ctrl->mdts + shift - 9); - else - dev->max_hw_sectors = UINT_MAX; - if ((pdev->vendor == PCI_VENDOR_ID_INTEL) && - (pdev->device == 0x0953) && ctrl->vs[3]) { - unsigned int max_hw_sectors; - - dev->stripe_size = 1 << (ctrl->vs[3] + shift); - max_hw_sectors = dev->stripe_size >> (shift - 9); - if (dev->max_hw_sectors) { - dev->max_hw_sectors = min(max_hw_sectors, - dev->max_hw_sectors); - } else - dev->max_hw_sectors = max_hw_sectors; - } - kfree(ctrl); - - if (!dev->tagset.tags) { + if (!dev->ctrl.tagset) { dev->tagset.ops = &nvme_mq_ops; dev->tagset.nr_hw_queues = dev->online_queues - 1; dev->tagset.timeout = NVME_IO_TIMEOUT; @@ -2668,8 +1683,9 @@ static int nvme_dev_add(struct nvme_dev *dev) if (blk_mq_alloc_tag_set(&dev->tagset)) return 0; + dev->ctrl.tagset = &dev->tagset; } - schedule_work(&dev->scan_work); + queue_work(nvme_workq, &dev->scan_work); return 0; } @@ -2699,7 +1715,7 @@ static int nvme_dev_map(struct nvme_dev *dev) if (!dev->bar) goto disable; - if (readl(&dev->bar->csts) == -1) { + if (readl(dev->bar + NVME_REG_CSTS) == -1) { result = -ENODEV; goto unmap; } @@ -2714,10 +1730,11 @@ static int nvme_dev_map(struct nvme_dev *dev) goto unmap; } - cap = lo_hi_readq(&dev->bar->cap); + cap = lo_hi_readq(dev->bar + NVME_REG_CAP); + dev->q_depth = min_t(int, NVME_CAP_MQES(cap) + 1, NVME_Q_DEPTH); dev->db_stride = 1 << NVME_CAP_STRIDE(cap); - dev->dbs = ((void __iomem *)dev->bar) + 4096; + dev->dbs = dev->bar + 4096; /* * Temporary fix for the Apple controller found in the MacBook8,1 and @@ -2730,9 +1747,11 @@ static int nvme_dev_map(struct nvme_dev *dev) dev->q_depth); } - if (readl(&dev->bar->vs) >= NVME_VS(1, 2)) + if (readl(dev->bar + NVME_REG_VS) >= NVME_VS(1, 2)) dev->cmb = nvme_map_cmb(dev); + pci_enable_pcie_error_reporting(pdev); + 
pci_save_state(pdev); return 0; unmap: @@ -2760,152 +1779,34 @@ static void nvme_dev_unmap(struct nvme_dev *dev) pci_release_regions(pdev); } - if (pci_is_enabled(pdev)) + if (pci_is_enabled(pdev)) { + pci_disable_pcie_error_reporting(pdev); pci_disable_device(pdev); -} - -struct nvme_delq_ctx { - struct task_struct *waiter; - struct kthread_worker *worker; - atomic_t refcount; -}; - -static void nvme_wait_dq(struct nvme_delq_ctx *dq, struct nvme_dev *dev) -{ - dq->waiter = current; - mb(); - - for (;;) { - set_current_state(TASK_KILLABLE); - if (!atomic_read(&dq->refcount)) - break; - if (!schedule_timeout(ADMIN_TIMEOUT) || - fatal_signal_pending(current)) { - /* - * Disable the controller first since we can't trust it - * at this point, but leave the admin queue enabled - * until all queue deletion requests are flushed. - * FIXME: This may take a while if there are more h/w - * queues than admin tags. - */ - set_current_state(TASK_RUNNING); - nvme_disable_ctrl(dev, lo_hi_readq(&dev->bar->cap)); - nvme_clear_queue(dev->queues[0]); - flush_kthread_worker(dq->worker); - nvme_disable_queue(dev, 0); - return; - } } - set_current_state(TASK_RUNNING); -} - -static void nvme_put_dq(struct nvme_delq_ctx *dq) -{ - atomic_dec(&dq->refcount); - if (dq->waiter) - wake_up_process(dq->waiter); -} - -static struct nvme_delq_ctx *nvme_get_dq(struct nvme_delq_ctx *dq) -{ - atomic_inc(&dq->refcount); - return dq; -} - -static void nvme_del_queue_end(struct nvme_queue *nvmeq) -{ - struct nvme_delq_ctx *dq = nvmeq->cmdinfo.ctx; - nvme_put_dq(dq); - - spin_lock_irq(&nvmeq->q_lock); - nvme_process_cq(nvmeq); - spin_unlock_irq(&nvmeq->q_lock); -} - -static int adapter_async_del_queue(struct nvme_queue *nvmeq, u8 opcode, - kthread_work_func_t fn) -{ - struct nvme_command c; - - memset(&c, 0, sizeof(c)); - c.delete_queue.opcode = opcode; - c.delete_queue.qid = cpu_to_le16(nvmeq->qid); - - init_kthread_work(&nvmeq->cmdinfo.work, fn); - return nvme_submit_admin_async_cmd(nvmeq->dev, &c, &nvmeq->cmdinfo, - ADMIN_TIMEOUT); -} - -static void nvme_del_cq_work_handler(struct kthread_work *work) -{ - struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, - cmdinfo.work); - nvme_del_queue_end(nvmeq); -} - -static int nvme_delete_cq(struct nvme_queue *nvmeq) -{ - return adapter_async_del_queue(nvmeq, nvme_admin_delete_cq, - nvme_del_cq_work_handler); -} - -static void nvme_del_sq_work_handler(struct kthread_work *work) -{ - struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, - cmdinfo.work); - int status = nvmeq->cmdinfo.status; - - if (!status) - status = nvme_delete_cq(nvmeq); - if (status) - nvme_del_queue_end(nvmeq); -} - -static int nvme_delete_sq(struct nvme_queue *nvmeq) -{ - return adapter_async_del_queue(nvmeq, nvme_admin_delete_sq, - nvme_del_sq_work_handler); } -static void nvme_del_queue_start(struct kthread_work *work) +static int nvme_dev_list_add(struct nvme_dev *dev) { - struct nvme_queue *nvmeq = container_of(work, struct nvme_queue, - cmdinfo.work); - if (nvme_delete_sq(nvmeq)) - nvme_del_queue_end(nvmeq); -} + bool start_thread = false; -static void nvme_disable_io_queues(struct nvme_dev *dev) -{ - int i; - DEFINE_KTHREAD_WORKER_ONSTACK(worker); - struct nvme_delq_ctx dq; - struct task_struct *kworker_task = kthread_run(kthread_worker_fn, - &worker, "nvme%d", dev->instance); - - if (IS_ERR(kworker_task)) { - dev_err(dev->dev, - "Failed to create queue del task\n"); - for (i = dev->queue_count - 1; i > 0; i--) - nvme_disable_queue(dev, i); - return; + spin_lock(&dev_list_lock); + if 
(list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) { + start_thread = true; + nvme_thread = NULL; } + list_add(&dev->node, &dev_list); + spin_unlock(&dev_list_lock); - dq.waiter = NULL; - atomic_set(&dq.refcount, 0); - dq.worker = &worker; - for (i = dev->queue_count - 1; i > 0; i--) { - struct nvme_queue *nvmeq = dev->queues[i]; + if (start_thread) { + nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); + wake_up_all(&nvme_kthread_wait); + } else + wait_event_killable(nvme_kthread_wait, nvme_thread); - if (nvme_suspend_queue(nvmeq)) - continue; - nvmeq->cmdinfo.ctx = nvme_get_dq(&dq); - nvmeq->cmdinfo.worker = dq.worker; - init_kthread_work(&nvmeq->cmdinfo.work, nvme_del_queue_start); - queue_kthread_work(dq.worker, &nvmeq->cmdinfo.work); - } - nvme_wait_dq(&dq, dev); - kthread_stop(kworker_task); + if (IS_ERR_OR_NULL(nvme_thread)) + return nvme_thread ? PTR_ERR(nvme_thread) : -EINTR; + + return 0; } /* @@ -2928,44 +1829,17 @@ static void nvme_dev_list_remove(struct nvme_dev *dev) kthread_stop(tmp); } -static void nvme_freeze_queues(struct nvme_dev *dev) -{ - struct nvme_ns *ns; - - list_for_each_entry(ns, &dev->namespaces, list) { - blk_mq_freeze_queue_start(ns->queue); - - spin_lock_irq(ns->queue->queue_lock); - queue_flag_set(QUEUE_FLAG_STOPPED, ns->queue); - spin_unlock_irq(ns->queue->queue_lock); - - blk_mq_cancel_requeue_work(ns->queue); - blk_mq_stop_hw_queues(ns->queue); - } -} - -static void nvme_unfreeze_queues(struct nvme_dev *dev) -{ - struct nvme_ns *ns; - - list_for_each_entry(ns, &dev->namespaces, list) { - queue_flag_clear_unlocked(QUEUE_FLAG_STOPPED, ns->queue); - blk_mq_unfreeze_queue(ns->queue); - blk_mq_start_stopped_hw_queues(ns->queue, true); - blk_mq_kick_requeue_list(ns->queue); - } -} - -static void nvme_dev_shutdown(struct nvme_dev *dev) +static void nvme_dev_disable(struct nvme_dev *dev, bool shutdown) { int i; u32 csts = -1; nvme_dev_list_remove(dev); + mutex_lock(&dev->shutdown_lock); if (dev->bar) { - nvme_freeze_queues(dev); - csts = readl(&dev->bar->csts); + nvme_stop_queues(&dev->ctrl); + csts = readl(dev->bar + NVME_REG_CSTS); } if (csts & NVME_CSTS_CFS || !(csts & NVME_CSTS_RDY)) { for (i = dev->queue_count - 1; i >= 0; i--) { @@ -2974,30 +1848,13 @@ static void nvme_dev_shutdown(struct nvme_dev *dev) } } else { nvme_disable_io_queues(dev); - nvme_shutdown_ctrl(dev); - nvme_disable_queue(dev, 0); + nvme_disable_admin_queue(dev, shutdown); } nvme_dev_unmap(dev); for (i = dev->queue_count - 1; i >= 0; i--) nvme_clear_queue(dev->queues[i]); -} - -static void nvme_dev_remove(struct nvme_dev *dev) -{ - struct nvme_ns *ns, *next; - - if (nvme_io_incapable(dev)) { - /* - * If the device is not capable of IO (surprise hot-removal, - * for example), we need to quiesce prior to deleting the - * namespaces. This will end outstanding requests and prevent - * attempts to sync dirty data. 
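nvme_dev_disable() above chooses between two teardown paths based on CSTS: a controller that reports a fatal status, is not ready, or has vanished (its registers read back as all-ones) only gets its queues suspended locally, while a healthy one is sent proper Delete SQ/CQ commands followed by a shutdown or disable. A self-contained sketch of that decision:

	/* Demo only: CSTS bit positions per the NVMe spec. */
	#include <stdbool.h>
	#include <stdint.h>
	#include <stdio.h>

	#define CSTS_RDY	(1 << 0)
	#define CSTS_CFS	(1 << 1)

	static bool skip_orderly_shutdown(uint32_t csts)
	{
		if (csts == UINT32_MAX)	/* all-ones: surprise-removed */
			return true;
		return (csts & CSTS_CFS) || !(csts & CSTS_RDY);
	}

	int main(void)
	{
		printf("%d %d %d\n",
		       skip_orderly_shutdown(CSTS_RDY),		/* 0 */
		       skip_orderly_shutdown(CSTS_RDY | CSTS_CFS),	/* 1 */
		       skip_orderly_shutdown(UINT32_MAX));		/* 1 */
		return 0;
	}
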
- */ - nvme_dev_shutdown(dev); - } - list_for_each_entry_safe(ns, next, &dev->namespaces, list) - nvme_ns_remove(ns); + mutex_unlock(&dev->shutdown_lock); } static int nvme_setup_prp_pools(struct nvme_dev *dev) @@ -3023,119 +1880,36 @@ static void nvme_release_prp_pools(struct nvme_dev *dev) dma_pool_destroy(dev->prp_small_pool); } -static DEFINE_IDA(nvme_instance_ida); - -static int nvme_set_instance(struct nvme_dev *dev) -{ - int instance, error; - - do { - if (!ida_pre_get(&nvme_instance_ida, GFP_KERNEL)) - return -ENODEV; - - spin_lock(&dev_list_lock); - error = ida_get_new(&nvme_instance_ida, &instance); - spin_unlock(&dev_list_lock); - } while (error == -EAGAIN); - - if (error) - return -ENODEV; - - dev->instance = instance; - return 0; -} - -static void nvme_release_instance(struct nvme_dev *dev) +static void nvme_pci_free_ctrl(struct nvme_ctrl *ctrl) { - spin_lock(&dev_list_lock); - ida_remove(&nvme_instance_ida, dev->instance); - spin_unlock(&dev_list_lock); -} - -static void nvme_free_dev(struct kref *kref) -{ - struct nvme_dev *dev = container_of(kref, struct nvme_dev, kref); + struct nvme_dev *dev = to_nvme_dev(ctrl); put_device(dev->dev); - put_device(dev->device); - nvme_release_instance(dev); if (dev->tagset.tags) blk_mq_free_tag_set(&dev->tagset); - if (dev->admin_q) - blk_put_queue(dev->admin_q); + if (dev->ctrl.admin_q) + blk_put_queue(dev->ctrl.admin_q); kfree(dev->queues); kfree(dev->entry); kfree(dev); } -static int nvme_dev_open(struct inode *inode, struct file *f) +static void nvme_reset_work(struct work_struct *work) { - struct nvme_dev *dev; - int instance = iminor(inode); - int ret = -ENODEV; - - spin_lock(&dev_list_lock); - list_for_each_entry(dev, &dev_list, node) { - if (dev->instance == instance) { - if (!dev->admin_q) { - ret = -EWOULDBLOCK; - break; - } - if (!kref_get_unless_zero(&dev->kref)) - break; - f->private_data = dev; - ret = 0; - break; - } - } - spin_unlock(&dev_list_lock); - - return ret; -} + struct nvme_dev *dev = container_of(work, struct nvme_dev, reset_work); + int result; -static int nvme_dev_release(struct inode *inode, struct file *f) -{ - struct nvme_dev *dev = f->private_data; - kref_put(&dev->kref, nvme_free_dev); - return 0; -} + if (WARN_ON(test_bit(NVME_CTRL_RESETTING, &dev->flags))) + goto out; -static long nvme_dev_ioctl(struct file *f, unsigned int cmd, unsigned long arg) -{ - struct nvme_dev *dev = f->private_data; - struct nvme_ns *ns; - - switch (cmd) { - case NVME_IOCTL_ADMIN_CMD: - return nvme_user_cmd(dev, NULL, (void __user *)arg); - case NVME_IOCTL_IO_CMD: - if (list_empty(&dev->namespaces)) - return -ENOTTY; - ns = list_first_entry(&dev->namespaces, struct nvme_ns, list); - return nvme_user_cmd(dev, ns, (void __user *)arg); - case NVME_IOCTL_RESET: - dev_warn(dev->dev, "resetting controller\n"); - return nvme_reset(dev); - case NVME_IOCTL_SUBSYS_RESET: - return nvme_subsys_reset(dev); - default: - return -ENOTTY; - } -} + /* + * If we're called to reset a live controller first shut it down before + * moving on. 
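The reset path that follows is made single-entry: nvme_reset() further down relies on queue_work() refusing to re-queue a pending work item, and the NVME_CTRL_RESETTING flag set below catches a worker that is already running. An illustrative kernel-style sketch of the combined guard, assuming the flag semantics this patch introduces:

	/* Sketch only, not the patch's code. */
	static int nvme_try_reset(struct nvme_dev *dev)
	{
		if (test_bit(NVME_CTRL_RESETTING, &dev->flags))
			return -EBUSY;	/* reset_work currently running */
		if (!queue_work(nvme_workq, &dev->reset_work))
			return -EBUSY;	/* reset_work already queued */
		return 0;
	}
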
+ */ + if (dev->bar) + nvme_dev_disable(dev, false); -static const struct file_operations nvme_dev_fops = { - .owner = THIS_MODULE, - .open = nvme_dev_open, - .release = nvme_dev_release, - .unlocked_ioctl = nvme_dev_ioctl, - .compat_ioctl = nvme_dev_ioctl, -}; - -static void nvme_probe_work(struct work_struct *work) -{ - struct nvme_dev *dev = container_of(work, struct nvme_dev, probe_work); - bool start_thread = false; - int result; + set_bit(NVME_CTRL_RESETTING, &dev->flags); result = nvme_dev_map(dev); if (result) @@ -3145,35 +1919,24 @@ static void nvme_probe_work(struct work_struct *work) if (result) goto unmap; - spin_lock(&dev_list_lock); - if (list_empty(&dev_list) && IS_ERR_OR_NULL(nvme_thread)) { - start_thread = true; - nvme_thread = NULL; - } - list_add(&dev->node, &dev_list); - spin_unlock(&dev_list_lock); - - if (start_thread) { - nvme_thread = kthread_run(nvme_kthread, NULL, "nvme"); - wake_up_all(&nvme_kthread_wait); - } else - wait_event_killable(nvme_kthread_wait, nvme_thread); - - if (IS_ERR_OR_NULL(nvme_thread)) { - result = nvme_thread ? PTR_ERR(nvme_thread) : -EINTR; - goto disable; - } - nvme_init_queue(dev->queues[0], 0); result = nvme_alloc_admin_tags(dev); if (result) goto disable; + result = nvme_init_identify(&dev->ctrl); + if (result) + goto free_tags; + result = nvme_setup_io_queues(dev); if (result) goto free_tags; - dev->event_limit = 1; + dev->ctrl.event_limit = NVME_NR_AEN_COMMANDS; + + result = nvme_dev_list_add(dev); + if (result) + goto remove; /* * Keep the controller around but remove all namespaces if we don't have @@ -3181,117 +1944,98 @@ static void nvme_probe_work(struct work_struct *work) */ if (dev->online_queues < 2) { dev_warn(dev->dev, "IO queues not created\n"); - nvme_dev_remove(dev); + nvme_remove_namespaces(&dev->ctrl); } else { - nvme_unfreeze_queues(dev); + nvme_start_queues(&dev->ctrl); nvme_dev_add(dev); } + clear_bit(NVME_CTRL_RESETTING, &dev->flags); return; + remove: + nvme_dev_list_remove(dev); free_tags: nvme_dev_remove_admin(dev); - blk_put_queue(dev->admin_q); - dev->admin_q = NULL; + blk_put_queue(dev->ctrl.admin_q); + dev->ctrl.admin_q = NULL; dev->queues[0]->tags = NULL; disable: - nvme_disable_queue(dev, 0); - nvme_dev_list_remove(dev); + nvme_disable_admin_queue(dev, false); unmap: nvme_dev_unmap(dev); out: - if (!work_busy(&dev->reset_work)) - nvme_dead_ctrl(dev); + nvme_remove_dead_ctrl(dev); } -static int nvme_remove_dead_ctrl(void *arg) +static void nvme_remove_dead_ctrl_work(struct work_struct *work) { - struct nvme_dev *dev = (struct nvme_dev *)arg; + struct nvme_dev *dev = container_of(work, struct nvme_dev, remove_work); struct pci_dev *pdev = to_pci_dev(dev->dev); if (pci_get_drvdata(pdev)) pci_stop_and_remove_bus_device_locked(pdev); - kref_put(&dev->kref, nvme_free_dev); - return 0; + nvme_put_ctrl(&dev->ctrl); } -static void nvme_dead_ctrl(struct nvme_dev *dev) +static void nvme_remove_dead_ctrl(struct nvme_dev *dev) { - dev_warn(dev->dev, "Device failed to resume\n"); - kref_get(&dev->kref); - if (IS_ERR(kthread_run(nvme_remove_dead_ctrl, dev, "nvme%d", - dev->instance))) { - dev_err(dev->dev, - "Failed to start controller remove task\n"); - kref_put(&dev->kref, nvme_free_dev); - } + dev_warn(dev->dev, "Removing after probe failure\n"); + kref_get(&dev->ctrl.kref); + if (!schedule_work(&dev->remove_work)) + nvme_put_ctrl(&dev->ctrl); } -static void nvme_reset_work(struct work_struct *ws) +static int nvme_reset(struct nvme_dev *dev) { - struct nvme_dev *dev = container_of(ws, struct nvme_dev, reset_work); - 
bool in_probe = work_busy(&dev->probe_work); - - nvme_dev_shutdown(dev); + if (!dev->ctrl.admin_q || blk_queue_dying(dev->ctrl.admin_q)) + return -ENODEV; - /* Synchronize with device probe so that work will see failure status - * and exit gracefully without trying to schedule another reset */ - flush_work(&dev->probe_work); + if (!queue_work(nvme_workq, &dev->reset_work)) + return -EBUSY; - /* Fail this device if reset occurred during probe to avoid - * infinite initialization loops. */ - if (in_probe) { - nvme_dead_ctrl(dev); - return; - } - /* Schedule device resume asynchronously so the reset work is available - * to clean up errors that may occur during reinitialization */ - schedule_work(&dev->probe_work); + flush_work(&dev->reset_work); + return 0; } -static int __nvme_reset(struct nvme_dev *dev) +static int nvme_pci_reg_read32(struct nvme_ctrl *ctrl, u32 off, u32 *val) { - if (work_pending(&dev->reset_work)) - return -EBUSY; - list_del_init(&dev->node); - queue_work(nvme_workq, &dev->reset_work); + *val = readl(to_nvme_dev(ctrl)->bar + off); return 0; } -static int nvme_reset(struct nvme_dev *dev) +static int nvme_pci_reg_write32(struct nvme_ctrl *ctrl, u32 off, u32 val) { - int ret; - - if (!dev->admin_q || blk_queue_dying(dev->admin_q)) - return -ENODEV; - - spin_lock(&dev_list_lock); - ret = __nvme_reset(dev); - spin_unlock(&dev_list_lock); - - if (!ret) { - flush_work(&dev->reset_work); - flush_work(&dev->probe_work); - return 0; - } + writel(val, to_nvme_dev(ctrl)->bar + off); + return 0; +} - return ret; +static int nvme_pci_reg_read64(struct nvme_ctrl *ctrl, u32 off, u64 *val) +{ + *val = readq(to_nvme_dev(ctrl)->bar + off); + return 0; } -static ssize_t nvme_sysfs_reset(struct device *dev, - struct device_attribute *attr, const char *buf, - size_t count) +static bool nvme_pci_io_incapable(struct nvme_ctrl *ctrl) { - struct nvme_dev *ndev = dev_get_drvdata(dev); - int ret; + struct nvme_dev *dev = to_nvme_dev(ctrl); - ret = nvme_reset(ndev); - if (ret < 0) - return ret; + return !dev->bar || dev->online_queues < 2; +} - return count; +static int nvme_pci_reset_ctrl(struct nvme_ctrl *ctrl) +{ + return nvme_reset(to_nvme_dev(ctrl)); } -static DEVICE_ATTR(reset_controller, S_IWUSR, NULL, nvme_sysfs_reset); + +static const struct nvme_ctrl_ops nvme_pci_ctrl_ops = { + .reg_read32 = nvme_pci_reg_read32, + .reg_write32 = nvme_pci_reg_write32, + .reg_read64 = nvme_pci_reg_read64, + .io_incapable = nvme_pci_io_incapable, + .reset_ctrl = nvme_pci_reset_ctrl, + .free_ctrl = nvme_pci_free_ctrl, +}; static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) { @@ -3314,46 +2058,30 @@ static int nvme_probe(struct pci_dev *pdev, const struct pci_device_id *id) if (!dev->queues) goto free; - INIT_LIST_HEAD(&dev->namespaces); - INIT_WORK(&dev->reset_work, nvme_reset_work); dev->dev = get_device(&pdev->dev); pci_set_drvdata(pdev, dev); - result = nvme_set_instance(dev); - if (result) - goto put_pci; + + INIT_LIST_HEAD(&dev->node); + INIT_WORK(&dev->scan_work, nvme_dev_scan); + INIT_WORK(&dev->reset_work, nvme_reset_work); + INIT_WORK(&dev->remove_work, nvme_remove_dead_ctrl_work); + mutex_init(&dev->shutdown_lock); + init_completion(&dev->ioq_wait); result = nvme_setup_prp_pools(dev); if (result) - goto release; - - kref_init(&dev->kref); - dev->device = device_create(nvme_class, &pdev->dev, - MKDEV(nvme_char_major, dev->instance), - dev, "nvme%d", dev->instance); - if (IS_ERR(dev->device)) { - result = PTR_ERR(dev->device); - goto release_pools; - } - get_device(dev->device); - 
dev_set_drvdata(dev->device, dev); + goto put_pci; - result = device_create_file(dev->device, &dev_attr_reset_controller); + result = nvme_init_ctrl(&dev->ctrl, &pdev->dev, &nvme_pci_ctrl_ops, + id->driver_data); if (result) - goto put_dev; + goto release_pools; - INIT_LIST_HEAD(&dev->node); - INIT_WORK(&dev->scan_work, nvme_dev_scan); - INIT_WORK(&dev->probe_work, nvme_probe_work); - schedule_work(&dev->probe_work); + queue_work(nvme_workq, &dev->reset_work); return 0; - put_dev: - device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance)); - put_device(dev->device); release_pools: nvme_release_prp_pools(dev); - release: - nvme_release_instance(dev); put_pci: put_device(dev->dev); free: @@ -3368,15 +2096,15 @@ static void nvme_reset_notify(struct pci_dev *pdev, bool prepare) struct nvme_dev *dev = pci_get_drvdata(pdev); if (prepare) - nvme_dev_shutdown(dev); + nvme_dev_disable(dev, false); else - schedule_work(&dev->probe_work); + queue_work(nvme_workq, &dev->reset_work); } static void nvme_shutdown(struct pci_dev *pdev) { struct nvme_dev *dev = pci_get_drvdata(pdev); - nvme_dev_shutdown(dev); + nvme_dev_disable(dev, true); } static void nvme_remove(struct pci_dev *pdev) @@ -3388,34 +2116,25 @@ static void nvme_remove(struct pci_dev *pdev) spin_unlock(&dev_list_lock); pci_set_drvdata(pdev, NULL); - flush_work(&dev->probe_work); flush_work(&dev->reset_work); flush_work(&dev->scan_work); - device_remove_file(dev->device, &dev_attr_reset_controller); - nvme_dev_remove(dev); - nvme_dev_shutdown(dev); + nvme_remove_namespaces(&dev->ctrl); + nvme_uninit_ctrl(&dev->ctrl); + nvme_dev_disable(dev, true); nvme_dev_remove_admin(dev); - device_destroy(nvme_class, MKDEV(nvme_char_major, dev->instance)); nvme_free_queues(dev, 0); nvme_release_cmb(dev); nvme_release_prp_pools(dev); - kref_put(&dev->kref, nvme_free_dev); + nvme_put_ctrl(&dev->ctrl); } -/* These functions are yet to be implemented */ -#define nvme_error_detected NULL -#define nvme_dump_registers NULL -#define nvme_link_reset NULL -#define nvme_slot_reset NULL -#define nvme_error_resume NULL - #ifdef CONFIG_PM_SLEEP static int nvme_suspend(struct device *dev) { struct pci_dev *pdev = to_pci_dev(dev); struct nvme_dev *ndev = pci_get_drvdata(pdev); - nvme_dev_shutdown(ndev); + nvme_dev_disable(ndev, true); return 0; } @@ -3424,17 +2143,53 @@ static int nvme_resume(struct device *dev) struct pci_dev *pdev = to_pci_dev(dev); struct nvme_dev *ndev = pci_get_drvdata(pdev); - schedule_work(&ndev->probe_work); + queue_work(nvme_workq, &ndev->reset_work); return 0; } #endif static SIMPLE_DEV_PM_OPS(nvme_dev_pm_ops, nvme_suspend, nvme_resume); +static pci_ers_result_t nvme_error_detected(struct pci_dev *pdev, + pci_channel_state_t state) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + + /* + * A frozen channel requires a reset. When detected, this method will + * shutdown the controller to quiesce. The controller will be restarted + * after the slot reset through driver's slot_reset callback. 
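The AER callbacks being added here implement the standard PCI error-recovery handshake: error_detected() quiesces the device and asks for a reset, slot_reset() restores config space and schedules the controller reset, and resume() clears the uncorrectable error status. A self-contained sketch of the state-to-action mapping used by the switch that continues below (enum values assumed to mirror pci_channel_state):

	#include <stdio.h>

	enum chan_state { chan_normal = 1, chan_frozen, chan_perm_failure };

	static const char *nvme_aer_plan(enum chan_state state)
	{
		switch (state) {
		case chan_normal:	return "CAN_RECOVER";
		case chan_frozen:	return "disable controller, NEED_RESET";
		case chan_perm_failure:	return "DISCONNECT";
		}
		return "NEED_RESET";
	}

	int main(void)
	{
		printf("frozen -> %s\n", nvme_aer_plan(chan_frozen));
		return 0;
	}
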
+ */ + dev_warn(&pdev->dev, "error detected: state:%d\n", state); + switch (state) { + case pci_channel_io_normal: + return PCI_ERS_RESULT_CAN_RECOVER; + case pci_channel_io_frozen: + nvme_dev_disable(dev, false); + return PCI_ERS_RESULT_NEED_RESET; + case pci_channel_io_perm_failure: + return PCI_ERS_RESULT_DISCONNECT; + } + return PCI_ERS_RESULT_NEED_RESET; +} + +static pci_ers_result_t nvme_slot_reset(struct pci_dev *pdev) +{ + struct nvme_dev *dev = pci_get_drvdata(pdev); + + dev_info(&pdev->dev, "restart after slot reset\n"); + pci_restore_state(pdev); + queue_work(nvme_workq, &dev->reset_work); + return PCI_ERS_RESULT_RECOVERED; +} + +static void nvme_error_resume(struct pci_dev *pdev) +{ + pci_cleanup_aer_uncorrect_error_status(pdev); +} + static const struct pci_error_handlers nvme_err_handler = { .error_detected = nvme_error_detected, - .mmio_enabled = nvme_dump_registers, - .link_reset = nvme_link_reset, .slot_reset = nvme_slot_reset, .resume = nvme_error_resume, .reset_notify = nvme_reset_notify, @@ -3444,6 +2199,10 @@ static const struct pci_error_handlers nvme_err_handler = { #define PCI_CLASS_STORAGE_EXPRESS 0x010802 static const struct pci_device_id nvme_id_table[] = { + { PCI_VDEVICE(INTEL, 0x0953), + .driver_data = NVME_QUIRK_STRIPE_SIZE, }, + { PCI_VDEVICE(INTEL, 0x5845), /* Qemu emulated controller */ + .driver_data = NVME_QUIRK_IDENTIFY_CNS, }, { PCI_DEVICE_CLASS(PCI_CLASS_STORAGE_EXPRESS, 0xffffff) }, { PCI_DEVICE(PCI_VENDOR_ID_APPLE, 0x2001) }, { 0, } @@ -3468,40 +2227,21 @@ static int __init nvme_init(void) init_waitqueue_head(&nvme_kthread_wait); - nvme_workq = create_singlethread_workqueue("nvme"); + nvme_workq = alloc_workqueue("nvme", WQ_UNBOUND | WQ_MEM_RECLAIM, 0); if (!nvme_workq) return -ENOMEM; - result = register_blkdev(nvme_major, "nvme"); + result = nvme_core_init(); if (result < 0) goto kill_workq; - else if (result > 0) - nvme_major = result; - - result = __register_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme", - &nvme_dev_fops); - if (result < 0) - goto unregister_blkdev; - else if (result > 0) - nvme_char_major = result; - - nvme_class = class_create(THIS_MODULE, "nvme"); - if (IS_ERR(nvme_class)) { - result = PTR_ERR(nvme_class); - goto unregister_chrdev; - } result = pci_register_driver(&nvme_driver); if (result) - goto destroy_class; + goto core_exit; return 0; - destroy_class: - class_destroy(nvme_class); - unregister_chrdev: - __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); - unregister_blkdev: - unregister_blkdev(nvme_major, "nvme"); + core_exit: + nvme_core_exit(); kill_workq: destroy_workqueue(nvme_workq); return result; @@ -3510,10 +2250,8 @@ static int __init nvme_init(void) static void __exit nvme_exit(void) { pci_unregister_driver(&nvme_driver); - unregister_blkdev(nvme_major, "nvme"); + nvme_core_exit(); destroy_workqueue(nvme_workq); - class_destroy(nvme_class); - __unregister_chrdev(nvme_char_major, 0, NVME_MINORS, "nvme"); BUG_ON(nvme_thread && !IS_ERR(nvme_thread)); _nvme_check_size(); } diff --git a/drivers/nvme/host/scsi.c b/drivers/nvme/host/scsi.c index c3d8d3887a31..e947e298a737 100644 --- a/drivers/nvme/host/scsi.c +++ b/drivers/nvme/host/scsi.c @@ -524,7 +524,7 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *inq_response, int alloc_len) { - struct nvme_dev *dev = ns->dev; + struct nvme_ctrl *ctrl = ns->ctrl; struct nvme_id_ns *id_ns; int res; int nvme_sc; @@ -532,10 +532,10 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns, u8 resp_data_format = 0x02; 
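The scsi.c hunks that follow translate SCSI INQUIRY into NVMe Identify data. For orientation, a self-contained sketch of the standard-INQUIRY byte layout that nvme_trans_standard_inquiry_page() fills in; the model and firmware strings here are made up:

	#include <stdio.h>
	#include <string.h>

	int main(void)
	{
		char inq[36] = {0};

		inq[3] = 0x02;			/* response data format */
		inq[7] = 0x01 << 1;		/* CMDQUE: queueing supported */
		memcpy(&inq[8], "NVMe    ", 8);	/* T10 vendor identification */
		memcpy(&inq[16], "ExampleModel    ", 16);   /* ctrl->model */
		memcpy(&inq[32], "1.0 ", 4);	/* fw rev, trailing blanks cut */

		printf("vendor=%.8s model=%.16s fw=%.4s\n",
		       &inq[8], &inq[16], &inq[32]);
		return 0;
	}
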
u8 protect; u8 cmdque = 0x01 << 1; - u8 fw_offset = sizeof(dev->firmware_rev); + u8 fw_offset = sizeof(ctrl->firmware_rev); /* nvme ns identify - use DPS value for PROTECT field */ - nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); + nvme_sc = nvme_identify_ns(ctrl, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); if (res) return res; @@ -553,12 +553,12 @@ static int nvme_trans_standard_inquiry_page(struct nvme_ns *ns, inq_response[5] = protect; /* sccs=0 | acc=0 | tpgs=0 | pc3=0 */ inq_response[7] = cmdque; /* wbus16=0 | sync=0 | vs=0 */ strncpy(&inq_response[8], "NVMe ", 8); - strncpy(&inq_response[16], dev->model, 16); + strncpy(&inq_response[16], ctrl->model, 16); - while (dev->firmware_rev[fw_offset - 1] == ' ' && fw_offset > 4) + while (ctrl->firmware_rev[fw_offset - 1] == ' ' && fw_offset > 4) fw_offset--; fw_offset -= 4; - strncpy(&inq_response[32], dev->firmware_rev + fw_offset, 4); + strncpy(&inq_response[32], ctrl->firmware_rev + fw_offset, 4); xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); @@ -588,82 +588,113 @@ static int nvme_trans_unit_serial_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *inq_response, int alloc_len) { - struct nvme_dev *dev = ns->dev; int xfer_len; memset(inq_response, 0, STANDARD_INQUIRY_LENGTH); inq_response[1] = INQ_UNIT_SERIAL_NUMBER_PAGE; /* Page Code */ inq_response[3] = INQ_SERIAL_NUMBER_LENGTH; /* Page Length */ - strncpy(&inq_response[4], dev->serial, INQ_SERIAL_NUMBER_LENGTH); + strncpy(&inq_response[4], ns->ctrl->serial, INQ_SERIAL_NUMBER_LENGTH); xfer_len = min(alloc_len, STANDARD_INQUIRY_LENGTH); return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); } -static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, - u8 *inq_response, int alloc_len) +static int nvme_fill_device_id_eui64(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *inq_response, int alloc_len) { - struct nvme_dev *dev = ns->dev; - int res; - int nvme_sc; - int xfer_len; - __be32 tmp_id = cpu_to_be32(ns->ns_id); + struct nvme_id_ns *id_ns; + int nvme_sc, res; + size_t len; + void *eui; - memset(inq_response, 0, alloc_len); - inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; /* Page Code */ - if (readl(&dev->bar->vs) >= NVME_VS(1, 1)) { - struct nvme_id_ns *id_ns; - void *eui; - int len; + nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + return res; - nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); - res = nvme_trans_status_code(hdr, nvme_sc); - if (res) - return res; + eui = id_ns->eui64; + len = sizeof(id_ns->eui64); - eui = id_ns->eui64; - len = sizeof(id_ns->eui64); - if (readl(&dev->bar->vs) >= NVME_VS(1, 2)) { - if (bitmap_empty(eui, len * 8)) { - eui = id_ns->nguid; - len = sizeof(id_ns->nguid); - } - } + if (ns->ctrl->vs >= NVME_VS(1, 2)) { if (bitmap_empty(eui, len * 8)) { - kfree(id_ns); - goto scsi_string; + eui = id_ns->nguid; + len = sizeof(id_ns->nguid); } + } - inq_response[3] = 4 + len; /* Page Length */ - /* Designation Descriptor start */ - inq_response[4] = 0x01; /* Proto ID=0h | Code set=1h */ - inq_response[5] = 0x02; /* PIV=0b | Asso=00b | Designator Type=2h */ - inq_response[6] = 0x00; /* Rsvd */ - inq_response[7] = len; /* Designator Length */ - memcpy(&inq_response[8], eui, len); - kfree(id_ns); - } else { - scsi_string: - if (alloc_len < 72) { - return nvme_trans_completion(hdr, - SAM_STAT_CHECK_CONDITION, - ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, - 
SCSI_ASCQ_CAUSE_NOT_REPORTABLE); - } - inq_response[3] = 0x48; /* Page Length */ - /* Designation Descriptor start */ - inq_response[4] = 0x03; /* Proto ID=0h | Code set=3h */ - inq_response[5] = 0x08; /* PIV=0b | Asso=00b | Designator Type=8h */ - inq_response[6] = 0x00; /* Rsvd */ - inq_response[7] = 0x44; /* Designator Length */ - - sprintf(&inq_response[8], "%04x", to_pci_dev(dev->dev)->vendor); - memcpy(&inq_response[12], dev->model, sizeof(dev->model)); - sprintf(&inq_response[52], "%04x", tmp_id); - memcpy(&inq_response[56], dev->serial, sizeof(dev->serial)); + if (bitmap_empty(eui, len * 8)) { + res = -EOPNOTSUPP; + goto out_free_id; } - xfer_len = alloc_len; - return nvme_trans_copy_to_user(hdr, inq_response, xfer_len); + + memset(inq_response, 0, alloc_len); + inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; + inq_response[3] = 4 + len; /* Page Length */ + + /* Designation Descriptor start */ + inq_response[4] = 0x01; /* Proto ID=0h | Code set=1h */ + inq_response[5] = 0x02; /* PIV=0b | Asso=00b | Designator Type=2h */ + inq_response[6] = 0x00; /* Rsvd */ + inq_response[7] = len; /* Designator Length */ + memcpy(&inq_response[8], eui, len); + + res = nvme_trans_copy_to_user(hdr, inq_response, alloc_len); +out_free_id: + kfree(id_ns); + return res; +} + +static int nvme_fill_device_id_scsi_string(struct nvme_ns *ns, + struct sg_io_hdr *hdr, u8 *inq_response, int alloc_len) +{ + struct nvme_ctrl *ctrl = ns->ctrl; + struct nvme_id_ctrl *id_ctrl; + int nvme_sc, res; + + if (alloc_len < 72) { + return nvme_trans_completion(hdr, + SAM_STAT_CHECK_CONDITION, + ILLEGAL_REQUEST, SCSI_ASC_INVALID_CDB, + SCSI_ASCQ_CAUSE_NOT_REPORTABLE); + } + + nvme_sc = nvme_identify_ctrl(ctrl, &id_ctrl); + res = nvme_trans_status_code(hdr, nvme_sc); + if (res) + return res; + + memset(inq_response, 0, alloc_len); + inq_response[1] = INQ_DEVICE_IDENTIFICATION_PAGE; + inq_response[3] = 0x48; /* Page Length */ + + /* Designation Descriptor start */ + inq_response[4] = 0x03; /* Proto ID=0h | Code set=3h */ + inq_response[5] = 0x08; /* PIV=0b | Asso=00b | Designator Type=8h */ + inq_response[6] = 0x00; /* Rsvd */ + inq_response[7] = 0x44; /* Designator Length */ + + sprintf(&inq_response[8], "%04x", le16_to_cpu(id_ctrl->vid)); + memcpy(&inq_response[12], ctrl->model, sizeof(ctrl->model)); + sprintf(&inq_response[52], "%04x", cpu_to_be32(ns->ns_id)); + memcpy(&inq_response[56], ctrl->serial, sizeof(ctrl->serial)); + + res = nvme_trans_copy_to_user(hdr, inq_response, alloc_len); + kfree(id_ctrl); + return res; +} + +static int nvme_trans_device_id_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, + u8 *resp, int alloc_len) +{ + int res; + + if (ns->ctrl->vs >= NVME_VS(1, 1)) { + res = nvme_fill_device_id_eui64(ns, hdr, resp, alloc_len); + if (res != -EOPNOTSUPP) + return res; + } + + return nvme_fill_device_id_scsi_string(ns, hdr, resp, alloc_len); } static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, @@ -672,7 +703,7 @@ static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *inq_response; int res; int nvme_sc; - struct nvme_dev *dev = ns->dev; + struct nvme_ctrl *ctrl = ns->ctrl; struct nvme_id_ctrl *id_ctrl; struct nvme_id_ns *id_ns; int xfer_len; @@ -688,7 +719,7 @@ static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, if (inq_response == NULL) return -ENOMEM; - nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); + nvme_sc = nvme_identify_ns(ctrl, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); if (res) goto 
out_free_inq; @@ -704,7 +735,7 @@ static int nvme_trans_ext_inq_page(struct nvme_ns *ns, struct sg_io_hdr *hdr, app_chk = protect << 1; ref_chk = protect; - nvme_sc = nvme_identify_ctrl(dev, &id_ctrl); + nvme_sc = nvme_identify_ctrl(ctrl, &id_ctrl); res = nvme_trans_status_code(hdr, nvme_sc); if (res) goto out_free_inq; @@ -815,7 +846,6 @@ static int nvme_trans_log_info_exceptions(struct nvme_ns *ns, int res; int xfer_len; u8 *log_response; - struct nvme_dev *dev = ns->dev; struct nvme_smart_log *smart_log; u8 temp_c; u16 temp_k; @@ -824,7 +854,7 @@ static int nvme_trans_log_info_exceptions(struct nvme_ns *ns, if (log_response == NULL) return -ENOMEM; - res = nvme_get_log_page(dev, &smart_log); + res = nvme_get_log_page(ns->ctrl, &smart_log); if (res < 0) goto out_free_response; @@ -862,7 +892,6 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr, int res; int xfer_len; u8 *log_response; - struct nvme_dev *dev = ns->dev; struct nvme_smart_log *smart_log; u32 feature_resp; u8 temp_c_cur, temp_c_thresh; @@ -872,7 +901,7 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr, if (log_response == NULL) return -ENOMEM; - res = nvme_get_log_page(dev, &smart_log); + res = nvme_get_log_page(ns->ctrl, &smart_log); if (res < 0) goto out_free_response; @@ -886,7 +915,7 @@ static int nvme_trans_log_temperature(struct nvme_ns *ns, struct sg_io_hdr *hdr, kfree(smart_log); /* Get Features for Temp Threshold */ - res = nvme_get_features(dev, NVME_FEAT_TEMP_THRESH, 0, 0, + res = nvme_get_features(ns->ctrl, NVME_FEAT_TEMP_THRESH, 0, 0, &feature_resp); if (res != NVME_SC_SUCCESS) temp_c_thresh = LOG_TEMP_UNKNOWN; @@ -948,7 +977,6 @@ static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr, { int res; int nvme_sc; - struct nvme_dev *dev = ns->dev; struct nvme_id_ns *id_ns; u8 flbas; u32 lba_length; @@ -958,7 +986,7 @@ static int nvme_trans_fill_blk_desc(struct nvme_ns *ns, struct sg_io_hdr *hdr, else if (llbaa > 0 && len < MODE_PAGE_LLBAA_BLK_DES_LEN) return -EINVAL; - nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); + nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); if (res) return res; @@ -1014,14 +1042,13 @@ static int nvme_trans_fill_caching_page(struct nvme_ns *ns, { int res = 0; int nvme_sc; - struct nvme_dev *dev = ns->dev; u32 feature_resp; u8 vwc; if (len < MODE_PAGE_CACHING_LEN) return -EINVAL; - nvme_sc = nvme_get_features(dev, NVME_FEAT_VOLATILE_WC, 0, 0, + nvme_sc = nvme_get_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, 0, 0, &feature_resp); res = nvme_trans_status_code(hdr, nvme_sc); if (res) @@ -1207,12 +1234,11 @@ static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr, { int res; int nvme_sc; - struct nvme_dev *dev = ns->dev; struct nvme_id_ctrl *id_ctrl; int lowest_pow_st; /* max npss = lowest power consumption */ unsigned ps_desired = 0; - nvme_sc = nvme_identify_ctrl(dev, &id_ctrl); + nvme_sc = nvme_identify_ctrl(ns->ctrl, &id_ctrl); res = nvme_trans_status_code(hdr, nvme_sc); if (res) return res; @@ -1256,7 +1282,7 @@ static int nvme_trans_power_state(struct nvme_ns *ns, struct sg_io_hdr *hdr, SCSI_ASCQ_CAUSE_NOT_REPORTABLE); break; } - nvme_sc = nvme_set_features(dev, NVME_FEAT_POWER_MGMT, ps_desired, 0, + nvme_sc = nvme_set_features(ns->ctrl, NVME_FEAT_POWER_MGMT, ps_desired, 0, NULL); return nvme_trans_status_code(hdr, nvme_sc); } @@ -1280,7 +1306,6 @@ static int nvme_trans_send_download_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr u8 
buffer_id) { int nvme_sc; - struct nvme_dev *dev = ns->dev; struct nvme_command c; if (hdr->iovec_count > 0) { @@ -1297,7 +1322,7 @@ static int nvme_trans_send_download_fw_cmd(struct nvme_ns *ns, struct sg_io_hdr c.dlfw.numd = cpu_to_le32((tot_len/BYTES_TO_DWORDS) - 1); c.dlfw.offset = cpu_to_le32(offset/BYTES_TO_DWORDS); - nvme_sc = __nvme_submit_sync_cmd(dev->admin_q, &c, NULL, + nvme_sc = nvme_submit_user_cmd(ns->ctrl->admin_q, &c, hdr->dxferp, tot_len, NULL, 0); return nvme_trans_status_code(hdr, nvme_sc); } @@ -1364,14 +1389,13 @@ static int nvme_trans_modesel_get_mp(struct nvme_ns *ns, struct sg_io_hdr *hdr, { int res = 0; int nvme_sc; - struct nvme_dev *dev = ns->dev; unsigned dword11; switch (page_code) { case MODE_PAGE_CACHING: dword11 = ((mode_page[2] & CACHING_MODE_PAGE_WCE_MASK) ? 1 : 0); - nvme_sc = nvme_set_features(dev, NVME_FEAT_VOLATILE_WC, dword11, - 0, NULL); + nvme_sc = nvme_set_features(ns->ctrl, NVME_FEAT_VOLATILE_WC, + dword11, 0, NULL); res = nvme_trans_status_code(hdr, nvme_sc); break; case MODE_PAGE_CONTROL: @@ -1473,7 +1497,6 @@ static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns, { int res = 0; int nvme_sc; - struct nvme_dev *dev = ns->dev; u8 flbas; /* @@ -1486,7 +1509,7 @@ static int nvme_trans_fmt_set_blk_size_count(struct nvme_ns *ns, if (ns->mode_select_num_blocks == 0 || ns->mode_select_block_len == 0) { struct nvme_id_ns *id_ns; - nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); + nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); if (res) return res; @@ -1570,7 +1593,6 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, { int res; int nvme_sc; - struct nvme_dev *dev = ns->dev; struct nvme_id_ns *id_ns; u8 i; u8 flbas, nlbaf; @@ -1579,7 +1601,7 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, struct nvme_command c; /* Loop thru LBAF's in id_ns to match reqd lbaf, put in cdw10 */ - nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); + nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); if (res) return res; @@ -1611,7 +1633,7 @@ static int nvme_trans_fmt_send_cmd(struct nvme_ns *ns, struct sg_io_hdr *hdr, c.format.nsid = cpu_to_le32(ns->ns_id); c.format.cdw10 = cpu_to_le32(cdw10); - nvme_sc = nvme_submit_sync_cmd(dev->admin_q, &c, NULL, 0); + nvme_sc = nvme_submit_sync_cmd(ns->ctrl->admin_q, &c, NULL, 0); res = nvme_trans_status_code(hdr, nvme_sc); kfree(id_ns); @@ -1704,7 +1726,7 @@ static int nvme_trans_do_nvme_io(struct nvme_ns *ns, struct sg_io_hdr *hdr, nvme_sc = NVME_SC_LBA_RANGE; break; } - nvme_sc = __nvme_submit_sync_cmd(ns->queue, &c, NULL, + nvme_sc = nvme_submit_user_cmd(ns->queue, &c, next_mapping_addr, unit_len, NULL, 0); if (nvme_sc) break; @@ -2040,7 +2062,6 @@ static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr, u32 alloc_len; u32 resp_size; u32 xfer_len; - struct nvme_dev *dev = ns->dev; struct nvme_id_ns *id_ns; u8 *response; @@ -2052,7 +2073,7 @@ static int nvme_trans_read_capacity(struct nvme_ns *ns, struct sg_io_hdr *hdr, resp_size = READ_CAP_10_RESP_SIZE; } - nvme_sc = nvme_identify_ns(dev, ns->ns_id, &id_ns); + nvme_sc = nvme_identify_ns(ns->ctrl, ns->ns_id, &id_ns); res = nvme_trans_status_code(hdr, nvme_sc); if (res) return res; @@ -2080,7 +2101,6 @@ static int nvme_trans_report_luns(struct nvme_ns *ns, struct sg_io_hdr *hdr, int nvme_sc; u32 alloc_len, xfer_len, resp_size; u8 *response; - struct nvme_dev *dev = ns->dev; struct 
nvme_id_ctrl *id_ctrl; u32 ll_length, lun_id; u8 lun_id_offset = REPORT_LUNS_FIRST_LUN_OFFSET; @@ -2094,7 +2114,7 @@ case ALL_LUNS_RETURNED: case ALL_WELL_KNOWN_LUNS_RETURNED: case RESTRICTED_LUNS_RETURNED: - nvme_sc = nvme_identify_ctrl(dev, &id_ctrl); + nvme_sc = nvme_identify_ctrl(ns->ctrl, &id_ctrl); res = nvme_trans_status_code(hdr, nvme_sc); if (res) return res; @@ -2295,9 +2315,7 @@ static int nvme_trans_test_unit_ready(struct nvme_ns *ns, struct sg_io_hdr *hdr, u8 *cmd) { - struct nvme_dev *dev = ns->dev; - - if (!(readl(&dev->bar->csts) & NVME_CSTS_RDY)) + if (!nvme_ctrl_ready(ns->ctrl)) return nvme_trans_completion(hdr, SAM_STAT_CHECK_CONDITION, NOT_READY, SCSI_ASC_LUN_NOT_READY, SCSI_ASCQ_CAUSE_NOT_REPORTABLE); diff --git a/drivers/oprofile/oprofilefs.c b/drivers/oprofile/oprofilefs.c index dd92c5edf219..b48ac6300c79 100644 --- a/drivers/oprofile/oprofilefs.c +++ b/drivers/oprofile/oprofilefs.c @@ -138,22 +138,22 @@ static int __oprofilefs_create_file(struct dentry *root, char const *name, struct dentry *dentry; struct inode *inode; - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); dentry = d_alloc_name(root, name); if (!dentry) { - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); return -ENOMEM; } inode = oprofilefs_get_inode(root->d_sb, S_IFREG | perm); if (!inode) { dput(dentry); - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); return -ENOMEM; } inode->i_fop = fops; inode->i_private = priv; d_add(dentry, inode); - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); return 0; } @@ -215,22 +215,22 @@ struct dentry *oprofilefs_mkdir(struct dentry *parent, char const *name) struct dentry *dentry; struct inode *inode; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); dentry = d_alloc_name(parent, name); if (!dentry) { - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); return NULL; } inode = oprofilefs_get_inode(parent->d_sb, S_IFDIR | 0755); if (!inode) { dput(dentry); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); return NULL; } inode->i_op = &simple_dir_inode_operations; inode->i_fop = &simple_dir_operations; d_add(dentry, inode); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); return dentry; } diff --git a/drivers/scsi/3w-xxxx.c b/drivers/scsi/3w-xxxx.c index 2940bd769936..25aba1613e21 100644 --- a/drivers/scsi/3w-xxxx.c +++ b/drivers/scsi/3w-xxxx.c @@ -1045,6 +1045,9 @@ static int tw_chrdev_open(struct inode *inode, struct file *file) static const struct file_operations tw_fops = { .owner = THIS_MODULE, .unlocked_ioctl = tw_chrdev_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = tw_chrdev_ioctl, +#endif .open = tw_chrdev_open, .release = NULL, .llseek = noop_llseek, diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig index c1fe0d2f90ca..e2f31c93717d 100644 --- a/drivers/scsi/Kconfig +++ b/drivers/scsi/Kconfig @@ -1106,6 +1106,7 @@ config SCSI_IPR tristate "IBM Power Linux RAID adapter support" depends on PCI && SCSI && ATA select FW_LOADER + select IRQ_POLL ---help--- This driver supports the IBM Power Linux family RAID adapters. This includes IBM pSeries 5712, 5703, 5709, and 570A, as well @@ -1620,23 +1621,6 @@ config ATARI_SCSI ST-DMA, replacing ACSI). It does NOT support other schemes, like in the Hades (without DMA).
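The oprofilefs hunks above are part of the tree-wide switch away from touching ->i_mutex directly. At the point of this series the new helpers are thin wrappers around the old mutex, roughly as defined in include/linux/fs.h:

	static inline void inode_lock(struct inode *inode)
	{
		mutex_lock(&inode->i_mutex);
	}

	static inline void inode_unlock(struct inode *inode)
	{
		mutex_unlock(&inode->i_mutex);
	}

The indirection is what lets the VFS later change the underlying primitive without touching every caller again.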
-config ATARI_SCSI_TOSHIBA_DELAY - bool "Long delays for Toshiba CD-ROMs" - depends on ATARI_SCSI - help - This option increases the delay after a SCSI arbitration to - accommodate some flaky Toshiba CD-ROM drives. Say Y if you intend to - use a Toshiba CD-ROM drive; otherwise, the option is not needed and - would impact performance a bit, so say N. - -config ATARI_SCSI_RESET_BOOT - bool "Reset SCSI-devices at boottime" - depends on ATARI_SCSI - help - Reset the devices on your Atari whenever it boots. This makes the - boot process fractionally longer but may assist recovery from errors - that leave the devices with SCSI operations partway completed. - config MAC_SCSI tristate "Macintosh NCR5380 SCSI" depends on MAC && SCSI=y diff --git a/drivers/scsi/NCR5380.c b/drivers/scsi/NCR5380.c index a777e5c412df..d72867257346 100644 --- a/drivers/scsi/NCR5380.c +++ b/drivers/scsi/NCR5380.c @@ -1,17 +1,17 @@ -/* +/* * NCR 5380 generic driver routines. These should make it *trivial* - * to implement 5380 SCSI drivers under Linux with a non-trantor - * architecture. + * to implement 5380 SCSI drivers under Linux with a non-trantor + * architecture. * - * Note that these routines also work with NR53c400 family chips. + * Note that these routines also work with NR53c400 family chips. * * Copyright 1993, Drew Eckhardt - * Visionary Computing - * (Unix and Linux consulting and custom programming) - * drew@colorado.edu - * +1 (303) 666-5836 + * Visionary Computing + * (Unix and Linux consulting and custom programming) + * drew@colorado.edu + * +1 (303) 666-5836 * - * For more information, please consult + * For more information, please consult * * NCR 5380 Family * SCSI Protocol Controller @@ -25,84 +25,28 @@ */ /* - * Revision 1.10 1998/9/2 Alan Cox - * (alan@lxorguk.ukuu.org.uk) - * Fixed up the timer lockups reported so far. Things still suck. Looking - * forward to 2.3 and per device request queues. Then it'll be possible to - * SMP thread this beast and improve life no end. - - * Revision 1.9 1997/7/27 Ronald van Cuijlenborg - * (ronald.van.cuijlenborg@tip.nl or nutty@dds.nl) - * (hopefully) fixed and enhanced USLEEP - * added support for DTC3181E card (for Mustek scanner) - * - - * Revision 1.8 Ingmar Baumgart - * (ingmar@gonzo.schwaben.de) - * added support for NCR53C400a card - * - - * Revision 1.7 1996/3/2 Ray Van Tassle (rayvt@comm.mot.com) - * added proc_info - * added support needed for DTC 3180/3280 - * fixed a couple of bugs - * - - * Revision 1.5 1994/01/19 09:14:57 drew - * Fixed udelay() hack that was being used on DATAOUT phases - * instead of a proper wait for the final handshake. - * - * Revision 1.4 1994/01/19 06:44:25 drew - * *** empty log message *** - * - * Revision 1.3 1994/01/19 05:24:40 drew - * Added support for TCR LAST_BYTE_SENT bit. - * - * Revision 1.2 1994/01/15 06:14:11 drew - * REAL DMA support, bug fixes. - * - * Revision 1.1 1994/01/15 06:00:54 drew - * Initial revision - * + * With contributions from Ray Van Tassle, Ingmar Baumgart, + * Ronald van Cuijlenborg, Alan Cox and others. */ /* - * Further development / testing that should be done : + * Further development / testing that should be done : * 1. Cleanup the NCR5380_transfer_dma function and DMA operation complete - * code so that everything does the same thing that's done at the - * end of a pseudo-DMA read operation. + * code so that everything does the same thing that's done at the + * end of a pseudo-DMA read operation. * * 2. 
Fix REAL_DMA (interrupt driven, polled works fine) - - * basically, transfer size needs to be reduced by one - * and the last byte read as is done with PSEUDO_DMA. - * - * 4. Test SCSI-II tagged queueing (I have no devices which support - * tagged queueing) - * - * 5. Test linked command handling code after Eric is ready with - * the high level code. + * basically, transfer size needs to be reduced by one + * and the last byte read as is done with PSEUDO_DMA. + * + * 4. Test SCSI-II tagged queueing (I have no devices which support + * tagged queueing) */ -#include <scsi/scsi_dbg.h> -#include <scsi/scsi_transport_spi.h> - -#if (NDEBUG & NDEBUG_LISTS) -#define LIST(x,y) {printk("LINE:%d Adding %p to %p\n", __LINE__, (void*)(x), (void*)(y)); if ((x)==(y)) udelay(5); } -#define REMOVE(w,x,y,z) {printk("LINE:%d Removing: %p->%p %p->%p \n", __LINE__, (void*)(w), (void*)(x), (void*)(y), (void*)(z)); if ((x)==(y)) udelay(5); } -#else -#define LIST(x,y) -#define REMOVE(w,x,y,z) -#endif #ifndef notyet -#undef LINKED #undef REAL_DMA #endif -#ifdef REAL_DMA_POLL -#undef READ_OVERRUNS -#define READ_OVERRUNS -#endif - #ifdef BOARD_REQUIRES_NO_DELAY #define io_recovery_delay(x) #else @@ -112,44 +56,28 @@ /* * Design * - * This is a generic 5380 driver. To use it on a different platform, + * This is a generic 5380 driver. To use it on a different platform, * one simply writes appropriate system specific macros (ie, data - * transfer - some PC's will use the I/O bus, 68K's must use + * transfer - some PC's will use the I/O bus, 68K's must use * memory mapped) and drops this file in their 'C' wrapper. * - * (Note from hch: unfortunately it was not enough for the different - * m68k folks and instead of improving this driver they copied it - * and hacked it up for their needs. As a consequence they lost - * most updates to this driver. Maybe someone will fix all these - * drivers to use a common core one day..) - * - * As far as command queueing, two queues are maintained for + * As far as command queueing, two queues are maintained for * each 5380 in the system - commands that haven't been issued yet, - * and commands that are currently executing. This means that an - * unlimited number of commands may be queued, letting - * more commands propagate from the higher driver levels giving higher - * throughput. Note that both I_T_L and I_T_L_Q nexuses are supported, - * allowing multiple commands to propagate all the way to a SCSI-II device + * and commands that are currently executing. This means that an + * unlimited number of commands may be queued, letting + * more commands propagate from the higher driver levels giving higher + * throughput. Note that both I_T_L and I_T_L_Q nexuses are supported, + * allowing multiple commands to propagate all the way to a SCSI-II device * while a command is already executing. * * - * Issues specific to the NCR5380 : + * Issues specific to the NCR5380 : * - * When used in a PIO or pseudo-dma mode, the NCR5380 is a braindead - * piece of hardware that requires you to sit in a loop polling for - * the REQ signal as long as you are connected. Some devices are - * brain dead (ie, many TEXEL CD ROM drives) and won't disconnect - * while doing long seek operations. - * - * The workaround for this is to keep track of devices that have - * disconnected. If the device hasn't disconnected, for commands that - * should disconnect, we do something like - * - * while (!REQ is asserted) { sleep for N usecs; poll for M usecs } - * - * Some tweaking of N and M needs to be done. 
An algorithm based - on "time to data" would give the best results as long as short time - to datas (ie, on the same track) were considered, however these + * When used in a PIO or pseudo-dma mode, the NCR5380 is a braindead + * piece of hardware that requires you to sit in a loop polling for + * the REQ signal as long as you are connected. Some devices are + * brain dead (ie, many TEXEL CD ROM drives) and won't disconnect + * while doing long seek operations. [...] These * broken devices are the exception rather than the rule and I'd rather * spend my time optimizing for the normal case. * @@ -159,23 +87,23 @@ * which is started from a workqueue for each NCR5380 host in the * system. It attempts to establish I_T_L or I_T_L_Q nexuses by * removing the commands from the issue queue and calling - * NCR5380_select() if a nexus is not established. + * NCR5380_select() if a nexus is not established. * * Once a nexus is established, the NCR5380_information_transfer() * phase goes through the various phases as instructed by the target. * If the target goes into MSG IN and sends a DISCONNECT message, * the command structure is placed into the per instance disconnected - * queue, and NCR5380_main tries to find more work. If the target is + * queue, and NCR5380_main tries to find more work. If the target is * idle for too long, the system will try to sleep. * * If a command has disconnected, eventually an interrupt will trigger, * calling NCR5380_intr() which will in turn call NCR5380_reselect * to reestablish a nexus. This will run main if necessary. * - * On command termination, the done function will be called as + * On command termination, the done function will be called as * appropriate. * - * SCSI pointers are maintained in the SCp field of SCSI command + * SCSI pointers are maintained in the SCp field of SCSI command * structures, being initialized after the command is connected * in NCR5380_select, and set as appropriate in NCR5380_information_transfer. * Note that in violation of the standard, an implicit SAVE POINTERS operation @@ -185,73 +113,48 @@ /* * Using this file : * This file is a skeleton Linux SCSI driver for the NCR 5380 series - of chips. To use it, you write an architecture specific functions + of chips. To use it, you write architecture specific functions * and macros and include this file in your driver. * - * These macros control options : - * AUTOPROBE_IRQ - if defined, the NCR5380_probe_irq() function will be - * defined. - * + * These macros control options : + * AUTOPROBE_IRQ - if defined, the NCR5380_probe_irq() function will be + * defined. + * * AUTOSENSE - if defined, REQUEST SENSE will be performed automatically - * for commands that return with a CHECK CONDITION status. + * for commands that return with a CHECK CONDITION status. * * DIFFERENTIAL - if defined, NCR53c81 chips will use external differential - * transceivers. + * transceivers. * * DONT_USE_INTR - if defined, never use interrupts, even if we probe or - * override-configure an IRQ. - * - * LIMIT_TRANSFERSIZE - if defined, limit the pseudo-dma transfers to 512 - * bytes at a time. Since interrupts are disabled by default during - * these transfers, we might need this to give reasonable interrupt - * service time if the transfer size gets too large. - * - * LINKED - if defined, linked commands are supported. + * override-configure an IRQ. * * PSEUDO_DMA - if defined, PSEUDO DMA is used during the data transfer phases. * * REAL_DMA - if defined, REAL DMA is used during the data transfer phases.
* * REAL_DMA_POLL - if defined, REAL DMA is used but the driver doesn't - * rely on phase mismatch and EOP interrupts to determine end - * of phase. - * - * UNSAFE - leave interrupts enabled during pseudo-DMA transfers. You - * only really want to use this if you're having a problem with - * dropped characters during high speed communications, and even - * then, you're going to be better off twiddling with transfersize - * in the high level code. - * - * Defaults for these will be provided although the user may want to adjust - * these to allocate CPU resources to the SCSI driver or "real" code. - * - * USLEEP_SLEEP - amount of time, in jiffies, to sleep - * - * USLEEP_POLL - amount of time, in jiffies, to poll + * rely on phase mismatch and EOP interrupts to determine end + * of phase. * * These macros MUST be defined : - * NCR5380_local_declare() - declare any local variables needed for your - * transfer routines. * - * NCR5380_setup(instance) - initialize any local variables needed from a given - * instance of the host adapter for NCR5380_{read,write,pread,pwrite} - * * NCR5380_read(register) - read from the specified register * - * NCR5380_write(register, value) - write to the specific register + * NCR5380_write(register, value) - write to the specific register * - * NCR5380_implementation_fields - additional fields needed for this - * specific implementation of the NCR5380 + * NCR5380_implementation_fields - additional fields needed for this + * specific implementation of the NCR5380 * * Either real DMA *or* pseudo DMA may be implemented - * REAL functions : + * REAL functions : * NCR5380_REAL_DMA should be defined if real DMA is to be used. - * Note that the DMA setup functions should return the number of bytes - * that they were able to program the controller for. + * Note that the DMA setup functions should return the number of bytes + * that they were able to program the controller for. * - * Also note that generic i386/PC versions of these macros are - * available as NCR5380_i386_dma_write_setup, - * NCR5380_i386_dma_read_setup, and NCR5380_i386_dma_residual. + * Also note that generic i386/PC versions of these macros are + * available as NCR5380_i386_dma_write_setup, + * NCR5380_i386_dma_read_setup, and NCR5380_i386_dma_residual. * * NCR5380_dma_write_setup(instance, src, count) - initialize * NCR5380_dma_read_setup(instance, dst, count) - initialize @@ -262,25 +165,25 @@ * NCR5380_pread(instance, dst, count); * * The generic driver is initialized by calling NCR5380_init(instance), - * after setting the appropriate host specific fields and ID. If the + * after setting the appropriate host specific fields and ID. If the * driver wishes to autoprobe for an IRQ line, the NCR5380_probe_irq(instance, * possible) function may be used. */ -static int do_abort(struct Scsi_Host *host); -static void do_reset(struct Scsi_Host *host); +static int do_abort(struct Scsi_Host *); +static void do_reset(struct Scsi_Host *); -/* - * initialize_SCp - init the scsi pointer field - * @cmd: command block to set up +/** + * initialize_SCp - init the scsi pointer field + * @cmd: command block to set up * - * Set up the internal fields in the SCSI command. + * Set up the internal fields in the SCSI command. */ static inline void initialize_SCp(struct scsi_cmnd *cmd) { - /* - * Initialize the Scsi Pointer field so that all of the commands in the + /* + * Initialize the Scsi Pointer field so that all of the commands in the * various queues are valid. 
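 * (In this driver the Scsi Pointer fields double as scatter/gather
 * bookkeeping: SCp.ptr and SCp.this_residual track the position in and
 * bytes remaining of the current segment, while SCp.buffer and
 * SCp.buffers_residual walk the segments still to come.)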
*/ @@ -295,120 +198,123 @@ static inline void initialize_SCp(struct scsi_cmnd *cmd) cmd->SCp.ptr = NULL; cmd->SCp.this_residual = 0; } + + cmd->SCp.Status = 0; + cmd->SCp.Message = 0; } /** - * NCR5380_poll_politely - wait for NCR5380 status bits - * @instance: controller to poll - * @reg: 5380 register to poll - * @bit: Bitmask to check - * @val: Value required to exit - * - * Polls the NCR5380 in a reasonably efficient manner waiting for - * an event to occur, after a short quick poll we begin giving the - * CPU back in non IRQ contexts - * - * Returns the value of the register or a negative error code. + * NCR5380_poll_politely2 - wait for two chip register values + * @instance: controller to poll + * @reg1: 5380 register to poll + * @bit1: Bitmask to check + * @val1: Expected value + * @reg2: Second 5380 register to poll + * @bit2: Second bitmask to check + * @val2: Second expected value + * @wait: Time-out in jiffies + * + * Polls the chip in a reasonably efficient manner waiting for an + * event to occur. After a short quick poll we begin to yield the CPU + * (if possible). In irq contexts the time-out is arbitrarily limited. + * Callers may hold locks as long as they are held in irq mode. + * + * Returns 0 if either or both event(s) occurred otherwise -ETIMEDOUT. */ - -static int NCR5380_poll_politely(struct Scsi_Host *instance, int reg, int bit, int val, int t) + +static int NCR5380_poll_politely2(struct Scsi_Host *instance, + int reg1, int bit1, int val1, + int reg2, int bit2, int val2, int wait) { - NCR5380_local_declare(); - int n = 500; /* At about 8uS a cycle for the cpu access */ - unsigned long end = jiffies + t; - int r; - - NCR5380_setup(instance); - - while( n-- > 0) - { - r = NCR5380_read(reg); - if((r & bit) == val) + struct NCR5380_hostdata *hostdata = shost_priv(instance); + unsigned long deadline = jiffies + wait; + unsigned long n; + + /* Busy-wait for up to 10 ms */ + n = min(10000U, jiffies_to_usecs(wait)); + n *= hostdata->accesses_per_ms; + n /= 2000; + do { + if ((NCR5380_read(reg1) & bit1) == val1) + return 0; + if ((NCR5380_read(reg2) & bit2) == val2) return 0; cpu_relax(); - } - - /* t time yet ? 
*/ - while(time_before(jiffies, end)) - { - r = NCR5380_read(reg); - if((r & bit) == val) + } while (n--); + + if (irqs_disabled() || in_interrupt()) + return -ETIMEDOUT; + + /* Repeatedly sleep for 1 ms until deadline */ + while (time_is_after_jiffies(deadline)) { + schedule_timeout_uninterruptible(1); + if ((NCR5380_read(reg1) & bit1) == val1) + return 0; + if ((NCR5380_read(reg2) & bit2) == val2) return 0; - if(!in_interrupt()) - cond_resched(); - else - cpu_relax(); } + return -ETIMEDOUT; } -static struct { - unsigned char value; - const char *name; -} phases[] __maybe_unused = { - {PHASE_DATAOUT, "DATAOUT"}, - {PHASE_DATAIN, "DATAIN"}, - {PHASE_CMDOUT, "CMDOUT"}, - {PHASE_STATIN, "STATIN"}, - {PHASE_MSGOUT, "MSGOUT"}, - {PHASE_MSGIN, "MSGIN"}, - {PHASE_UNKNOWN, "UNKNOWN"} -}; +static inline int NCR5380_poll_politely(struct Scsi_Host *instance, + int reg, int bit, int val, int wait) +{ + return NCR5380_poll_politely2(instance, reg, bit, val, + reg, bit, val, wait); +} #if NDEBUG static struct { unsigned char mask; const char *name; -} signals[] = { - {SR_DBP, "PARITY"}, - {SR_RST, "RST"}, - {SR_BSY, "BSY"}, - {SR_REQ, "REQ"}, - {SR_MSG, "MSG"}, - {SR_CD, "CD"}, - {SR_IO, "IO"}, - {SR_SEL, "SEL"}, +} signals[] = { + {SR_DBP, "PARITY"}, + {SR_RST, "RST"}, + {SR_BSY, "BSY"}, + {SR_REQ, "REQ"}, + {SR_MSG, "MSG"}, + {SR_CD, "CD"}, + {SR_IO, "IO"}, + {SR_SEL, "SEL"}, {0, NULL} -}, +}, basrs[] = { - {BASR_ATN, "ATN"}, - {BASR_ACK, "ACK"}, + {BASR_ATN, "ATN"}, + {BASR_ACK, "ACK"}, {0, NULL} -}, -icrs[] = { - {ICR_ASSERT_RST, "ASSERT RST"}, - {ICR_ASSERT_ACK, "ASSERT ACK"}, - {ICR_ASSERT_BSY, "ASSERT BSY"}, - {ICR_ASSERT_SEL, "ASSERT SEL"}, - {ICR_ASSERT_ATN, "ASSERT ATN"}, - {ICR_ASSERT_DATA, "ASSERT DATA"}, +}, +icrs[] = { + {ICR_ASSERT_RST, "ASSERT RST"}, + {ICR_ASSERT_ACK, "ASSERT ACK"}, + {ICR_ASSERT_BSY, "ASSERT BSY"}, + {ICR_ASSERT_SEL, "ASSERT SEL"}, + {ICR_ASSERT_ATN, "ASSERT ATN"}, + {ICR_ASSERT_DATA, "ASSERT DATA"}, {0, NULL} -}, -mrs[] = { - {MR_BLOCK_DMA_MODE, "MODE BLOCK DMA"}, - {MR_TARGET, "MODE TARGET"}, - {MR_ENABLE_PAR_CHECK, "MODE PARITY CHECK"}, - {MR_ENABLE_PAR_INTR, "MODE PARITY INTR"}, - {MR_MONITOR_BSY, "MODE MONITOR BSY"}, - {MR_DMA_MODE, "MODE DMA"}, - {MR_ARBITRATE, "MODE ARBITRATION"}, +}, +mrs[] = { + {MR_BLOCK_DMA_MODE, "MODE BLOCK DMA"}, + {MR_TARGET, "MODE TARGET"}, + {MR_ENABLE_PAR_CHECK, "MODE PARITY CHECK"}, + {MR_ENABLE_PAR_INTR, "MODE PARITY INTR"}, + {MR_ENABLE_EOP_INTR, "MODE EOP INTR"}, + {MR_MONITOR_BSY, "MODE MONITOR BSY"}, + {MR_DMA_MODE, "MODE DMA"}, + {MR_ARBITRATE, "MODE ARBITRATION"}, {0, NULL} }; /** - * NCR5380_print - print scsi bus signals - * @instance: adapter state to dump - * - * Print the SCSI bus signals for debugging purposes + * NCR5380_print - print scsi bus signals + * @instance: adapter state to dump * - * Locks: caller holds hostdata lock (not essential) + * Print the SCSI bus signals for debugging purposes */ static void NCR5380_print(struct Scsi_Host *instance) { - NCR5380_local_declare(); unsigned char status, data, basr, mr, icr, i; - NCR5380_setup(instance); data = NCR5380_read(CURRENT_SCSI_DATA_REG); status = NCR5380_read(STATUS_REG); @@ -435,117 +341,56 @@ static void NCR5380_print(struct Scsi_Host *instance) printk("\n"); } +static struct { + unsigned char value; + const char *name; +} phases[] = { + {PHASE_DATAOUT, "DATAOUT"}, + {PHASE_DATAIN, "DATAIN"}, + {PHASE_CMDOUT, "CMDOUT"}, + {PHASE_STATIN, "STATIN"}, + {PHASE_MSGOUT, "MSGOUT"}, + {PHASE_MSGIN, "MSGIN"}, + {PHASE_UNKNOWN, "UNKNOWN"} +}; -/* - * NCR5380_print_phase 
- show SCSI phase - * @instance: adapter to dump - * - * Print the current SCSI phase for debugging purposes +/** + * NCR5380_print_phase - show SCSI phase + * @instance: adapter to dump * - * Locks: none + * Print the current SCSI phase for debugging purposes */ static void NCR5380_print_phase(struct Scsi_Host *instance) { - NCR5380_local_declare(); unsigned char status; int i; - NCR5380_setup(instance); status = NCR5380_read(STATUS_REG); if (!(status & SR_REQ)) - printk("scsi%d : REQ not asserted, phase unknown.\n", instance->host_no); + shost_printk(KERN_DEBUG, instance, "REQ not asserted, phase unknown.\n"); else { - for (i = 0; (phases[i].value != PHASE_UNKNOWN) && (phases[i].value != (status & PHASE_MASK)); ++i); - printk("scsi%d : phase %s\n", instance->host_no, phases[i].name); + for (i = 0; (phases[i].value != PHASE_UNKNOWN) && + (phases[i].value != (status & PHASE_MASK)); ++i) + ; + shost_printk(KERN_DEBUG, instance, "phase %s\n", phases[i].name); } } #endif -/* - * These need tweaking, and would probably work best as per-device - * flags initialized differently for disk, tape, cd, etc devices. - * People with broken devices are free to experiment as to what gives - * the best results for them. - * - * USLEEP_SLEEP should be a minimum seek time. - * - * USLEEP_POLL should be a maximum rotational latency. - */ -#ifndef USLEEP_SLEEP -/* 20 ms (reasonable hard disk speed) */ -#define USLEEP_SLEEP msecs_to_jiffies(20) -#endif -/* 300 RPM (floppy speed) */ -#ifndef USLEEP_POLL -#define USLEEP_POLL msecs_to_jiffies(200) -#endif -#ifndef USLEEP_WAITLONG -/* RvC: (reasonable time to wait on select error) */ -#define USLEEP_WAITLONG USLEEP_SLEEP -#endif -/* - * Function : int should_disconnect (unsigned char cmd) - * - * Purpose : decide whether a command would normally disconnect or - * not, since if it won't disconnect we should go to sleep. - * - * Input : cmd - opcode of SCSI command - * - * Returns : DISCONNECT_LONG if we should disconnect for a really long - * time (ie always, sleep, look for REQ active, sleep), - * DISCONNECT_TIME_TO_DATA if we would only disconnect for a normal - * time-to-data delay, DISCONNECT_NONE if this command would return - * immediately. - * - * Future sleep algorithms based on time to data can exploit - * something like this so they can differentiate between "normal" - * (ie, read, write, seek) and unusual commands (ie, * format). - * - * Note : We don't deal with commands that handle an immediate disconnect, - * - */ - -static int should_disconnect(unsigned char cmd) -{ - switch (cmd) { - case READ_6: - case WRITE_6: - case SEEK_6: - case READ_10: - case WRITE_10: - case SEEK_10: - return DISCONNECT_TIME_TO_DATA; - case FORMAT_UNIT: - case SEARCH_HIGH: - case SEARCH_LOW: - case SEARCH_EQUAL: - return DISCONNECT_LONG; - default: - return DISCONNECT_NONE; - } -} - -static void NCR5380_set_timer(struct NCR5380_hostdata *hostdata, unsigned long timeout) -{ - hostdata->time_expires = jiffies + timeout; - schedule_delayed_work(&hostdata->coroutine, timeout); -} - - -static int probe_irq __initdata = 0; +static int probe_irq __initdata; /** - * probe_intr - helper for IRQ autoprobe - * @irq: interrupt number - * @dev_id: unused - * @regs: unused + * probe_intr - helper for IRQ autoprobe + * @irq: interrupt number + * @dev_id: unused + * @regs: unused * - * Set a flag to indicate the IRQ in question was received. This is - * used by the IRQ probe code. + * Set a flag to indicate the IRQ in question was received. This is + * used by the IRQ probe code. 
*/ - + static irqreturn_t __init probe_intr(int irq, void *dev_id) { probe_irq = irq; @@ -553,24 +398,20 @@ static irqreturn_t __init probe_intr(int irq, void *dev_id) } /** - * NCR5380_probe_irq - find the IRQ of an NCR5380 - * @instance: NCR5380 controller - * @possible: bitmask of ISA IRQ lines + * NCR5380_probe_irq - find the IRQ of an NCR5380 + * @instance: NCR5380 controller + * @possible: bitmask of ISA IRQ lines * - * Autoprobe for the IRQ line used by the NCR5380 by triggering an IRQ - * and then looking to see what interrupt actually turned up. - * - * Locks: none, irqs must be enabled on entry + * Autoprobe for the IRQ line used by the NCR5380 by triggering an IRQ + * and then looking to see what interrupt actually turned up. */ static int __init __maybe_unused NCR5380_probe_irq(struct Scsi_Host *instance, int possible) { - NCR5380_local_declare(); - struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) instance->hostdata; + struct NCR5380_hostdata *hostdata = shost_priv(instance); unsigned long timeout; int trying_irqs, i, mask; - NCR5380_setup(instance); for (trying_irqs = 0, i = 1, mask = 2; i < 16; ++i, mask <<= 1) if ((mask & possible) && (request_irq(i, &probe_intr, 0, "NCR-probe", NULL) == 0)) @@ -581,7 +422,7 @@ static int __init __maybe_unused NCR5380_probe_irq(struct Scsi_Host *instance, /* * An interrupt is triggered whenever BSY = false, SEL = true - * and a bit set in the SELECT_ENABLE_REG is asserted on the + * and a bit set in the SELECT_ENABLE_REG is asserted on the * SCSI bus. * * Note that the bus is only driven when the phase control signals @@ -596,7 +437,7 @@ static int __init __maybe_unused NCR5380_probe_irq(struct Scsi_Host *instance, while (probe_irq == NO_IRQ && time_before(jiffies, timeout)) schedule_timeout_uninterruptible(1); - + NCR5380_write(SELECT_ENABLE_REG, 0); NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); @@ -608,12 +449,10 @@ } /** - * NCR58380_info - report driver and host information - * @instance: relevant scsi host instance - * - * For use as the host template info() handler. - * - * Locks: none + * NCR5380_info - report driver and host information + * @instance: relevant scsi host instance + * + * For use as the host template info() handler. */ static const char *NCR5380_info(struct Scsi_Host *instance) @@ -633,20 +472,14 @@ static void prepare_info(struct Scsi_Host *instance) "can_queue %d, cmd_per_lun %d, " "sg_tablesize %d, this_id %d, " "flags { %s%s%s}, " -#if defined(USLEEP_POLL) && defined(USLEEP_WAITLONG) - "USLEEP_POLL %lu, USLEEP_WAITLONG %lu, " -#endif "options { %s} ", instance->hostt->name, instance->io_port, instance->n_io_port, instance->base, instance->irq, instance->can_queue, instance->cmd_per_lun, instance->sg_tablesize, instance->this_id, - hostdata->flags & FLAG_NCR53C400 ? "NCR53C400 " : "", - hostdata->flags & FLAG_DTC3181E ? "DTC3181E " : "", + hostdata->flags & FLAG_NO_DMA_FIXUP ? "NO_DMA_FIXUP " : "", hostdata->flags & FLAG_NO_PSEUDO_DMA ? "NO_PSEUDO_DMA " : "", -#if defined(USLEEP_POLL) && defined(USLEEP_WAITLONG) - USLEEP_POLL, USLEEP_WAITLONG, -#endif + hostdata->flags & FLAG_TOSHIBA_DELAY ?
"TOSHIBA_DELAY " : "", #ifdef AUTOPROBE_IRQ "AUTOPROBE_IRQ " #endif @@ -665,46 +498,10 @@ static void prepare_info(struct Scsi_Host *instance) #ifdef PSEUDO_DMA "PSEUDO_DMA " #endif -#ifdef UNSAFE - "UNSAFE " -#endif -#ifdef NCR53C400 - "NCR53C400 " -#endif ""); } -/** - * NCR5380_print_status - dump controller info - * @instance: controller to dump - * - * Print commands in the various queues, called from NCR5380_abort - * and NCR5380_debug to aid debugging. - * - * Locks: called functions disable irqs - */ - -static void NCR5380_print_status(struct Scsi_Host *instance) -{ - NCR5380_dprint(NDEBUG_ANY, instance); - NCR5380_dprint_phase(NDEBUG_ANY, instance); -} - #ifdef PSEUDO_DMA -/******************************************/ -/* - * /proc/scsi/[dtc pas16 t128 generic]/[0-ASC_NUM_BOARD_SUPPORTED] - * - * *buffer: I/O buffer - * **start: if inout == FALSE pointer into buffer where user read should start - * offset: current offset - * length: length of buffer - * hostno: Scsi_Host host_no - * inout: TRUE - user is writing; FALSE - user is reading - * - * Return the number of bytes read from or written - */ - static int __maybe_unused NCR5380_write_info(struct Scsi_Host *instance, char *buffer, int length) { @@ -714,104 +511,41 @@ static int __maybe_unused NCR5380_write_info(struct Scsi_Host *instance, hostdata->spin_max_w = 0; return 0; } -#endif - -static -void lprint_Scsi_Cmnd(struct scsi_cmnd *cmd, struct seq_file *m); -static -void lprint_command(unsigned char *cmd, struct seq_file *m); -static -void lprint_opcode(int opcode, struct seq_file *m); static int __maybe_unused NCR5380_show_info(struct seq_file *m, - struct Scsi_Host *instance) + struct Scsi_Host *instance) { - struct NCR5380_hostdata *hostdata; - struct scsi_cmnd *ptr; - - hostdata = (struct NCR5380_hostdata *) instance->hostdata; + struct NCR5380_hostdata *hostdata = shost_priv(instance); -#ifdef PSEUDO_DMA seq_printf(m, "Highwater I/O busy spin counts: write %d, read %d\n", hostdata->spin_max_w, hostdata->spin_max_r); -#endif - spin_lock_irq(instance->host_lock); - if (!hostdata->connected) - seq_printf(m, "scsi%d: no currently connected command\n", instance->host_no); - else - lprint_Scsi_Cmnd((struct scsi_cmnd *) hostdata->connected, m); - seq_printf(m, "scsi%d: issue_queue\n", instance->host_no); - for (ptr = (struct scsi_cmnd *) hostdata->issue_queue; ptr; ptr = (struct scsi_cmnd *) ptr->host_scribble) - lprint_Scsi_Cmnd(ptr, m); - - seq_printf(m, "scsi%d: disconnected_queue\n", instance->host_no); - for (ptr = (struct scsi_cmnd *) hostdata->disconnected_queue; ptr; ptr = (struct scsi_cmnd *) ptr->host_scribble) - lprint_Scsi_Cmnd(ptr, m); - spin_unlock_irq(instance->host_lock); return 0; } - -static void lprint_Scsi_Cmnd(struct scsi_cmnd *cmd, struct seq_file *m) -{ - seq_printf(m, "scsi%d : destination target %d, lun %llu\n", cmd->device->host->host_no, cmd->device->id, cmd->device->lun); - seq_puts(m, " command = "); - lprint_command(cmd->cmnd, m); -} - -static void lprint_command(unsigned char *command, struct seq_file *m) -{ - int i, s; - lprint_opcode(command[0], m); - for (i = 1, s = COMMAND_SIZE(command[0]); i < s; ++i) - seq_printf(m, "%02x ", command[i]); - seq_putc(m, '\n'); -} - -static void lprint_opcode(int opcode, struct seq_file *m) -{ - seq_printf(m, "%2d (0x%02x)", opcode, opcode); -} - +#endif /** - * NCR5380_init - initialise an NCR5380 - * @instance: adapter to configure - * @flags: control flags + * NCR5380_init - initialise an NCR5380 + * @instance: adapter to configure + * @flags: control flags * 
- * Initializes *instance and corresponding 5380 chip, - * with flags OR'd into the initial flags value. + * Initializes *instance and corresponding 5380 chip, + * with flags OR'd into the initial flags value. * - * Notes : I assume that the host, hostno, and id bits have been - * set correctly. I don't care about the irq and other fields. + * Notes : I assume that the host, hostno, and id bits have been + * set correctly. I don't care about the irq and other fields. * - * Returns 0 for success - * - * Locks: interrupts must be enabled when we are called + * Returns 0 for success */ static int NCR5380_init(struct Scsi_Host *instance, int flags) { - NCR5380_local_declare(); - int i, pass; - unsigned long timeout; - struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) instance->hostdata; - - if(in_interrupt()) - printk(KERN_ERR "NCR5380_init called with interrupts off!\n"); - /* - * On NCR53C400 boards, NCR5380 registers are mapped 8 past - * the base address. - */ - -#ifdef NCR53C400 - if (flags & FLAG_NCR53C400) - instance->NCR5380_instance_name += NCR53C400_address_adjust; -#endif - - NCR5380_setup(instance); + struct NCR5380_hostdata *hostdata = shost_priv(instance); + int i; + unsigned long deadline; - hostdata->aborted = 0; + hostdata->host = instance; hostdata->id_mask = 1 << instance->this_id; + hostdata->id_higher_mask = 0; for (i = hostdata->id_mask; i <= 0x80; i <<= 1) if (i > hostdata->id_mask) hostdata->id_higher_mask |= i; @@ -820,21 +554,21 @@ static int NCR5380_init(struct Scsi_Host *instance, int flags) #ifdef REAL_DMA hostdata->dmalen = 0; #endif - hostdata->targets_present = 0; + spin_lock_init(&hostdata->lock); hostdata->connected = NULL; - hostdata->issue_queue = NULL; - hostdata->disconnected_queue = NULL; - - INIT_DELAYED_WORK(&hostdata->coroutine, NCR5380_main); - - /* The CHECK code seems to break the 53C400. Will check it later maybe */ - if (flags & FLAG_NCR53C400) - hostdata->flags = FLAG_HAS_LAST_BYTE_SENT | flags; - else - hostdata->flags = FLAG_CHECK_LAST_BYTE_SENT | flags; + hostdata->sensing = NULL; + INIT_LIST_HEAD(&hostdata->autosense); + INIT_LIST_HEAD(&hostdata->unissued); + INIT_LIST_HEAD(&hostdata->disconnected); - hostdata->host = instance; - hostdata->time_expires = 0; + hostdata->flags = flags; + + INIT_WORK(&hostdata->main_task, NCR5380_main); + hostdata->work_q = alloc_workqueue("ncr5380_%d", + WQ_UNBOUND | WQ_MEM_RECLAIM, + 1, instance->host_no); + if (!hostdata->work_q) + return -ENOMEM; prepare_info(instance); @@ -843,43 +577,69 @@ static int NCR5380_init(struct Scsi_Host *instance, int flags) NCR5380_write(TARGET_COMMAND_REG, 0); NCR5380_write(SELECT_ENABLE_REG, 0); -#ifdef NCR53C400 - if (hostdata->flags & FLAG_NCR53C400) { - NCR5380_write(C400_CONTROL_STATUS_REG, CSR_BASE); - } -#endif + /* Calibrate register polling loop */ + i = 0; + deadline = jiffies + 1; + do { + cpu_relax(); + } while (time_is_after_jiffies(deadline)); + deadline += msecs_to_jiffies(256); + do { + NCR5380_read(STATUS_REG); + ++i; + cpu_relax(); + } while (time_is_after_jiffies(deadline)); + hostdata->accesses_per_ms = i / 256; - /* - * Detect and correct bus wedge problems. - * - * If the system crashed, it may have crashed in a state - * where a SCSI command was still executing, and the - * SCSI bus is not in a BUS FREE STATE. 
- * - * If this is the case, we'll try to abort the currently - * established nexus which we know nothing about, and that - * failing, do a hard reset of the SCSI bus - */ + return 0; +} + +/** + * NCR5380_maybe_reset_bus - Detect and correct bus wedge problems. + * @instance: adapter to check + * + * If the system crashed, it may have crashed with a connected target and + * the SCSI bus busy. Check for BUS FREE phase. If not, try to abort the + * currently established nexus, which we know nothing about. Failing that + * do a bus reset. + * + * Note that a bus reset will cause the chip to assert IRQ. + * + * Returns 0 if successful, otherwise -ENXIO. + */ + +static int NCR5380_maybe_reset_bus(struct Scsi_Host *instance) +{ + struct NCR5380_hostdata *hostdata = shost_priv(instance); + int pass; for (pass = 1; (NCR5380_read(STATUS_REG) & SR_BSY) && pass <= 6; ++pass) { switch (pass) { case 1: case 3: case 5: - printk(KERN_INFO "scsi%d: SCSI bus busy, waiting up to five seconds\n", instance->host_no); - timeout = jiffies + 5 * HZ; - NCR5380_poll_politely(instance, STATUS_REG, SR_BSY, 0, 5*HZ); + shost_printk(KERN_ERR, instance, "SCSI bus busy, waiting up to five seconds\n"); + NCR5380_poll_politely(instance, + STATUS_REG, SR_BSY, 0, 5 * HZ); break; case 2: - printk(KERN_WARNING "scsi%d: bus busy, attempting abort\n", instance->host_no); + shost_printk(KERN_ERR, instance, "bus busy, attempting abort\n"); do_abort(instance); break; case 4: - printk(KERN_WARNING "scsi%d: bus busy, attempting reset\n", instance->host_no); + shost_printk(KERN_ERR, instance, "bus busy, attempting reset\n"); do_reset(instance); + /* Wait after a reset; the SCSI standard calls for + * 250ms, we wait 500ms to be on the safe side. + * But some Toshiba CD-ROMs need ten times that. + */ + if (hostdata->flags & FLAG_TOSHIBA_DELAY) + msleep(2500); + else + msleep(500); break; case 6: - printk(KERN_ERR "scsi%d: bus locked solid or invalid override\n", instance->host_no); + shost_printk(KERN_ERR, instance, "bus locked solid\n"); return -ENXIO; } } @@ -887,450 +647,513 @@ static int NCR5380_init(struct Scsi_Host *instance, int flags) } /** - * NCR5380_exit - remove an NCR5380 - * @instance: adapter to remove + * NCR5380_exit - remove an NCR5380 + * @instance: adapter to remove + * + * Assumes that no more work can be queued (e.g. by NCR5380_intr). */ static void NCR5380_exit(struct Scsi_Host *instance) { - struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) instance->hostdata; + struct NCR5380_hostdata *hostdata = shost_priv(instance); - cancel_delayed_work_sync(&hostdata->coroutine); + cancel_work_sync(&hostdata->main_task); + destroy_workqueue(hostdata->work_q); } /** - * NCR5380_queue_command - queue a command - * @cmd: SCSI command - * @done: completion handler - * - * cmd is added to the per instance issue_queue, with minor - * twiddling done to the host specific fields of cmd. If the - * main coroutine is not running, it is restarted. 
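The queue rework below strings commands onto list_heads kept in a per-command private area instead of chaining host_scribble pointers. A minimal sketch of that container pattern, assuming the private data sits directly behind the scsi_cmnd as scsi_cmd_priv() arranges (the real NCR5380.h definitions may differ in detail):

	struct NCR5380_cmd {
		struct list_head list;
	};

	#define NCR5380_CMD_SIZE	(sizeof(struct NCR5380_cmd))

	/* Inverse of scsi_cmd_priv(), which returns the memory just
	 * past the struct scsi_cmnd itself. */
	static inline struct scsi_cmnd *NCR5380_to_scmd(struct NCR5380_cmd *ncmd)
	{
		return ((struct scsi_cmnd *)ncmd) - 1;
	}

A host template then only needs to declare cmd_size = NCR5380_CMD_SIZE and the midlayer allocates the extra space along with every command.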
+ * complete_cmd - finish processing a command and return it to the SCSI ML + * @instance: the host instance + * @cmd: command to complete + */ + +static void complete_cmd(struct Scsi_Host *instance, + struct scsi_cmnd *cmd) +{ + struct NCR5380_hostdata *hostdata = shost_priv(instance); + + dsprintk(NDEBUG_QUEUES, instance, "complete_cmd: cmd %p\n", cmd); + + if (hostdata->sensing == cmd) { + /* Autosense processing ends here */ + if ((cmd->result & 0xff) != SAM_STAT_GOOD) { + scsi_eh_restore_cmnd(cmd, &hostdata->ses); + set_host_byte(cmd, DID_ERROR); + } else + scsi_eh_restore_cmnd(cmd, &hostdata->ses); + hostdata->sensing = NULL; + } + + hostdata->busy[scmd_id(cmd)] &= ~(1 << cmd->device->lun); + + cmd->scsi_done(cmd); +} + +/** + * NCR5380_queue_command - queue a command + * @instance: the relevant SCSI adapter + * @cmd: SCSI command * - * Locks: host lock taken by caller + * cmd is added to the per-instance issue queue, with minor + * twiddling done to the host specific fields of cmd. If the + * main coroutine is not running, it is restarted. */ -static int NCR5380_queue_command_lck(struct scsi_cmnd *cmd, void (*done) (struct scsi_cmnd *)) +static int NCR5380_queue_command(struct Scsi_Host *instance, + struct scsi_cmnd *cmd) { - struct Scsi_Host *instance = cmd->device->host; - struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) instance->hostdata; - struct scsi_cmnd *tmp; + struct NCR5380_hostdata *hostdata = shost_priv(instance); + struct NCR5380_cmd *ncmd = scsi_cmd_priv(cmd); + unsigned long flags; #if (NDEBUG & NDEBUG_NO_WRITE) switch (cmd->cmnd[0]) { case WRITE_6: case WRITE_10: - printk("scsi%d : WRITE attempted with NO_WRITE debugging flag set\n", instance->host_no); + shost_printk(KERN_DEBUG, instance, "WRITE attempted with NDEBUG_NO_WRITE set\n"); cmd->result = (DID_ERROR << 16); - done(cmd); + cmd->scsi_done(cmd); return 0; } -#endif /* (NDEBUG & NDEBUG_NO_WRITE) */ - - /* - * We use the host_scribble field as a pointer to the next command - * in a queue - */ +#endif /* (NDEBUG & NDEBUG_NO_WRITE) */ - cmd->host_scribble = NULL; - cmd->scsi_done = done; cmd->result = 0; - /* - * Insert the cmd into the issue queue. Note that REQUEST SENSE + spin_lock_irqsave(&hostdata->lock, flags); + + /* + * Insert the cmd into the issue queue. Note that REQUEST SENSE * commands are added to the head of the queue since any command will - * clear the contingent allegiance condition that exists and the + * clear the contingent allegiance condition that exists and the * sense data is only guaranteed to be valid while the condition exists. */ - if (!(hostdata->issue_queue) || (cmd->cmnd[0] == REQUEST_SENSE)) { - LIST(cmd, hostdata->issue_queue); - cmd->host_scribble = (unsigned char *) hostdata->issue_queue; - hostdata->issue_queue = cmd; - } else { - for (tmp = (struct scsi_cmnd *) hostdata->issue_queue; tmp->host_scribble; tmp = (struct scsi_cmnd *) tmp->host_scribble); - LIST(cmd, tmp); - tmp->host_scribble = (unsigned char *) cmd; - } - dprintk(NDEBUG_QUEUES, "scsi%d : command added to %s of queue\n", instance->host_no, (cmd->cmnd[0] == REQUEST_SENSE) ? "head" : "tail"); + if (cmd->cmnd[0] == REQUEST_SENSE) + list_add(&ncmd->list, &hostdata->unissued); + else + list_add_tail(&ncmd->list, &hostdata->unissued); + + spin_unlock_irqrestore(&hostdata->lock, flags); + + dsprintk(NDEBUG_QUEUES, instance, "command %p added to %s of queue\n", + cmd, (cmd->cmnd[0] == REQUEST_SENSE) ? "head" : "tail"); - /* Run the coroutine if it isn't already running. 
*/ /* Kick off command processing */ - schedule_delayed_work(&hostdata->coroutine, 0); + queue_work(hostdata->work_q, &hostdata->main_task); return 0; } -static DEF_SCSI_QCMD(NCR5380_queue_command) +/** + * dequeue_next_cmd - dequeue a command for processing + * @instance: the scsi host instance + * + * Priority is given to commands on the autosense queue. These commands + * need autosense because of a CHECK CONDITION result. + * + * Returns a command pointer if a command is found for a target that is + * not already busy. Otherwise returns NULL. + */ + +static struct scsi_cmnd *dequeue_next_cmd(struct Scsi_Host *instance) +{ + struct NCR5380_hostdata *hostdata = shost_priv(instance); + struct NCR5380_cmd *ncmd; + struct scsi_cmnd *cmd; + + if (list_empty(&hostdata->autosense)) { + list_for_each_entry(ncmd, &hostdata->unissued, list) { + cmd = NCR5380_to_scmd(ncmd); + dsprintk(NDEBUG_QUEUES, instance, "dequeue: cmd=%p target=%d busy=0x%02x lun=%llu\n", + cmd, scmd_id(cmd), hostdata->busy[scmd_id(cmd)], cmd->device->lun); + + if (!(hostdata->busy[scmd_id(cmd)] & (1 << cmd->device->lun))) { + list_del(&ncmd->list); + dsprintk(NDEBUG_QUEUES, instance, + "dequeue: removed %p from issue queue\n", cmd); + return cmd; + } + } + } else { + /* Autosense processing begins here */ + ncmd = list_first_entry(&hostdata->autosense, + struct NCR5380_cmd, list); + list_del(&ncmd->list); + cmd = NCR5380_to_scmd(ncmd); + dsprintk(NDEBUG_QUEUES, instance, + "dequeue: removed %p from autosense queue\n", cmd); + scsi_eh_prep_cmnd(cmd, &hostdata->ses, NULL, 0, ~0); + hostdata->sensing = cmd; + return cmd; + } + return NULL; +} + +static void requeue_cmd(struct Scsi_Host *instance, struct scsi_cmnd *cmd) +{ + struct NCR5380_hostdata *hostdata = shost_priv(instance); + struct NCR5380_cmd *ncmd = scsi_cmd_priv(cmd); + + if (hostdata->sensing) { + scsi_eh_restore_cmnd(cmd, &hostdata->ses); + list_add(&ncmd->list, &hostdata->autosense); + hostdata->sensing = NULL; + } else + list_add(&ncmd->list, &hostdata->unissued); +} /** - * NCR5380_main - NCR state machines - * - * NCR5380_main is a coroutine that runs as long as more work can - * be done on the NCR5380 host adapters in a system. Both - * NCR5380_queue_command() and NCR5380_intr() will try to start it - * in case it is not running. - * - * Locks: called as its own thread with no locks held. Takes the - * host lock and called routines may take the isa dma lock. + * NCR5380_main - NCR state machines + * + * NCR5380_main is a coroutine that runs as long as more work can + * be done on the NCR5380 host adapters in a system. Both + * NCR5380_queue_command() and NCR5380_intr() will try to start it + * in case it is not running. */ static void NCR5380_main(struct work_struct *work) { struct NCR5380_hostdata *hostdata = - container_of(work, struct NCR5380_hostdata, coroutine.work); + container_of(work, struct NCR5380_hostdata, main_task); struct Scsi_Host *instance = hostdata->host; - struct scsi_cmnd *tmp, *prev; + struct scsi_cmnd *cmd; int done; - - spin_lock_irq(instance->host_lock); + do { - /* Lock held here */ done = 1; - if (!hostdata->connected && !hostdata->selecting) { - dprintk(NDEBUG_MAIN, "scsi%d : not connected\n", instance->host_no); - /* - * Search through the issue_queue for a command destined - * for a target that's not busy. 
- */ - for (tmp = (struct scsi_cmnd *) hostdata->issue_queue, prev = NULL; tmp; prev = tmp, tmp = (struct scsi_cmnd *) tmp->host_scribble) - { - if (prev != tmp) - dprintk(NDEBUG_LISTS, "MAIN tmp=%p target=%d busy=%d lun=%llu\n", tmp, tmp->device->id, hostdata->busy[tmp->device->id], tmp->device->lun); - /* When we find one, remove it from the issue queue. */ - if (!(hostdata->busy[tmp->device->id] & - (1 << (u8)(tmp->device->lun & 0xff)))) { - if (prev) { - REMOVE(prev, prev->host_scribble, tmp, tmp->host_scribble); - prev->host_scribble = tmp->host_scribble; - } else { - REMOVE(-1, hostdata->issue_queue, tmp, tmp->host_scribble); - hostdata->issue_queue = (struct scsi_cmnd *) tmp->host_scribble; - } - tmp->host_scribble = NULL; - /* - * Attempt to establish an I_T_L nexus here. - * On success, instance->hostdata->connected is set. - * On failure, we must add the command back to the - * issue queue so we can keep trying. - */ - dprintk(NDEBUG_MAIN|NDEBUG_QUEUES, "scsi%d : main() : command for target %d lun %llu removed from issue_queue\n", instance->host_no, tmp->device->id, tmp->device->lun); - - /* - * A successful selection is defined as one that - * leaves us with the command connected and - * in hostdata->connected, OR has terminated the - * command. - * - * With successful commands, we fall through - * and see if we can do an information transfer, - * with failures we will restart. - */ - hostdata->selecting = NULL; - /* RvC: have to preset this to indicate a new command is being performed */ + spin_lock_irq(&hostdata->lock); + while (!hostdata->connected && + (cmd = dequeue_next_cmd(instance))) { - /* - * REQUEST SENSE commands are issued without tagged - * queueing, even on SCSI-II devices because the - * contingent allegiance condition exists for the - * entire unit. - */ + dsprintk(NDEBUG_MAIN, instance, "main: dequeued %p\n", cmd); - if (!NCR5380_select(instance, tmp)) { - break; - } else { - LIST(tmp, hostdata->issue_queue); - tmp->host_scribble = (unsigned char *) hostdata->issue_queue; - hostdata->issue_queue = tmp; - done = 0; - dprintk(NDEBUG_MAIN|NDEBUG_QUEUES, "scsi%d : main(): select() failed, returned to issue_queue\n", instance->host_no); - } - /* lock held here still */ - } /* if target/lun is not busy */ - } /* for */ - /* exited locked */ - } /* if (!hostdata->connected) */ - if (hostdata->selecting) { - tmp = (struct scsi_cmnd *) hostdata->selecting; - /* Selection will drop and retake the lock */ - if (!NCR5380_select(instance, tmp)) { - /* Ok ?? */ + /* + * Attempt to establish an I_T_L nexus here. + * On success, instance->hostdata->connected is set. + * On failure, we must add the command back to the + * issue queue so we can keep trying. + */ + /* + * REQUEST SENSE commands are issued without tagged + * queueing, even on SCSI-II devices because the + * contingent allegiance condition exists for the + * entire unit. 
+ */ + + cmd = NCR5380_select(instance, cmd); + if (!cmd) { + dsprintk(NDEBUG_MAIN, instance, "main: select complete\n"); } else { - /* RvC: device failed, so we wait a long time - this is needed for Mustek scanners, that - do not respond to commands immediately - after a scan */ - printk(KERN_DEBUG "scsi%d: device %d did not respond in time\n", instance->host_no, tmp->device->id); - LIST(tmp, hostdata->issue_queue); - tmp->host_scribble = (unsigned char *) hostdata->issue_queue; - hostdata->issue_queue = tmp; - NCR5380_set_timer(hostdata, USLEEP_WAITLONG); + dsprintk(NDEBUG_MAIN | NDEBUG_QUEUES, instance, + "main: select failed, returning %p to queue\n", cmd); + requeue_cmd(instance, cmd); } - } /* if hostdata->selecting */ + } if (hostdata->connected #ifdef REAL_DMA && !hostdata->dmalen #endif - && (!hostdata->time_expires || time_before_eq(hostdata->time_expires, jiffies)) ) { - dprintk(NDEBUG_MAIN, "scsi%d : main() : performing information transfer\n", instance->host_no); + dsprintk(NDEBUG_MAIN, instance, "main: performing information transfer\n"); NCR5380_information_transfer(instance); - dprintk(NDEBUG_MAIN, "scsi%d : main() : done set false\n", instance->host_no); done = 0; - } else - break; + } + spin_unlock_irq(&hostdata->lock); + if (!done) + cond_resched(); } while (!done); - - spin_unlock_irq(instance->host_lock); } #ifndef DONT_USE_INTR /** - * NCR5380_intr - generic NCR5380 irq handler - * @irq: interrupt number - * @dev_id: device info - * - * Handle interrupts, reestablishing I_T_L or I_T_L_Q nexuses - * from the disconnected queue, and restarting NCR5380_main() - * as required. - * - * Locks: takes the needed instance locks + * NCR5380_intr - generic NCR5380 irq handler + * @irq: interrupt number + * @dev_id: device info + * + * Handle interrupts, reestablishing I_T_L or I_T_L_Q nexuses + * from the disconnected queue, and restarting NCR5380_main() + * as required. + * + * The chip can assert IRQ in any of six different conditions. The IRQ flag + * is then cleared by reading the Reset Parity/Interrupt Register (RPIR). + * Three of these six conditions are latched in the Bus and Status Register: + * - End of DMA (cleared by ending DMA Mode) + * - Parity error (cleared by reading RPIR) + * - Loss of BSY (cleared by reading RPIR) + * Two conditions have flag bits that are not latched: + * - Bus phase mismatch (non-maskable in DMA Mode, cleared by ending DMA Mode) + * - Bus reset (non-maskable) + * The remaining condition has no flag bit at all: + * - Selection/reselection + * + * Hence, establishing the cause(s) of any interrupt is partly guesswork. + * In "The DP8490 and DP5380 Comparison Guide", National Semiconductor + * claimed that "the design of the [DP8490] interrupt logic ensures + * interrupts will not be lost (they can be on the DP5380)." + * The L5380/53C80 datasheet from LOGIC Devices has more details. + * + * Checking for bus reset by reading RST is futile because of interrupt + * latency, but a bus reset will reset chip logic. Checking for parity error + * is unnecessary because that interrupt is never enabled. A Loss of BSY + * condition will clear DMA Mode. We can tell when this occurs because the + * Busy Monitor interrupt is enabled together with DMA Mode.
*/ -static irqreturn_t NCR5380_intr(int dummy, void *dev_id) +static irqreturn_t NCR5380_intr(int irq, void *dev_id) { - NCR5380_local_declare(); struct Scsi_Host *instance = dev_id; - struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) instance->hostdata; - int done; + struct NCR5380_hostdata *hostdata = shost_priv(instance); + int handled = 0; unsigned char basr; unsigned long flags; - dprintk(NDEBUG_INTR, "scsi : NCR5380 irq %d triggered\n", - instance->irq); + spin_lock_irqsave(&hostdata->lock, flags); + + basr = NCR5380_read(BUS_AND_STATUS_REG); + if (basr & BASR_IRQ) { + unsigned char mr = NCR5380_read(MODE_REG); + unsigned char sr = NCR5380_read(STATUS_REG); + + dsprintk(NDEBUG_INTR, instance, "IRQ %d, BASR 0x%02x, SR 0x%02x, MR 0x%02x\n", + irq, basr, sr, mr); - do { - done = 1; - spin_lock_irqsave(instance->host_lock, flags); - /* Look for pending interrupts */ - NCR5380_setup(instance); - basr = NCR5380_read(BUS_AND_STATUS_REG); - /* XXX dispatch to appropriate routine if found and done=0 */ - if (basr & BASR_IRQ) { - NCR5380_dprint(NDEBUG_INTR, instance); - if ((NCR5380_read(STATUS_REG) & (SR_SEL | SR_IO)) == (SR_SEL | SR_IO)) { - done = 0; - dprintk(NDEBUG_INTR, "scsi%d : SEL interrupt\n", instance->host_no); - NCR5380_reselect(instance); - (void) NCR5380_read(RESET_PARITY_INTERRUPT_REG); - } else if (basr & BASR_PARITY_ERROR) { - dprintk(NDEBUG_INTR, "scsi%d : PARITY interrupt\n", instance->host_no); - (void) NCR5380_read(RESET_PARITY_INTERRUPT_REG); - } else if ((NCR5380_read(STATUS_REG) & SR_RST) == SR_RST) { - dprintk(NDEBUG_INTR, "scsi%d : RESET interrupt\n", instance->host_no); - (void) NCR5380_read(RESET_PARITY_INTERRUPT_REG); - } else { #if defined(REAL_DMA) - /* - * We should only get PHASE MISMATCH and EOP interrupts - * if we have DMA enabled, so do a sanity check based on - * the current setting of the MODE register. - */ + if ((mr & MR_DMA_MODE) || (mr & MR_MONITOR_BSY)) { + /* Probably End of DMA, Phase Mismatch or Loss of BSY. + * We ack IRQ after clearing Mode Register. Workarounds + * for End of DMA errata need to happen in DMA Mode. + */ - if ((NCR5380_read(MODE_REG) & MR_DMA) && ((basr & BASR_END_DMA_TRANSFER) || !(basr & BASR_PHASE_MATCH))) { - int transferred; + dsprintk(NDEBUG_INTR, instance, "interrupt in DMA mode\n"); - if (!hostdata->connected) - panic("scsi%d : received end of DMA interrupt with no connected cmd\n", instance->host_no); + int transferred; - transferred = (hostdata->dmalen - NCR5380_dma_residual(instance)); - hostdata->connected->SCp.this_residual -= transferred; - hostdata->connected->SCp.ptr += transferred; - hostdata->dmalen = 0; + if (!hostdata->connected) + panic("scsi%d : DMA interrupt with no connected cmd\n", + instance->host_no); - (void) NCR5380_read(RESET_PARITY_INTERRUPT_REG); - - /* FIXME: we need to poll briefly then defer a workqueue task ! */ - NCR5380_poll_politely(instance, BUS_AND_STATUS_REG, BASR_ACK, 0, 2*HZ); + transferred = hostdata->dmalen - NCR5380_dma_residual(instance); + hostdata->connected->SCp.this_residual -= transferred; + hostdata->connected->SCp.ptr += transferred; + hostdata->dmalen = 0; - NCR5380_write(MODE_REG, MR_BASE); - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - } -#else - dprintk(NDEBUG_INTR, "scsi : unknown interrupt, BASR 0x%X, MR 0x%X, SR 0x%x\n", basr, NCR5380_read(MODE_REG), NCR5380_read(STATUS_REG)); - (void) NCR5380_read(RESET_PARITY_INTERRUPT_REG); -#endif + /* FIXME: we need to poll briefly then defer a workqueue task !
*/ + NCR5380_poll_politely(instance, BUS_AND_STATUS_REG, BASR_ACK, 0, 2 * HZ); + + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); + NCR5380_write(MODE_REG, MR_BASE); + NCR5380_read(RESET_PARITY_INTERRUPT_REG); + } else +#endif /* REAL_DMA */ + if ((NCR5380_read(CURRENT_SCSI_DATA_REG) & hostdata->id_mask) && + (sr & (SR_SEL | SR_IO | SR_BSY | SR_RST)) == (SR_SEL | SR_IO)) { + /* Probably reselected */ + NCR5380_write(SELECT_ENABLE_REG, 0); + NCR5380_read(RESET_PARITY_INTERRUPT_REG); + + dsprintk(NDEBUG_INTR, instance, "interrupt with SEL and IO\n"); + + if (!hostdata->connected) { + NCR5380_reselect(instance); + queue_work(hostdata->work_q, &hostdata->main_task); } - } /* if BASR_IRQ */ - spin_unlock_irqrestore(instance->host_lock, flags); - if(!done) - schedule_delayed_work(&hostdata->coroutine, 0); - } while (!done); - return IRQ_HANDLED; + if (!hostdata->connected) + NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); + } else { + /* Probably Bus Reset */ + NCR5380_read(RESET_PARITY_INTERRUPT_REG); + + dsprintk(NDEBUG_INTR, instance, "unknown interrupt\n"); + } + handled = 1; + } else { + shost_printk(KERN_NOTICE, instance, "interrupt without IRQ bit\n"); + } + + spin_unlock_irqrestore(&hostdata->lock, flags); + + return IRQ_RETVAL(handled); } -#endif +#endif -/* +/* * Function : int NCR5380_select(struct Scsi_Host *instance, - * struct scsi_cmnd *cmd) + * struct scsi_cmnd *cmd) * * Purpose : establishes I_T_L or I_T_L_Q nexus for new or existing command, - * including ARBITRATION, SELECTION, and initial message out for - * IDENTIFY and queue messages. - * - * Inputs : instance - instantiation of the 5380 driver on which this - * target lives, cmd - SCSI command to execute. - * - * Returns : -1 if selection could not execute for some reason, - * 0 if selection succeeded or failed because the target - * did not respond. - * - * Side effects : - * If bus busy, arbitration failed, etc, NCR5380_select() will exit - * with registers as they should have been on entry - ie - * SELECT_ENABLE will be set appropriately, the NCR5380 - * will cease to drive any SCSI bus signals. - * - * If successful : I_T_L or I_T_L_Q nexus will be established, - * instance->connected will be set to cmd. - * SELECT interrupt will be disabled. - * - * If failed (no target) : cmd->scsi_done() will be called, and the - * cmd->result host byte set to DID_BAD_TARGET. - * - * Locks: caller holds hostdata lock in IRQ mode + * including ARBITRATION, SELECTION, and initial message out for + * IDENTIFY and queue messages. + * + * Inputs : instance - instantiation of the 5380 driver on which this + * target lives, cmd - SCSI command to execute. + * + * Returns cmd if selection failed but should be retried, + * NULL if selection failed and should not be retried, or + * NULL if selection succeeded (hostdata->connected == cmd). + * + * Side effects : + * If bus busy, arbitration failed, etc, NCR5380_select() will exit + * with registers as they should have been on entry - ie + * SELECT_ENABLE will be set appropriately, the NCR5380 + * will cease to drive any SCSI bus signals. + * + * If successful : I_T_L or I_T_L_Q nexus will be established, + * instance->connected will be set to cmd. + * SELECT interrupt will be disabled. + * + * If failed (no target) : cmd->scsi_done() will be called, and the + * cmd->result host byte set to DID_BAD_TARGET.
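When the interrupt handler above reestablishes a nexus it restarts the main loop with queue_work(hostdata->work_q, &hostdata->main_task), replacing the old schedule_delayed_work() coroutine. Neither field is initialized in this hunk; a sketch of what the init path presumably does (the workqueue name and flags here are guesses, and NCR5380_main() must then take a work_struct and recover its hostdata via container_of()):

        /* In NCR5380_init() or the board probe path - illustrative only. */
        INIT_WORK(&hostdata->main_task, NCR5380_main);
        hostdata->work_q = alloc_workqueue("ncr5380_%d",
                                           WQ_UNBOUND | WQ_MEM_RECLAIM,
                                           1, instance->host_no);
        if (!hostdata->work_q)
                return -ENOMEM;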
*/ - -static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd) + +static struct scsi_cmnd *NCR5380_select(struct Scsi_Host *instance, + struct scsi_cmnd *cmd) { - NCR5380_local_declare(); - struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) instance->hostdata; + struct NCR5380_hostdata *hostdata = shost_priv(instance); unsigned char tmp[3], phase; unsigned char *data; int len; - unsigned long timeout; - unsigned char value; int err; - NCR5380_setup(instance); - - if (hostdata->selecting) - goto part2; - - hostdata->restart_select = 0; NCR5380_dprint(NDEBUG_ARBITRATION, instance); - dprintk(NDEBUG_ARBITRATION, "scsi%d : starting arbitration, id = %d\n", instance->host_no, instance->this_id); + dsprintk(NDEBUG_ARBITRATION, instance, "starting arbitration, id = %d\n", + instance->this_id); + + /* + * Arbitration and selection phases are slow and involve dropping the + * lock, so we have to watch out for EH. An exception handler may + * change 'selecting' to NULL. This function will then return NULL + * so that the caller will forget about 'cmd'. (During information + * transfer phases, EH may change 'connected' to NULL.) + */ + hostdata->selecting = cmd; - /* - * Set the phase bits to 0, otherwise the NCR5380 won't drive the + /* + * Set the phase bits to 0, otherwise the NCR5380 won't drive the * data bus during SELECTION. */ NCR5380_write(TARGET_COMMAND_REG, 0); - /* + /* * Start arbitration. */ NCR5380_write(OUTPUT_DATA_REG, hostdata->id_mask); NCR5380_write(MODE_REG, MR_ARBITRATE); + /* The chip now waits for BUS FREE phase. Then after the 800 ns + * Bus Free Delay, arbitration will begin. + */ - /* We can be relaxed here, interrupts are on, we are - in workqueue context, the birds are singing in the trees */ - spin_unlock_irq(instance->host_lock); - err = NCR5380_poll_politely(instance, INITIATOR_COMMAND_REG, ICR_ARBITRATION_PROGRESS, ICR_ARBITRATION_PROGRESS, 5*HZ); - spin_lock_irq(instance->host_lock); + spin_unlock_irq(&hostdata->lock); + err = NCR5380_poll_politely2(instance, MODE_REG, MR_ARBITRATE, 0, + INITIATOR_COMMAND_REG, ICR_ARBITRATION_PROGRESS, + ICR_ARBITRATION_PROGRESS, HZ); + spin_lock_irq(&hostdata->lock); + if (!(NCR5380_read(MODE_REG) & MR_ARBITRATE)) { + /* Reselection interrupt */ + goto out; + } if (err < 0) { - printk(KERN_DEBUG "scsi: arbitration timeout at %d\n", __LINE__); NCR5380_write(MODE_REG, MR_BASE); - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - goto failed; + shost_printk(KERN_ERR, instance, + "select: arbitration timeout\n"); + goto out; } + spin_unlock_irq(&hostdata->lock); - dprintk(NDEBUG_ARBITRATION, "scsi%d : arbitration complete\n", instance->host_no); - - /* - * The arbitration delay is 2.2us, but this is a minimum and there is - * no maximum so we can safely sleep for ceil(2.2) usecs to accommodate - * the integral nature of udelay(). - * - */ - + /* The SCSI-2 arbitration delay is 2.4 us */ udelay(3); /* Check for lost arbitration */ - if ((NCR5380_read(INITIATOR_COMMAND_REG) & ICR_ARBITRATION_LOST) || (NCR5380_read(CURRENT_SCSI_DATA_REG) & hostdata->id_higher_mask) || (NCR5380_read(INITIATOR_COMMAND_REG) & ICR_ARBITRATION_LOST)) { - NCR5380_write(MODE_REG, MR_BASE); - dprintk(NDEBUG_ARBITRATION, "scsi%d : lost arbitration, deasserting MR_ARBITRATE\n", instance->host_no); - goto failed; - } - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_SEL); - - if (!(hostdata->flags & FLAG_DTC3181E) && - /* RvC: DTC3181E has some trouble with this - * so we simply removed it. 
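NCR5380_poll_politely() and the two-register NCR5380_poll_politely2() used above guard every wait in the rewritten driver: 0 means the masked bits matched, a negative value means timeout. Their bodies are outside this hunk; a single-register sketch of the contract the callers rely on (the real helper can also yield the CPU on long waits, which is why callers drop the lock first):

        /* Illustrative: poll until (reg & bit) == val or the wait expires. */
        static int NCR5380_poll_politely(struct Scsi_Host *instance,
                                         int reg, int bit, int val, int wait)
        {
                unsigned long deadline = jiffies + wait;

                do {
                        if ((NCR5380_read(reg) & bit) == val)
                                return 0;
                        cpu_relax();
                } while (time_before(jiffies, deadline));

                return -ETIMEDOUT;
        }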
Seems to work with - * only Mustek scanner attached - */ + if ((NCR5380_read(INITIATOR_COMMAND_REG) & ICR_ARBITRATION_LOST) || + (NCR5380_read(CURRENT_SCSI_DATA_REG) & hostdata->id_higher_mask) || (NCR5380_read(INITIATOR_COMMAND_REG) & ICR_ARBITRATION_LOST)) { NCR5380_write(MODE_REG, MR_BASE); - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - dprintk(NDEBUG_ARBITRATION, "scsi%d : lost arbitration, deasserting ICR_ASSERT_SEL\n", instance->host_no); - goto failed; + dsprintk(NDEBUG_ARBITRATION, instance, "lost arbitration, deasserting MR_ARBITRATE\n"); + spin_lock_irq(&hostdata->lock); + goto out; } - /* - * Again, bus clear + bus settle time is 1.2us, however, this is + + /* After/during arbitration, BSY should be asserted. + * IBM DPES-31080 Version S31Q works now + * Tnx to Thomas_Roesch@m2.maus.de for finding this! (Roman) + */ + NCR5380_write(INITIATOR_COMMAND_REG, + ICR_BASE | ICR_ASSERT_SEL | ICR_ASSERT_BSY); + + /* + * Again, bus clear + bus settle time is 1.2us, however, this is * a minimum so we'll udelay ceil(1.2) */ - udelay(2); + if (hostdata->flags & FLAG_TOSHIBA_DELAY) + udelay(15); + else + udelay(2); + + spin_lock_irq(&hostdata->lock); + + /* NCR5380_reselect() clears MODE_REG after a reselection interrupt */ + if (!(NCR5380_read(MODE_REG) & MR_ARBITRATE)) + goto out; + + if (!hostdata->selecting) { + NCR5380_write(MODE_REG, MR_BASE); + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); + goto out; + } - dprintk(NDEBUG_ARBITRATION, "scsi%d : won arbitration\n", instance->host_no); + dsprintk(NDEBUG_ARBITRATION, instance, "won arbitration\n"); - /* - * Now that we have won arbitration, start Selection process, asserting + /* + * Now that we have won arbitration, start Selection process, asserting * the host and target ID's on the SCSI bus. */ - NCR5380_write(OUTPUT_DATA_REG, (hostdata->id_mask | (1 << scmd_id(cmd)))); + NCR5380_write(OUTPUT_DATA_REG, hostdata->id_mask | (1 << scmd_id(cmd))); - /* + /* * Raise ATN while SEL is true before BSY goes false from arbitration, * since this is the only way to guarantee that we'll get a MESSAGE OUT * phase immediately after selection. */ - NCR5380_write(INITIATOR_COMMAND_REG, (ICR_BASE | ICR_ASSERT_BSY | ICR_ASSERT_DATA | ICR_ASSERT_ATN | ICR_ASSERT_SEL)); + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_BSY | + ICR_ASSERT_DATA | ICR_ASSERT_ATN | ICR_ASSERT_SEL); NCR5380_write(MODE_REG, MR_BASE); - /* + /* * Reselect interrupts must be turned off prior to the dropping of BSY, * otherwise we will trigger an interrupt. */ NCR5380_write(SELECT_ENABLE_REG, 0); + spin_unlock_irq(&hostdata->lock); + /* - * The initiator shall then wait at least two deskew delays and release + * The initiator shall then wait at least two deskew delays and release * the BSY signal. */ - udelay(1); /* wingel -- wait two bus deskew delay >2*45ns */ + udelay(1); /* wingel -- wait two bus deskew delay >2*45ns */ /* Reset BSY */ - NCR5380_write(INITIATOR_COMMAND_REG, (ICR_BASE | ICR_ASSERT_DATA | ICR_ASSERT_ATN | ICR_ASSERT_SEL)); + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_DATA | + ICR_ASSERT_ATN | ICR_ASSERT_SEL); - /* + /* * Something weird happens when we cease to drive BSY - looks - * like the board/chip is letting us do another read before the + * like the board/chip is letting us do another read before the * appropriate propagation delay has expired, and we're confusing * a BSY signal from ourselves as the target's response to SELECTION. 
* * A small delay (the 'C++' frontend breaks the pipeline with an * unnecessary jump, making it work on my 386-33/Trantor T128, the - * tighter 'C' code breaks and requires this) solves the problem - - * the 1 us delay is arbitrary, and only used because this delay will - * be the same on other platforms and since it works here, it should + * tighter 'C' code breaks and requires this) solves the problem - + * the 1 us delay is arbitrary, and only used because this delay will + * be the same on other platforms and since it works here, it should * work there. * * wingel suggests that this could be due to failing to wait @@ -1339,50 +1162,43 @@ static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd) udelay(1); - dprintk(NDEBUG_SELECTION, "scsi%d : selecting target %d\n", instance->host_no, scmd_id(cmd)); + dsprintk(NDEBUG_SELECTION, instance, "selecting target %d\n", scmd_id(cmd)); - /* - * The SCSI specification calls for a 250 ms timeout for the actual + /* + * The SCSI specification calls for a 250 ms timeout for the actual * selection. */ - timeout = jiffies + msecs_to_jiffies(250); - - /* - * XXX very interesting - we're seeing a bounce where the BSY we - * asserted is being reflected / still asserted (propagation delay?) - * and it's detecting as true. Sigh. - */ - - hostdata->select_time = 0; /* we count the clock ticks at which we polled */ - hostdata->selecting = cmd; + err = NCR5380_poll_politely(instance, STATUS_REG, SR_BSY, SR_BSY, + msecs_to_jiffies(250)); -part2: - /* RvC: here we enter after a sleeping period, or immediately after - execution of part 1 - we poll only once ech clock tick */ - value = NCR5380_read(STATUS_REG) & (SR_BSY | SR_IO); - - if (!value && (hostdata->select_time < HZ/4)) { - /* RvC: we still must wait for a device response */ - hostdata->select_time++; /* after 25 ticks the device has failed */ - NCR5380_set_timer(hostdata, 1); - return 0; /* RvC: we return here with hostdata->selecting set, - to go to sleep */ - } - - hostdata->selecting = NULL;/* clear this pointer, because we passed the - waiting period */ if ((NCR5380_read(STATUS_REG) & (SR_SEL | SR_IO)) == (SR_SEL | SR_IO)) { + spin_lock_irq(&hostdata->lock); NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); NCR5380_reselect(instance); - printk("scsi%d : reselection after won arbitration?\n", instance->host_no); + if (!hostdata->connected) + NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); + shost_printk(KERN_ERR, instance, "reselection after won arbitration?\n"); + goto out; + } + + if (err < 0) { + spin_lock_irq(&hostdata->lock); + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - return -1; + /* Can't touch cmd if it has been reclaimed by the scsi ML */ + if (hostdata->selecting) { + cmd->result = DID_BAD_TARGET << 16; + complete_cmd(instance, cmd); + dsprintk(NDEBUG_SELECTION, instance, "target did not respond within 250ms\n"); + cmd = NULL; + } + goto out; } - /* - * No less than two deskew delays after the initiator detects the - * BSY signal is true, it shall release the SEL signal and may + + /* + * No less than two deskew delays after the initiator detects the + * BSY signal is true, it shall release the SEL signal and may * change the DATA BUS. 
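complete_cmd(), first called above for the DID_BAD_TARGET case, replaces the old direct cmd->scsi_done() calls. Its body is elsewhere in the patch; judging from the call sites it must at least drop the busy-LUN bit and close out an autosense cycle using the hostdata->ses area the old code managed inline. A rough sketch under those assumptions (hostdata->sensing is presumed, not shown in this hunk):

        /* Sketch of the duties implied by the call sites; details may differ. */
        static void complete_cmd(struct Scsi_Host *instance, struct scsi_cmnd *cmd)
        {
                struct NCR5380_hostdata *hostdata = shost_priv(instance);

                if (hostdata->sensing == cmd) {
                        /* Autosense finished: restore the original command */
                        scsi_eh_restore_cmnd(cmd, &hostdata->ses);
                        hostdata->sensing = NULL;
                }
                hostdata->busy[scmd_id(cmd)] &= ~(1 << cmd->device->lun);
                cmd->scsi_done(cmd);
        }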
-wingel */ @@ -1390,53 +1206,38 @@ part2: NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN); - if (!(NCR5380_read(STATUS_REG) & SR_BSY)) { - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - if (hostdata->targets_present & (1 << scmd_id(cmd))) { - printk(KERN_DEBUG "scsi%d : weirdness\n", instance->host_no); - if (hostdata->restart_select) - printk(KERN_DEBUG "\trestart select\n"); - NCR5380_dprint(NDEBUG_SELECTION, instance); - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - return -1; - } - cmd->result = DID_BAD_TARGET << 16; - cmd->scsi_done(cmd); - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - dprintk(NDEBUG_SELECTION, "scsi%d : target did not respond within 250ms\n", instance->host_no); - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - return 0; - } - hostdata->targets_present |= (1 << scmd_id(cmd)); - /* - * Since we followed the SCSI spec, and raised ATN while SEL + * Since we followed the SCSI spec, and raised ATN while SEL * was true but before BSY was false during selection, the information * transfer phase should be a MESSAGE OUT phase so that we can send the * IDENTIFY message. - * + * * If SCSI-II tagged queuing is enabled, we also send a SIMPLE_QUEUE_TAG * message (2 bytes) with a tag ID that we increment with every command * until it wraps back to 0. * * XXX - it turns out that there are some broken SCSI-II devices, - * which claim to support tagged queuing but fail when more than - * some number of commands are issued at once. + * which claim to support tagged queuing but fail when more than + * some number of commands are issued at once. */ /* Wait for start of REQ/ACK handshake */ - spin_unlock_irq(instance->host_lock); err = NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, HZ); - spin_lock_irq(instance->host_lock); - - if(err) { - printk(KERN_ERR "scsi%d: timeout at NCR5380.c:%d\n", instance->host_no, __LINE__); + spin_lock_irq(&hostdata->lock); + if (err < 0) { + shost_printk(KERN_ERR, instance, "select: REQ timeout\n"); + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - goto failed; + goto out; + } + if (!hostdata->selecting) { + do_abort(instance); + goto out; } - dprintk(NDEBUG_SELECTION, "scsi%d : target %d selected, going into MESSAGE OUT phase.\n", instance->host_no, cmd->device->id); + dsprintk(NDEBUG_SELECTION, instance, "target %d selected, going into MESSAGE OUT phase.\n", + scmd_id(cmd)); tmp[0] = IDENTIFY(((instance->irq == NO_IRQ) ? 
0 : 1), cmd->device->lun); len = 1; @@ -1446,104 +1247,82 @@ part2: data = tmp; phase = PHASE_MSGOUT; NCR5380_transfer_pio(instance, &phase, &len, &data); - dprintk(NDEBUG_SELECTION, "scsi%d : nexus established.\n", instance->host_no); + dsprintk(NDEBUG_SELECTION, instance, "nexus established.\n"); /* XXX need to handle errors here */ + hostdata->connected = cmd; - hostdata->busy[cmd->device->id] |= (1 << (cmd->device->lun & 0xFF)); + hostdata->busy[cmd->device->id] |= 1 << cmd->device->lun; initialize_SCp(cmd); - return 0; - - /* Selection failed */ -failed: - return -1; + cmd = NULL; +out: + if (!hostdata->selecting) + return NULL; + hostdata->selecting = NULL; + return cmd; } -/* - * Function : int NCR5380_transfer_pio (struct Scsi_Host *instance, - * unsigned char *phase, int *count, unsigned char **data) +/* + * Function : int NCR5380_transfer_pio (struct Scsi_Host *instance, + * unsigned char *phase, int *count, unsigned char **data) * * Purpose : transfers data in given phase using polled I/O * - * Inputs : instance - instance of driver, *phase - pointer to - * what phase is expected, *count - pointer to number of - * bytes to transfer, **data - pointer to data pointer. - * + * Inputs : instance - instance of driver, *phase - pointer to + * what phase is expected, *count - pointer to number of + * bytes to transfer, **data - pointer to data pointer. + * * Returns : -1 when different phase is entered without transferring - * maximum number of bytes, 0 if all bytes or transferred or exit - * is in same phase. + * maximum number of bytes, 0 if all bytes are transferred or exit + * is in same phase. * - * Also, *phase, *count, *data are modified in place. + * Also, *phase, *count, *data are modified in place. * * XXX Note : handling for bus free may be useful. */ /* - * Note : this code is not as quick as it could be, however it + * Note : this code is not as quick as it could be, however it * IS 100% reliable, and for the actual data transfer where speed * counts, we will always do a pseudo DMA or DMA transfer. 
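The IDENTIFY byte built just before MESSAGE OUT in NCR5380_select() above follows the SCSI-2 message format: bit 7 always set, bit 6 granting disconnect privilege (here only when the board has a working IRQ), and the LUN in the low three bits. The macro comes from the classic scsi headers, approximately:

        #define IDENTIFY_BASE   0x80
        #define IDENTIFY(can_disconnect, lun) \
                (IDENTIFY_BASE | ((can_disconnect) ? 0x40 : 0) | ((lun) & 0x07))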
*/ -static int NCR5380_transfer_pio(struct Scsi_Host *instance, unsigned char *phase, int *count, unsigned char **data) { - NCR5380_local_declare(); +static int NCR5380_transfer_pio(struct Scsi_Host *instance, + unsigned char *phase, int *count, + unsigned char **data) +{ unsigned char p = *phase, tmp; int c = *count; unsigned char *d = *data; - /* - * RvC: some administrative data to process polling time - */ - int break_allowed = 0; - struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) instance->hostdata; - NCR5380_setup(instance); - - if (!(p & SR_IO)) - dprintk(NDEBUG_PIO, "scsi%d : pio write %d bytes\n", instance->host_no, c); - else - dprintk(NDEBUG_PIO, "scsi%d : pio read %d bytes\n", instance->host_no, c); - /* - * The NCR5380 chip will only drive the SCSI bus when the + /* + * The NCR5380 chip will only drive the SCSI bus when the * phase specified in the appropriate bits of the TARGET COMMAND * REGISTER match the STATUS REGISTER */ - NCR5380_write(TARGET_COMMAND_REG, PHASE_SR_TO_TCR(p)); + NCR5380_write(TARGET_COMMAND_REG, PHASE_SR_TO_TCR(p)); - /* RvC: don't know if this is necessary, but other SCSI I/O is short - * so breaks are not necessary there - */ - if ((p == PHASE_DATAIN) || (p == PHASE_DATAOUT)) { - break_allowed = 1; - } do { - /* - * Wait for assertion of REQ, after which the phase bits will be - * valid - */ - - /* RvC: we simply poll once, after that we stop temporarily - * and let the device buffer fill up - * if breaking is not allowed, we keep polling as long as needed + /* + * Wait for assertion of REQ, after which the phase bits will be + * valid */ - /* FIXME */ - while (!((tmp = NCR5380_read(STATUS_REG)) & SR_REQ) && !break_allowed); - if (!(tmp & SR_REQ)) { - /* timeout condition */ - NCR5380_set_timer(hostdata, USLEEP_SLEEP); + if (NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, HZ) < 0) break; - } - dprintk(NDEBUG_HANDSHAKE, "scsi%d : REQ detected\n", instance->host_no); + dsprintk(NDEBUG_HANDSHAKE, instance, "REQ asserted\n"); /* Check for phase mismatch */ - if ((tmp & PHASE_MASK) != p) { - dprintk(NDEBUG_HANDSHAKE, "scsi%d : phase mismatch\n", instance->host_no); - NCR5380_dprint_phase(NDEBUG_HANDSHAKE, instance); + if ((NCR5380_read(STATUS_REG) & PHASE_MASK) != p) { + dsprintk(NDEBUG_PIO, instance, "phase mismatch\n"); + NCR5380_dprint_phase(NDEBUG_PIO, instance); break; } + /* Do actual transfer from SCSI bus to / from memory */ if (!(p & SR_IO)) NCR5380_write(OUTPUT_DATA_REG, *d); @@ -1552,7 +1331,7 @@ static int NCR5380_transfer_pio(struct Scsi_Host *instance, unsigned char *phase ++d; - /* + /* * The SCSI standard suggests that in MSGOUT phase, the initiator * should drop ATN on the last byte of the message phase * after REQ has been asserted for the handshake but before @@ -1563,29 +1342,34 @@ static int NCR5380_transfer_pio(struct Scsi_Host *instance, unsigned char *phase if (!((p & SR_MSG) && c > 1)) { NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_DATA); NCR5380_dprint(NDEBUG_PIO, instance); - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_DATA | ICR_ASSERT_ACK); + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | + ICR_ASSERT_DATA | ICR_ASSERT_ACK); } else { - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_DATA | ICR_ASSERT_ATN); + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | + ICR_ASSERT_DATA | ICR_ASSERT_ATN); NCR5380_dprint(NDEBUG_PIO, instance); - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_DATA | ICR_ASSERT_ATN | ICR_ASSERT_ACK); + 
NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | + ICR_ASSERT_DATA | ICR_ASSERT_ATN | ICR_ASSERT_ACK); } } else { NCR5380_dprint(NDEBUG_PIO, instance); NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ACK); } - /* FIXME - if this fails bus reset ?? */ - NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, 0, 5*HZ); - dprintk(NDEBUG_HANDSHAKE, "scsi%d : req false, handshake complete\n", instance->host_no); + if (NCR5380_poll_politely(instance, + STATUS_REG, SR_REQ, 0, 5 * HZ) < 0) + break; + + dsprintk(NDEBUG_HANDSHAKE, instance, "REQ negated, handshake complete\n"); /* - * We have several special cases to consider during REQ/ACK handshaking : - * 1. We were in MSGOUT phase, and we are on the last byte of the - * message. ATN must be dropped as ACK is dropped. + * We have several special cases to consider during REQ/ACK handshaking : + * 1. We were in MSGOUT phase, and we are on the last byte of the + * message. ATN must be dropped as ACK is dropped. * - * 2. We are in a MSGIN phase, and we are on the last byte of the - * message. We must exit with ACK asserted, so that the calling - * code may raise ATN before dropping ACK to reject the message. + * 2. We are in a MSGIN phase, and we are on the last byte of the + * message. We must exit with ACK asserted, so that the calling + * code may raise ATN before dropping ACK to reject the message. * * 3. ACK and ATN are clear and the target may proceed as normal. */ @@ -1597,12 +1381,16 @@ static int NCR5380_transfer_pio(struct Scsi_Host *instance, unsigned char *phase } } while (--c); - dprintk(NDEBUG_PIO, "scsi%d : residual %d\n", instance->host_no, c); + dsprintk(NDEBUG_PIO, instance, "residual %d\n", c); *count = c; *data = d; tmp = NCR5380_read(STATUS_REG); - if (tmp & SR_REQ) + /* The phase read from the bus is valid if either REQ is (already) + * asserted or if ACK hasn't been released yet. The latter applies if + * we're in MSG IN, DATA IN or STATUS and all bytes have been received. + */ + if ((tmp & SR_REQ) || ((tmp & SR_IO) && c == 0)) *phase = tmp & PHASE_MASK; else *phase = PHASE_UNKNOWN; @@ -1614,79 +1402,80 @@ static int NCR5380_transfer_pio(struct Scsi_Host *instance, unsigned char *phase } /** - * do_reset - issue a reset command - * @host: adapter to reset + * do_reset - issue a reset command + * @instance: adapter to reset * - * Issue a reset sequence to the NCR5380 and try and get the bus - * back into sane shape. + * Issue a reset sequence to the NCR5380 and try and get the bus + * back into sane shape. * - * Locks: caller holds queue lock + * This clears the reset interrupt flag because there may be no handler for + * it. When the driver is initialized, the NCR5380_intr() handler has not yet + * been installed. And when in EH we may have released the ST DMA interrupt. */ - -static void do_reset(struct Scsi_Host *host) { - NCR5380_local_declare(); - NCR5380_setup(host); - NCR5380_write(TARGET_COMMAND_REG, PHASE_SR_TO_TCR(NCR5380_read(STATUS_REG) & PHASE_MASK)); +static void do_reset(struct Scsi_Host *instance) +{ + unsigned long flags; + + local_irq_save(flags); + NCR5380_write(TARGET_COMMAND_REG, + PHASE_SR_TO_TCR(NCR5380_read(STATUS_REG) & PHASE_MASK)); NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_RST); - udelay(25); + udelay(50); NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); + (void)NCR5380_read(RESET_PARITY_INTERRUPT_REG); + local_irq_restore(flags); } -/* - * Function : do_abort (Scsi_Host *host) - * - * Purpose : abort the currently established nexus. 
Should only be - * called from a routine which can drop into a - * - * Returns : 0 on success, -1 on failure. - * - * Locks: queue lock held by caller - * FIXME: sort this out and get new_eh running +/** + * do_abort - abort the currently established nexus by going to + * MESSAGE OUT phase and sending an ABORT message. + * @instance: relevant scsi host instance + * + * Returns 0 on success, -1 on failure. */ -static int do_abort(struct Scsi_Host *host) { - NCR5380_local_declare(); +static int do_abort(struct Scsi_Host *instance) +{ unsigned char *msgptr, phase, tmp; int len; int rc; - NCR5380_setup(host); - /* Request message out phase */ NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN); - /* - * Wait for the target to indicate a valid phase by asserting - * REQ. Once this happens, we'll have either a MSGOUT phase - * and can immediately send the ABORT message, or we'll have some + /* + * Wait for the target to indicate a valid phase by asserting + * REQ. Once this happens, we'll have either a MSGOUT phase + * and can immediately send the ABORT message, or we'll have some * other phase and will have to source/sink data. - * + * * We really don't care what value was on the bus or what value * the target sees, so we just handshake. */ - rc = NCR5380_poll_politely(host, STATUS_REG, SR_REQ, SR_REQ, 60 * HZ); - - if(rc < 0) - return -1; + rc = NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, 10 * HZ); + if (rc < 0) + goto timeout; + + tmp = NCR5380_read(STATUS_REG) & PHASE_MASK; - tmp = (unsigned char)rc; - NCR5380_write(TARGET_COMMAND_REG, PHASE_SR_TO_TCR(tmp)); - if ((tmp & PHASE_MASK) != PHASE_MSGOUT) { - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN | ICR_ASSERT_ACK); - rc = NCR5380_poll_politely(host, STATUS_REG, SR_REQ, 0, 3*HZ); + if (tmp != PHASE_MSGOUT) { + NCR5380_write(INITIATOR_COMMAND_REG, + ICR_BASE | ICR_ASSERT_ATN | ICR_ASSERT_ACK); + rc = NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, 0, 3 * HZ); + if (rc < 0) + goto timeout; NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN); - if(rc == -1) - return -1; } + tmp = ABORT; msgptr = &tmp; len = 1; phase = PHASE_MSGOUT; - NCR5380_transfer_pio(host, &phase, &len, &msgptr); + NCR5380_transfer_pio(instance, &phase, &len, &msgptr); /* * If we got here, and the command completed successfully, @@ -1694,32 +1483,37 @@ static int do_abort(struct Scsi_Host *host) { */ return len ? -1 : 0; + +timeout: + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); + return -1; } #if defined(REAL_DMA) || defined(PSEUDO_DMA) || defined (REAL_DMA_POLL) -/* - * Function : int NCR5380_transfer_dma (struct Scsi_Host *instance, - * unsigned char *phase, int *count, unsigned char **data) +/* + * Function : int NCR5380_transfer_dma (struct Scsi_Host *instance, + * unsigned char *phase, int *count, unsigned char **data) * * Purpose : transfers data in given phase using either real - * or pseudo DMA. + * or pseudo DMA. * - * Inputs : instance - instance of driver, *phase - pointer to - * what phase is expected, *count - pointer to number of - * bytes to transfer, **data - pointer to data pointer. - * - * Returns : -1 when different phase is entered without transferring - * maximum number of bytes, 0 if all bytes or transferred or exit - * is in same phase. + * Inputs : instance - instance of driver, *phase - pointer to + * what phase is expected, *count - pointer to number of + * bytes to transfer, **data - pointer to data pointer. * - * Also, *phase, *count, *data are modified in place. 
+ * Returns : -1 when different phase is entered without transferring + * maximum number of bytes, 0 if all bytes are transferred or exit + * is in same phase. * - * Locks: io_request lock held by caller + * Also, *phase, *count, *data are modified in place. */ -static int NCR5380_transfer_dma(struct Scsi_Host *instance, unsigned char *phase, int *count, unsigned char **data) { - NCR5380_local_declare(); +static int NCR5380_transfer_dma(struct Scsi_Host *instance, + unsigned char *phase, int *count, + unsigned char **data) +{ + struct NCR5380_hostdata *hostdata = shost_priv(instance); register int c = *count; register unsigned char p = *phase; register unsigned char *d = *data; @@ -1730,54 +1524,47 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, unsigned char *phase unsigned char saved_data = 0, overrun = 0, residue; #endif - struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) instance->hostdata; - - NCR5380_setup(instance); - if ((tmp = (NCR5380_read(STATUS_REG) & PHASE_MASK)) != p) { *phase = tmp; return -1; } #if defined(REAL_DMA) || defined(REAL_DMA_POLL) -#ifdef READ_OVERRUNS if (p & SR_IO) { - c -= 2; + if (!(hostdata->flags & FLAG_NO_DMA_FIXUPS)) + c -= 2; } -#endif - dprintk(NDEBUG_DMA, "scsi%d : initializing DMA channel %d for %s, %d bytes %s %0x\n", instance->host_no, instance->dma_channel, (p & SR_IO) ? "reading" : "writing", c, (p & SR_IO) ? "to" : "from", (unsigned) d); hostdata->dma_len = (p & SR_IO) ? NCR5380_dma_read_setup(instance, d, c) : NCR5380_dma_write_setup(instance, d, c); + + dsprintk(NDEBUG_DMA, instance, "initializing DMA %s: length %d, address %p\n", + (p & SR_IO) ? "receive" : "send", c, *data); #endif NCR5380_write(TARGET_COMMAND_REG, PHASE_SR_TO_TCR(p)); #ifdef REAL_DMA - NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE | MR_ENABLE_EOP_INTR | MR_MONITOR_BSY); + NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE | MR_MONITOR_BSY | + MR_ENABLE_EOP_INTR); #elif defined(REAL_DMA_POLL) - NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE); + NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE | MR_MONITOR_BSY); #else /* * Note : on my sample board, watch-dog timeouts occurred when interrupts - * were not disabled for the duration of a single DMA transfer, from + * were not disabled for the duration of a single DMA transfer, from * before the setting of DMA mode to after transfer of the last byte. */ -#if defined(PSEUDO_DMA) && defined(UNSAFE) - spin_unlock_irq(instance->host_lock); -#endif - /* KLL May need eop and parity in 53c400 */ - if (hostdata->flags & FLAG_NCR53C400) - NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE | - MR_ENABLE_PAR_CHECK | MR_ENABLE_PAR_INTR | - MR_ENABLE_EOP_INTR | MR_MONITOR_BSY); + if (hostdata->flags & FLAG_NO_DMA_FIXUP) + NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE | MR_MONITOR_BSY | + MR_ENABLE_EOP_INTR); else - NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE); + NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE | MR_MONITOR_BSY); #endif /* def REAL_DMA */ dprintk(NDEBUG_DMA, "scsi%d : mode reg = 0x%X\n", instance->host_no, NCR5380_read(MODE_REG)); - /* - * On the PAS16 at least I/O recovery delays are not needed here. - * Everyone else seems to want them. + /* + * On the PAS16 at least I/O recovery delays are not needed here. + * Everyone else seems to want them.
*/ if (p & SR_IO) { @@ -1797,49 +1584,49 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, unsigned char *phase } while ((tmp & BASR_PHASE_MATCH) && !(tmp & (BASR_BUSY_ERROR | BASR_END_DMA_TRANSFER))); /* - At this point, either we've completed DMA, or we have a phase mismatch, - or we've unexpectedly lost BUSY (which is a real error). - - For write DMAs, we want to wait until the last byte has been - transferred out over the bus before we turn off DMA mode. Alas, there - seems to be no terribly good way of doing this on a 5380 under all - conditions. For non-scatter-gather operations, we can wait until REQ - and ACK both go false, or until a phase mismatch occurs. Gather-writes - are nastier, since the device will be expecting more data than we - are prepared to send it, and REQ will remain asserted. On a 53C8[01] we - could test LAST BIT SENT to assure transfer (I imagine this is precisely - why this signal was added to the newer chips) but on the older 538[01] - this signal does not exist. The workaround for this lack is a watchdog; - we bail out of the wait-loop after a modest amount of wait-time if - the usual exit conditions are not met. Not a terribly clean or - correct solution :-% - - Reads are equally tricky due to a nasty characteristic of the NCR5380. - If the chip is in DMA mode for an READ, it will respond to a target's - REQ by latching the SCSI data into the INPUT DATA register and asserting - ACK, even if it has _already_ been notified by the DMA controller that - the current DMA transfer has completed! If the NCR5380 is then taken - out of DMA mode, this already-acknowledged byte is lost. - - This is not a problem for "one DMA transfer per command" reads, because - the situation will never arise... either all of the data is DMA'ed - properly, or the target switches to MESSAGE IN phase to signal a - disconnection (either operation bringing the DMA to a clean halt). - However, in order to handle scatter-reads, we must work around the - problem. The chosen fix is to DMA N-2 bytes, then check for the - condition before taking the NCR5380 out of DMA mode. One or two extra - bytes are transferred via PIO as necessary to fill out the original - request. + * At this point, either we've completed DMA, or we have a phase mismatch, + * or we've unexpectedly lost BUSY (which is a real error). + * + * For DMA sends, we want to wait until the last byte has been + * transferred out over the bus before we turn off DMA mode. Alas, there + * seems to be no terribly good way of doing this on a 5380 under all + * conditions. For non-scatter-gather operations, we can wait until REQ + * and ACK both go false, or until a phase mismatch occurs. Gather-sends + * are nastier, since the device will be expecting more data than we + * are prepared to send it, and REQ will remain asserted. On a 53C8[01] we + * could test Last Byte Sent to assure transfer (I imagine this is precisely + * why this signal was added to the newer chips) but on the older 538[01] + * this signal does not exist. The workaround for this lack is a watchdog; + * we bail out of the wait-loop after a modest amount of wait-time if + * the usual exit conditions are not met. Not a terribly clean or + * correct solution :-% + * + * DMA receive is equally tricky due to a nasty characteristic of the NCR5380. 
+ * If the chip is in DMA receive mode, it will respond to a target's + * REQ by latching the SCSI data into the INPUT DATA register and asserting + * ACK, even if it has _already_ been notified by the DMA controller that + * the current DMA transfer has completed! If the NCR5380 is then taken + * out of DMA mode, this already-acknowledged byte is lost. This is + * not a problem for "one DMA transfer per READ command", because + * the situation will never arise... either all of the data is DMA'ed + * properly, or the target switches to MESSAGE IN phase to signal a + * disconnection (either operation bringing the DMA to a clean halt). + * However, in order to handle scatter-receive, we must work around the + * problem. The chosen fix is to DMA N-2 bytes, then check for the + * condition before taking the NCR5380 out of DMA mode. One or two extra + * bytes are transferred via PIO as necessary to fill out the original + * request. */ if (p & SR_IO) { -#ifdef READ_OVERRUNS - udelay(10); - if (((NCR5380_read(BUS_AND_STATUS_REG) & (BASR_PHASE_MATCH | BASR_ACK)) == (BASR_PHASE_MATCH | BASR_ACK))) { - saved_data = NCR5380_read(INPUT_DATA_REGISTER); - overrun = 1; + if (!(hostdata->flags & FLAG_NO_DMA_FIXUPS)) { + udelay(10); + if ((NCR5380_read(BUS_AND_STATUS_REG) & (BASR_PHASE_MATCH | BASR_ACK)) == + (BASR_PHASE_MATCH | BASR_ACK)) { + saved_data = NCR5380_read(INPUT_DATA_REGISTER); + overrun = 1; + } } -#endif } else { int limit = 100; while (((tmp = NCR5380_read(BUS_AND_STATUS_REG)) & BASR_ACK) || (NCR5380_read(STATUS_REG) & SR_REQ)) { @@ -1850,7 +1637,8 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, unsigned char *phase } } - dprintk(NDEBUG_DMA, "scsi%d : polled DMA transfer complete, basr 0x%X, sr 0x%X\n", instance->host_no, tmp, NCR5380_read(STATUS_REG)); + dsprintk(NDEBUG_DMA, instance, "polled DMA transfer complete, basr 0x%02x, sr 0x%02x\n", + tmp, NCR5380_read(STATUS_REG)); NCR5380_write(MODE_REG, MR_BASE); NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); @@ -1861,8 +1649,8 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, unsigned char *phase *data += c; *phase = NCR5380_read(STATUS_REG) & PHASE_MASK; -#ifdef READ_OVERRUNS - if (*phase == p && (p & SR_IO) && residue == 0) { + if (!(hostdata->flags & FLAG_NO_DMA_FIXUPS) && + *phase == p && (p & SR_IO) && residue == 0) { if (overrun) { dprintk(NDEBUG_DMA, "Got an input overrun, using saved byte\n"); **data = saved_data; @@ -1877,7 +1665,6 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, unsigned char *phase NCR5380_transfer_pio(instance, phase, &cnt, data); *count -= toPIO - cnt; } -#endif dprintk(NDEBUG_DMA, "Return with data ptr = 0x%X, count %d, last 0x%X, next 0x%X\n", *data, *count, *(*data + *count - 1), *(*data + *count)); return 0; @@ -1886,95 +1673,64 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, unsigned char *phase return 0; #else /* defined(REAL_DMA_POLL) */ if (p & SR_IO) { -#ifdef DMA_WORKS_RIGHT - foo = NCR5380_pread(instance, d, c); -#else - int diff = 1; - if (hostdata->flags & FLAG_NCR53C400) { - diff = 0; - } - if (!(foo = NCR5380_pread(instance, d, c - diff))) { + foo = NCR5380_pread(instance, d, + hostdata->flags & FLAG_NO_DMA_FIXUP ?
c : c - 1); + if (!foo && !(hostdata->flags & FLAG_NO_DMA_FIXUP)) { /* - * We can't disable DMA mode after successfully transferring + * We can't disable DMA mode after successfully transferring * what we plan to be the last byte, since that would open up - * a race condition where if the target asserted REQ before + * a race condition where if the target asserted REQ before * we got the DMA mode reset, the NCR5380 would have latched * an additional byte into the INPUT DATA register and we'd * have dropped it. - * - * The workaround was to transfer one fewer bytes than we - * intended to with the pseudo-DMA read function, wait for + * + * The workaround was to transfer one fewer bytes than we + * intended to with the pseudo-DMA read function, wait for * the chip to latch the last byte, read it, and then disable * pseudo-DMA mode. - * + * * After REQ is asserted, the NCR5380 asserts DRQ and ACK. * REQ is deasserted when ACK is asserted, and not reasserted * until ACK goes false. Since the NCR5380 won't lower ACK * until DACK is asserted, which won't happen unless we twiddle - * the DMA port or we take the NCR5380 out of DMA mode, we - * can guarantee that we won't handshake another extra + * the DMA port or we take the NCR5380 out of DMA mode, we + * can guarantee that we won't handshake another extra * byte. */ - if (!(hostdata->flags & FLAG_NCR53C400)) { - while (!(NCR5380_read(BUS_AND_STATUS_REG) & BASR_DRQ)); - /* Wait for clean handshake */ - while (NCR5380_read(STATUS_REG) & SR_REQ); - d[c - 1] = NCR5380_read(INPUT_DATA_REG); + if (NCR5380_poll_politely(instance, BUS_AND_STATUS_REG, + BASR_DRQ, BASR_DRQ, HZ) < 0) { + foo = -1; + shost_printk(KERN_ERR, instance, "PDMA read: DRQ timeout\n"); + } + if (NCR5380_poll_politely(instance, STATUS_REG, + SR_REQ, 0, HZ) < 0) { + foo = -1; + shost_printk(KERN_ERR, instance, "PDMA read: !REQ timeout\n"); } + d[c - 1] = NCR5380_read(INPUT_DATA_REG); } -#endif } else { -#ifdef DMA_WORKS_RIGHT foo = NCR5380_pwrite(instance, d, c); -#else - int timeout; - dprintk(NDEBUG_C400_PWRITE, "About to pwrite %d bytes\n", c); - if (!(foo = NCR5380_pwrite(instance, d, c))) { + if (!foo && !(hostdata->flags & FLAG_NO_DMA_FIXUP)) { /* - * Wait for the last byte to be sent. If REQ is being asserted for - * the byte we're interested, we'll ACK it and it will go false. + * Wait for the last byte to be sent. If REQ is being asserted for + * the byte we're interested, we'll ACK it and it will go false. 
*/ - if (!(hostdata->flags & FLAG_HAS_LAST_BYTE_SENT)) { - timeout = 20000; - while (!(NCR5380_read(BUS_AND_STATUS_REG) & BASR_DRQ) && (NCR5380_read(BUS_AND_STATUS_REG) & BASR_PHASE_MATCH)); - - if (!timeout) - dprintk(NDEBUG_LAST_BYTE_SENT, "scsi%d : timed out on last byte\n", instance->host_no); - - if (hostdata->flags & FLAG_CHECK_LAST_BYTE_SENT) { - hostdata->flags &= ~FLAG_CHECK_LAST_BYTE_SENT; - if (NCR5380_read(TARGET_COMMAND_REG) & TCR_LAST_BYTE_SENT) { - hostdata->flags |= FLAG_HAS_LAST_BYTE_SENT; - dprintk(NDEBUG_LAST_BYTE_SENT, "scsi%d : last byte sent works\n", instance->host_no); - } - } - } else { - dprintk(NDEBUG_C400_PWRITE, "Waiting for LASTBYTE\n"); - while (!(NCR5380_read(TARGET_COMMAND_REG) & TCR_LAST_BYTE_SENT)); - dprintk(NDEBUG_C400_PWRITE, "Got LASTBYTE\n"); + if (NCR5380_poll_politely2(instance, + BUS_AND_STATUS_REG, BASR_DRQ, BASR_DRQ, + BUS_AND_STATUS_REG, BASR_PHASE_MATCH, 0, HZ) < 0) { + foo = -1; + shost_printk(KERN_ERR, instance, "PDMA write: DRQ and phase timeout\n"); } } -#endif } NCR5380_write(MODE_REG, MR_BASE); NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - - if ((!(p & SR_IO)) && (hostdata->flags & FLAG_NCR53C400)) { - dprintk(NDEBUG_C400_PWRITE, "53C400w: Checking for IRQ\n"); - if (NCR5380_read(BUS_AND_STATUS_REG) & BASR_IRQ) { - dprintk(NDEBUG_C400_PWRITE, "53C400w: got it, reading reset interrupt reg\n"); - NCR5380_read(RESET_PARITY_INTERRUPT_REG); - } else { - printk("53C400w: IRQ NOT THERE!\n"); - } - } + NCR5380_read(RESET_PARITY_INTERRUPT_REG); *data = d + c; *count = 0; *phase = NCR5380_read(STATUS_REG) & PHASE_MASK; -#if defined(PSEUDO_DMA) && defined(UNSAFE) - spin_lock_irq(instance->host_lock); -#endif /* defined(REAL_DMA_POLL) */ return foo; #endif /* def REAL_DMA */ } @@ -1983,25 +1739,23 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, unsigned char *phase /* * Function : NCR5380_information_transfer (struct Scsi_Host *instance) * - * Purpose : run through the various SCSI phases and do as the target - * directs us to. Operates on the currently connected command, - * instance->connected. + * Purpose : run through the various SCSI phases and do as the target + * directs us to. Operates on the currently connected command, + * instance->connected. * * Inputs : instance, instance for which we are doing commands * - * Side effects : SCSI things happen, the disconnected queue will be - * modified if a command disconnects, *instance->connected will - * change. - * - * XXX Note : we need to watch for bus free or a reset condition here - * to recover from an unexpected bus free condition. + * Side effects : SCSI things happen, the disconnected queue will be + * modified if a command disconnects, *instance->connected will + * change. * - * Locks: io_request_lock held by caller in IRQ mode + * XXX Note : we need to watch for bus free or a reset condition here + * to recover from an unexpected bus free condition. 
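The list operations used below (ncmd->list on the disconnected and autosense lists, scsi_cmd_priv() in the loop header) presuppose a small per-command struct carved out of the scsi_cmnd private data area. A sketch of the likely shape; the struct itself is defined outside this hunk:

        struct NCR5380_cmd {
                struct list_head list;  /* links into issue/disconnected/autosense lists */
        };

        /* Inverse of scsi_cmd_priv(): the private area directly follows the scsi_cmnd */
        static inline struct scsi_cmnd *NCR5380_to_scmd(struct NCR5380_cmd *ncmd_ptr)
        {
                return ((struct scsi_cmnd *)ncmd_ptr) - 1;
        }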
*/ -static void NCR5380_information_transfer(struct Scsi_Host *instance) { - NCR5380_local_declare(); - struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *)instance->hostdata; +static void NCR5380_information_transfer(struct Scsi_Host *instance) +{ + struct NCR5380_hostdata *hostdata = shost_priv(instance); unsigned char msgout = NOP; int sink = 0; int len; @@ -2010,13 +1764,11 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) { #endif unsigned char *data; unsigned char phase, tmp, extended_msg[10], old_phase = 0xff; - struct scsi_cmnd *cmd = (struct scsi_cmnd *) hostdata->connected; - /* RvC: we need to set the end of the polling time */ - unsigned long poll_time = jiffies + USLEEP_POLL; + struct scsi_cmnd *cmd; - NCR5380_setup(instance); + while ((cmd = hostdata->connected)) { + struct NCR5380_cmd *ncmd = scsi_cmd_priv(cmd); - while (1) { tmp = NCR5380_read(STATUS_REG); /* We only have a valid SCSI phase when REQ is asserted */ if (tmp & SR_REQ) { @@ -2028,24 +1780,28 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) { if (sink && (phase != PHASE_MSGOUT)) { NCR5380_write(TARGET_COMMAND_REG, PHASE_SR_TO_TCR(tmp)); - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN | ICR_ASSERT_ACK); - while (NCR5380_read(STATUS_REG) & SR_REQ); - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN); + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN | + ICR_ASSERT_ACK); + while (NCR5380_read(STATUS_REG) & SR_REQ) + ; + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | + ICR_ASSERT_ATN); sink = 0; continue; } + switch (phase) { - case PHASE_DATAIN: case PHASE_DATAOUT: #if (NDEBUG & NDEBUG_NO_DATAOUT) - printk("scsi%d : NDEBUG_NO_DATAOUT set, attempted DATAOUT aborted\n", instance->host_no); + shost_printk(KERN_DEBUG, instance, "NDEBUG_NO_DATAOUT set, attempted DATAOUT aborted\n"); sink = 1; do_abort(instance); cmd->result = DID_ERROR << 16; - cmd->scsi_done(cmd); + complete_cmd(instance, cmd); return; #endif - /* + case PHASE_DATAIN: + /* * If there is no room left in the current buffer in the * scatter-gather list, move onto the next one. */ @@ -2055,10 +1811,13 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) { --cmd->SCp.buffers_residual; cmd->SCp.this_residual = cmd->SCp.buffer->length; cmd->SCp.ptr = sg_virt(cmd->SCp.buffer); - dprintk(NDEBUG_INFORMATION, "scsi%d : %d bytes and %d buffers left\n", instance->host_no, cmd->SCp.this_residual, cmd->SCp.buffers_residual); + dsprintk(NDEBUG_INFORMATION, instance, "%d bytes and %d buffers left\n", + cmd->SCp.this_residual, + cmd->SCp.buffers_residual); } + /* - * The preferred transfer method is going to be + * The preferred transfer method is going to be * PSEUDO-DMA for systems that are strictly PIO, * since we can let the hardware do the handshaking. * @@ -2068,50 +1827,39 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) { */ #if defined(PSEUDO_DMA) || defined(REAL_DMA_POLL) - /* KLL - * PSEUDO_DMA is defined here. If this is the g_NCR5380 - * driver then it will always be defined, so the - * FLAG_NO_PSEUDO_DMA is used to inhibit PDMA in the base - * NCR5380 case. I think this is a fairly clean solution. - * We supplement these 2 if's with the flag. 
- */ -#ifdef NCR5380_dma_xfer_len - if (!cmd->device->borken && !(hostdata->flags & FLAG_NO_PSEUDO_DMA) && (transfersize = NCR5380_dma_xfer_len(instance, cmd)) != 0) { -#else - transfersize = cmd->transfersize; - -#ifdef LIMIT_TRANSFERSIZE /* If we have problems with interrupt service */ - if (transfersize > 512) - transfersize = 512; -#endif /* LIMIT_TRANSFERSIZE */ - - if (!cmd->device->borken && transfersize && !(hostdata->flags & FLAG_NO_PSEUDO_DMA) && cmd->SCp.this_residual && !(cmd->SCp.this_residual % transfersize)) { - /* Limit transfers to 32K, for xx400 & xx406 - * pseudoDMA that transfers in 128 bytes blocks. */ - if (transfersize > 32 * 1024) - transfersize = 32 * 1024; -#endif + transfersize = 0; + if (!cmd->device->borken && + !(hostdata->flags & FLAG_NO_PSEUDO_DMA)) + transfersize = NCR5380_dma_xfer_len(instance, cmd, phase); + + if (transfersize) { len = transfersize; - if (NCR5380_transfer_dma(instance, &phase, &len, (unsigned char **) &cmd->SCp.ptr)) { + if (NCR5380_transfer_dma(instance, &phase, + &len, (unsigned char **)&cmd->SCp.ptr)) { /* - * If the watchdog timer fires, all future accesses to this - * device will use the polled-IO. + * If the watchdog timer fires, all future + * accesses to this device will use the + * polled-IO. */ scmd_printk(KERN_INFO, cmd, - "switching to slow handshake\n"); + "switching to slow handshake\n"); cmd->device->borken = 1; - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN); sink = 1; do_abort(instance); cmd->result = DID_ERROR << 16; - cmd->scsi_done(cmd); + complete_cmd(instance, cmd); /* XXX - need to source or sink data here, as appropriate */ } else cmd->SCp.this_residual -= transfersize - len; } else #endif /* defined(PSEUDO_DMA) || defined(REAL_DMA_POLL) */ - NCR5380_transfer_pio(instance, &phase, (int *) &cmd->SCp.this_residual, (unsigned char **) - &cmd->SCp.ptr); + { + spin_unlock_irq(&hostdata->lock); + NCR5380_transfer_pio(instance, &phase, + (int *)&cmd->SCp.this_residual, + (unsigned char **)&cmd->SCp.ptr); + spin_lock_irq(&hostdata->lock); + } break; case PHASE_MSGIN: len = 1; @@ -2120,101 +1868,42 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) { cmd->SCp.Message = tmp; switch (tmp) { - /* - * Linking lets us reduce the time required to get the - * next command out to the device, hopefully this will - * mean we don't waste another revolution due to the delays - * required by ARBITRATION and another SELECTION. - * - * In the current implementation proposal, low level drivers - * merely have to start the next command, pointed to by - * next_link, done() is called as with unlinked commands. - */ -#ifdef LINKED - case LINKED_CMD_COMPLETE: - case LINKED_FLG_CMD_COMPLETE: - /* Accept message by clearing ACK */ - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - dprintk(NDEBUG_LINKED, "scsi%d : target %d lun %llu linked command complete.\n", instance->host_no, cmd->device->id, cmd->device->lun); - /* - * Sanity check : A linked command should only terminate with - * one of these messages if there are more linked commands - * available. 
- */ - if (!cmd->next_link) { - printk("scsi%d : target %d lun %llu linked command complete, no next_link\n" instance->host_no, cmd->device->id, cmd->device->lun); - sink = 1; - do_abort(instance); - return; - } - initialize_SCp(cmd->next_link); - /* The next command is still part of this process */ - cmd->next_link->tag = cmd->tag; - cmd->result = cmd->SCp.Status | (cmd->SCp.Message << 8); - dprintk(NDEBUG_LINKED, "scsi%d : target %d lun %llu linked request done, calling scsi_done().\n", instance->host_no, cmd->device->id, cmd->device->lun); - cmd->scsi_done(cmd); - cmd = hostdata->connected; - break; -#endif /* def LINKED */ case ABORT: case COMMAND_COMPLETE: /* Accept message by clearing ACK */ sink = 1; NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - hostdata->connected = NULL; - dprintk(NDEBUG_QUEUES, "scsi%d : command for target %d, lun %llu completed\n", instance->host_no, cmd->device->id, cmd->device->lun); - hostdata->busy[cmd->device->id] &= ~(1 << (cmd->device->lun & 0xFF)); - - /* - * I'm not sure what the correct thing to do here is : - * - * If the command that just executed is NOT a request - * sense, the obvious thing to do is to set the result - * code to the values of the stored parameters. - * - * If it was a REQUEST SENSE command, we need some way - * to differentiate between the failure code of the original - * and the failure code of the REQUEST sense - the obvious - * case is success, where we fall through and leave the result - * code unchanged. - * - * The non-obvious place is where the REQUEST SENSE failed - */ - - if (cmd->cmnd[0] != REQUEST_SENSE) - cmd->result = cmd->SCp.Status | (cmd->SCp.Message << 8); - else if (status_byte(cmd->SCp.Status) != GOOD) - cmd->result = (cmd->result & 0x00ffff) | (DID_ERROR << 16); + dsprintk(NDEBUG_QUEUES, instance, + "COMMAND COMPLETE %p target %d lun %llu\n", + cmd, scmd_id(cmd), cmd->device->lun); - if ((cmd->cmnd[0] == REQUEST_SENSE) && - hostdata->ses.cmd_len) { - scsi_eh_restore_cmnd(cmd, &hostdata->ses); - hostdata->ses.cmd_len = 0 ; - } - - if ((cmd->cmnd[0] != REQUEST_SENSE) && (status_byte(cmd->SCp.Status) == CHECK_CONDITION)) { - scsi_eh_prep_cmnd(cmd, &hostdata->ses, NULL, 0, ~0); - - dprintk(NDEBUG_AUTOSENSE, "scsi%d : performing request sense\n", instance->host_no); + hostdata->connected = NULL; - LIST(cmd, hostdata->issue_queue); - cmd->host_scribble = (unsigned char *) - hostdata->issue_queue; - hostdata->issue_queue = (struct scsi_cmnd *) cmd; - dprintk(NDEBUG_QUEUES, "scsi%d : REQUEST SENSE added to head of issue queue\n", instance->host_no); - } else { - cmd->scsi_done(cmd); + cmd->result &= ~0xffff; + cmd->result |= cmd->SCp.Status; + cmd->result |= cmd->SCp.Message << 8; + + if (cmd->cmnd[0] == REQUEST_SENSE) + complete_cmd(instance, cmd); + else { + if (cmd->SCp.Status == SAM_STAT_CHECK_CONDITION || + cmd->SCp.Status == SAM_STAT_COMMAND_TERMINATED) { + dsprintk(NDEBUG_QUEUES, instance, "autosense: adding cmd %p to tail of autosense queue\n", + cmd); + list_add_tail(&ncmd->list, + &hostdata->autosense); + } else + complete_cmd(instance, cmd); } - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - /* - * Restore phase bits to 0 so an interrupted selection, + /* + * Restore phase bits to 0 so an interrupted selection, * arbitration can resume. 
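A CHECK CONDITION result now parks the command on hostdata->autosense above instead of requeueing a REQUEST SENSE by hand. The draining side is not in this hunk; presumably the dequeue path prefers that list and mocks up the sense command with the standard EH helpers, roughly (hostdata->sensing and NCR5380_to_scmd() as sketched earlier):

        /* Illustrative fragment from dequeue_next_cmd() draining the autosense list */
        if (!hostdata->sensing && !list_empty(&hostdata->autosense)) {
                struct NCR5380_cmd *ncmd = list_first_entry(&hostdata->autosense,
                                                            struct NCR5380_cmd, list);
                struct scsi_cmnd *cmd = NCR5380_to_scmd(ncmd);

                list_del(&ncmd->list);
                scsi_eh_prep_cmnd(cmd, &hostdata->ses, NULL, 0, ~0);
                hostdata->sensing = cmd;        /* next selection sends REQUEST SENSE */
                return cmd;
        }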
*/ NCR5380_write(TARGET_COMMAND_REG, 0); - while ((NCR5380_read(STATUS_REG) & SR_BSY) && !hostdata->connected) - barrier(); + /* Enable reselect interrupts */ + NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); return; case MESSAGE_REJECT: /* Accept message by clearing ACK */ @@ -2229,38 +1918,33 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) { default: break; } - case DISCONNECT:{ - /* Accept message by clearing ACK */ - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - cmd->device->disconnect = 1; - LIST(cmd, hostdata->disconnected_queue); - cmd->host_scribble = (unsigned char *) - hostdata->disconnected_queue; - hostdata->connected = NULL; - hostdata->disconnected_queue = cmd; - dprintk(NDEBUG_QUEUES, "scsi%d : command for target %d lun %llu was moved from connected to" " the disconnected_queue\n", instance->host_no, cmd->device->id, cmd->device->lun); - /* - * Restore phase bits to 0 so an interrupted selection, - * arbitration can resume. - */ - NCR5380_write(TARGET_COMMAND_REG, 0); - - /* Enable reselect interrupts */ - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - /* Wait for bus free to avoid nasty timeouts - FIXME timeout !*/ - /* NCR538_poll_politely(instance, STATUS_REG, SR_BSY, 0, 30 * HZ); */ - while ((NCR5380_read(STATUS_REG) & SR_BSY) && !hostdata->connected) - barrier(); - return; - } - /* + break; + case DISCONNECT: + /* Accept message by clearing ACK */ + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); + hostdata->connected = NULL; + list_add(&ncmd->list, &hostdata->disconnected); + dsprintk(NDEBUG_INFORMATION | NDEBUG_QUEUES, + instance, "connected command %p for target %d lun %llu moved to disconnected queue\n", + cmd, scmd_id(cmd), cmd->device->lun); + + /* + * Restore phase bits to 0 so an interrupted selection, + * arbitration can resume. + */ + NCR5380_write(TARGET_COMMAND_REG, 0); + + /* Enable reselect interrupts */ + NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); + return; + /* * The SCSI data pointer is *IMPLICITLY* saved on a disconnect - * operation, in violation of the SCSI spec so we can safely + * operation, in violation of the SCSI spec so we can safely * ignore SAVE/RESTORE pointers calls. * - * Unfortunately, some disks violate the SCSI spec and + * Unfortunately, some disks violate the SCSI spec and * don't issue the required SAVE_POINTERS message before - * disconnecting, and we have to break spec to remain + * disconnecting, and we have to break spec to remain * compatible. */ case SAVE_POINTERS: @@ -2269,31 +1953,28 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) { NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); break; case EXTENDED_MESSAGE: -/* - * Extended messages are sent in the following format : - * Byte - * 0 EXTENDED_MESSAGE == 1 - * 1 length (includes one byte for code, doesn't - * include first two bytes) - * 2 code - * 3..length+1 arguments - * - * Start the extended message buffer with the EXTENDED_MESSAGE - * byte, since spi_print_msg() wants the whole thing. - */ + /* + * Start the message buffer with the EXTENDED_MESSAGE + * byte, since spi_print_msg() wants the whole thing. 
+ */ extended_msg[0] = EXTENDED_MESSAGE; /* Accept first byte by clearing ACK */ NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - dprintk(NDEBUG_EXTENDED, "scsi%d : receiving extended message\n", instance->host_no); + + spin_unlock_irq(&hostdata->lock); + + dsprintk(NDEBUG_EXTENDED, instance, "receiving extended message\n"); len = 2; data = extended_msg + 1; phase = PHASE_MSGIN; NCR5380_transfer_pio(instance, &phase, &len, &data); + dsprintk(NDEBUG_EXTENDED, instance, "length %d, code 0x%02x\n", + (int)extended_msg[1], + (int)extended_msg[2]); - dprintk(NDEBUG_EXTENDED, "scsi%d : length=%d, code=0x%02x\n", instance->host_no, (int) extended_msg[1], (int) extended_msg[2]); - - if (!len && extended_msg[1] <= (sizeof(extended_msg) - 1)) { + if (!len && extended_msg[1] > 0 && + extended_msg[1] <= sizeof(extended_msg) - 2) { /* Accept third byte by clearing ACK */ NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); len = extended_msg[1] - 1; @@ -2301,7 +1982,8 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) { phase = PHASE_MSGIN; NCR5380_transfer_pio(instance, &phase, &len, &data); - dprintk(NDEBUG_EXTENDED, "scsi%d : message received, residual %d\n", instance->host_no, len); + dsprintk(NDEBUG_EXTENDED, instance, "message received, residual %d\n", + len); switch (extended_msg[2]) { case EXTENDED_SDTR: @@ -2311,34 +1993,42 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) { tmp = 0; } } else if (len) { - printk("scsi%d: error receiving extended message\n", instance->host_no); + shost_printk(KERN_ERR, instance, "error receiving extended message\n"); tmp = 0; } else { - printk("scsi%d: extended message code %02x length %d is too long\n", instance->host_no, extended_msg[2], extended_msg[1]); + shost_printk(KERN_NOTICE, instance, "extended message code %02x length %d is too long\n", + extended_msg[2], extended_msg[1]); tmp = 0; } + + spin_lock_irq(&hostdata->lock); + if (!hostdata->connected) + return; + /* Fall through to reject message */ - /* - * If we get something weird that we aren't expecting, + /* + * If we get something weird that we aren't expecting, * reject it. 
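[Annotation] The stricter length check above is easier to see against the wire format the deleted comment used to spell out. Taking a typical SDTR response as a concrete example:

    /*
     *   byte 0   0x01   EXTENDED_MESSAGE
     *   byte 1   0x03   length = code byte + arguments
     *                   (the first two bytes are not counted)
     *   byte 2   0x01   EXTENDED_SDTR
     *   byte 3   0x19   transfer period factor (25 x 4 ns = 100 ns)
     *   byte 4   0x08   REQ/ACK offset
     */

The whole message occupies length + 2 bytes of extended_msg[], so length may be at most sizeof(extended_msg) - 2; the new "> 0" guard also keeps len = extended_msg[1] - 1 from going negative when a target sends a zero length.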
*/ default: if (!tmp) { - printk("scsi%d: rejecting message ", instance->host_no); + shost_printk(KERN_ERR, instance, "rejecting message "); spi_print_msg(extended_msg); printk("\n"); } else if (tmp != EXTENDED_MESSAGE) scmd_printk(KERN_INFO, cmd, - "rejecting unknown message %02x\n",tmp); + "rejecting unknown message %02x\n", + tmp); else scmd_printk(KERN_INFO, cmd, - "rejecting unknown extended message code %02x, length %d\n", extended_msg[1], extended_msg[0]); + "rejecting unknown extended message code %02x, length %d\n", + extended_msg[1], extended_msg[0]); msgout = MESSAGE_REJECT; NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN); break; - } /* switch (tmp) */ + } /* switch (tmp) */ break; case PHASE_MSGOUT: len = 1; @@ -2346,10 +2036,9 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) { hostdata->last_message = msgout; NCR5380_transfer_pio(instance, &phase, &len, &data); if (msgout == ABORT) { - hostdata->busy[cmd->device->id] &= ~(1 << (cmd->device->lun & 0xFF)); hostdata->connected = NULL; cmd->result = DID_ERROR << 16; - cmd->scsi_done(cmd); + complete_cmd(instance, cmd); NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); return; } @@ -2358,17 +2047,12 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) { case PHASE_CMDOUT: len = cmd->cmd_len; data = cmd->cmnd; - /* - * XXX for performance reasons, on machines with a - * PSEUDO-DMA architecture we should probably - * use the dma transfer function. + /* + * XXX for performance reasons, on machines with a + * PSEUDO-DMA architecture we should probably + * use the dma transfer function. */ NCR5380_transfer_pio(instance, &phase, &len, &data); - if (!cmd->device->disconnect && should_disconnect(cmd->cmnd[0])) { - NCR5380_set_timer(hostdata, USLEEP_SLEEP); - dprintk(NDEBUG_USLEEP, "scsi%d : issued command, sleeping until %lu\n", instance->host_no, hostdata->time_expires); - return; - } break; case PHASE_STATIN: len = 1; @@ -2377,46 +2061,37 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) { cmd->SCp.Status = tmp; break; default: - printk("scsi%d : unknown phase\n", instance->host_no); + shost_printk(KERN_ERR, instance, "unknown phase\n"); NCR5380_dprint(NDEBUG_ANY, instance); - } /* switch(phase) */ - } /* if (tmp * SR_REQ) */ - else { - /* RvC: go to sleep if polling time expired - */ - if (!cmd->device->disconnect && time_after_eq(jiffies, poll_time)) { - NCR5380_set_timer(hostdata, USLEEP_SLEEP); - dprintk(NDEBUG_USLEEP, "scsi%d : poll timed out, sleeping until %lu\n", instance->host_no, hostdata->time_expires); - return; - } + } /* switch(phase) */ + } else { + spin_unlock_irq(&hostdata->lock); + NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, HZ); + spin_lock_irq(&hostdata->lock); } - } /* while (1) */ + } } /* * Function : void NCR5380_reselect (struct Scsi_Host *instance) * - * Purpose : does reselection, initializing the instance->connected - * field to point to the scsi_cmnd for which the I_T_L or I_T_L_Q - * nexus has been reestablished, - * - * Inputs : instance - this instance of the NCR5380. + * Purpose : does reselection, initializing the instance->connected + * field to point to the scsi_cmnd for which the I_T_L or I_T_L_Q + * nexus has been reestablished, * - * Locks: io_request_lock held by caller if IRQ driven + * Inputs : instance - this instance of the NCR5380. 
*/ -static void NCR5380_reselect(struct Scsi_Host *instance) { - NCR5380_local_declare(); - struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) - instance->hostdata; +static void NCR5380_reselect(struct Scsi_Host *instance) +{ + struct NCR5380_hostdata *hostdata = shost_priv(instance); unsigned char target_mask; unsigned char lun, phase; int len; unsigned char msg[3]; unsigned char *data; - struct scsi_cmnd *tmp = NULL, *prev; - int abort = 0; - NCR5380_setup(instance); + struct NCR5380_cmd *ncmd; + struct scsi_cmnd *tmp; /* * Disable arbitration, etc. since the host adapter obviously @@ -2424,12 +2099,12 @@ static void NCR5380_reselect(struct Scsi_Host *instance) { */ NCR5380_write(MODE_REG, MR_BASE); - hostdata->restart_select = 1; target_mask = NCR5380_read(CURRENT_SCSI_DATA_REG) & ~(hostdata->id_mask); - dprintk(NDEBUG_SELECTION, "scsi%d : reselect\n", instance->host_no); - /* + dsprintk(NDEBUG_RESELECTION, instance, "reselect\n"); + + /* * At this point, we have detected that our SCSI ID is on the bus, * SEL is true and BSY was false for at least one bus settle delay * (400 ns). @@ -2439,103 +2114,110 @@ static void NCR5380_reselect(struct Scsi_Host *instance) { */ NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_BSY); - - /* FIXME: timeout too long, must fail to workqueue */ - if(NCR5380_poll_politely(instance, STATUS_REG, SR_SEL, 0, 2*HZ)<0) - abort = 1; - + if (NCR5380_poll_politely(instance, + STATUS_REG, SR_SEL, 0, 2 * HZ) < 0) { + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); + return; + } NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); /* * Wait for target to go into MSGIN. - * FIXME: timeout needed and fail to work queeu */ - if(NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, 2*HZ)) - abort = 1; + if (NCR5380_poll_politely(instance, + STATUS_REG, SR_REQ, SR_REQ, 2 * HZ) < 0) { + do_abort(instance); + return; + } len = 1; data = msg; phase = PHASE_MSGIN; NCR5380_transfer_pio(instance, &phase, &len, &data); + if (len) { + do_abort(instance); + return; + } + if (!(msg[0] & 0x80)) { - printk(KERN_ERR "scsi%d : expecting IDENTIFY message, got ", instance->host_no); + shost_printk(KERN_ERR, instance, "expecting IDENTIFY message, got "); spi_print_msg(msg); - abort = 1; - } else { - /* Accept message by clearing ACK */ - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - lun = (msg[0] & 0x07); + printk("\n"); + do_abort(instance); + return; + } + lun = msg[0] & 0x07; - /* - * We need to add code for SCSI-II to track which devices have - * I_T_L_Q nexuses established, and which have simple I_T_L - * nexuses so we can chose to do additional data transfer. - */ + /* + * We need to add code for SCSI-II to track which devices have + * I_T_L_Q nexuses established, and which have simple I_T_L + * nexuses so we can chose to do additional data transfer. + */ - /* - * Find the command corresponding to the I_T_L or I_T_L_Q nexus we - * just reestablished, and remove it from the disconnected queue. - */ + /* + * Find the command corresponding to the I_T_L or I_T_L_Q nexus we + * just reestablished, and remove it from the disconnected queue. 
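[Annotation] A worked example of the target_mask computation near the top of this function, assuming the conventional initiator ID 7 (id_mask == 0x80). A reselecting target 2 drives both IDs onto the data bus:

    /*
     *   CURRENT_SCSI_DATA_REG == 0x84             (1 << 7 | 1 << 2)
     *   target_mask == 0x84 & ~0x80 == 0x04 == 1 << 2
     */

which is then matched against 1 << scmd_id(cmd) while walking the disconnected list below.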
+ */ + tmp = NULL; + list_for_each_entry(ncmd, &hostdata->disconnected, list) { + struct scsi_cmnd *cmd = NCR5380_to_scmd(ncmd); - for (tmp = (struct scsi_cmnd *) hostdata->disconnected_queue, prev = NULL; tmp; prev = tmp, tmp = (struct scsi_cmnd *) tmp->host_scribble) - if ((target_mask == (1 << tmp->device->id)) && (lun == (u8)tmp->device->lun) - ) { - if (prev) { - REMOVE(prev, prev->host_scribble, tmp, tmp->host_scribble); - prev->host_scribble = tmp->host_scribble; - } else { - REMOVE(-1, hostdata->disconnected_queue, tmp, tmp->host_scribble); - hostdata->disconnected_queue = (struct scsi_cmnd *) tmp->host_scribble; - } - tmp->host_scribble = NULL; - break; - } - if (!tmp) { - printk(KERN_ERR "scsi%d : warning : target bitmask %02x lun %d not in disconnect_queue.\n", instance->host_no, target_mask, lun); - /* - * Since we have an established nexus that we can't do anything with, - * we must abort it. - */ - abort = 1; + if (target_mask == (1 << scmd_id(cmd)) && + lun == (u8)cmd->device->lun) { + list_del(&ncmd->list); + tmp = cmd; + break; } } - if (abort) { - do_abort(instance); + if (tmp) { + dsprintk(NDEBUG_RESELECTION | NDEBUG_QUEUES, instance, + "reselect: removed %p from disconnected queue\n", tmp); } else { - hostdata->connected = tmp; - dprintk(NDEBUG_RESELECTION, "scsi%d : nexus established, target = %d, lun = %llu, tag = %d\n", instance->host_no, tmp->device->id, tmp->device->lun, tmp->tag); + shost_printk(KERN_ERR, instance, "target bitmask 0x%02x lun %d not in disconnected queue.\n", + target_mask, lun); + /* + * Since we have an established nexus that we can't do anything + * with, we must abort it. + */ + do_abort(instance); + return; } + + /* Accept message by clearing ACK */ + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); + + hostdata->connected = tmp; + dsprintk(NDEBUG_RESELECTION, instance, "nexus established, target %d, lun %llu, tag %d\n", + scmd_id(tmp), tmp->device->lun, tmp->tag); } /* * Function : void NCR5380_dma_complete (struct Scsi_Host *instance) * * Purpose : called by interrupt handler when DMA finishes or a phase - * mismatch occurs (which would finish the DMA transfer). + * mismatch occurs (which would finish the DMA transfer). * * Inputs : instance - this instance of the NCR5380. * * Returns : pointer to the scsi_cmnd structure for which the I_T_L - * nexus has been reestablished, on failure NULL is returned. + * nexus has been reestablished, on failure NULL is returned. */ #ifdef REAL_DMA static void NCR5380_dma_complete(NCR5380_instance * instance) { - NCR5380_local_declare(); - struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) instance->hostdata; + struct NCR5380_hostdata *hostdata = shost_priv(instance); int transferred; - NCR5380_setup(instance); /* * XXX this might not be right. * * Wait for final byte to transfer, ie wait for ACK to go false. 
* - * We should use the Last Byte Sent bit, unfortunately this is + * We should use the Last Byte Sent bit, unfortunately this is * not available on the 5380/5381 (only the various CMOS chips) * * FIXME: timeout, and need to handle long timeout/irq case @@ -2543,7 +2225,6 @@ static void NCR5380_dma_complete(NCR5380_instance * instance) { NCR5380_poll_politely(instance, BUS_AND_STATUS_REG, BASR_ACK, 0, 5*HZ); - NCR5380_write(MODE_REG, MR_BASE); NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); /* @@ -2560,190 +2241,251 @@ static void NCR5380_dma_complete(NCR5380_instance * instance) { } #endif /* def REAL_DMA */ -/* - * Function : int NCR5380_abort (struct scsi_cmnd *cmd) - * - * Purpose : abort a command - * - * Inputs : cmd - the scsi_cmnd to abort, code - code to set the - * host byte of the result field to, if zero DID_ABORTED is - * used. - * - * Returns : SUCCESS - success, FAILED on failure. - * - * XXX - there is no way to abort the command that is currently - * connected, you have to wait for it to complete. If this is - * a problem, we could implement longjmp() / setjmp(), setjmp() - * called where the loop started in NCR5380_main(). - * - * Locks: host lock taken by caller +/** + * list_find_cmd - test for presence of a command in a linked list + * @haystack: list of commands + * @needle: command to search for */ -static int NCR5380_abort(struct scsi_cmnd *cmd) +static bool list_find_cmd(struct list_head *haystack, + struct scsi_cmnd *needle) { - NCR5380_local_declare(); - struct Scsi_Host *instance = cmd->device->host; - struct NCR5380_hostdata *hostdata = (struct NCR5380_hostdata *) instance->hostdata; - struct scsi_cmnd *tmp, **prev; + struct NCR5380_cmd *ncmd; - scmd_printk(KERN_WARNING, cmd, "aborting command\n"); + list_for_each_entry(ncmd, haystack, list) + if (NCR5380_to_scmd(ncmd) == needle) + return true; + return false; +} - NCR5380_print_status(instance); +/** + * list_remove_cmd - remove a command from linked list + * @haystack: list of commands + * @needle: command to remove + */ - NCR5380_setup(instance); +static bool list_del_cmd(struct list_head *haystack, + struct scsi_cmnd *needle) +{ + if (list_find_cmd(haystack, needle)) { + struct NCR5380_cmd *ncmd = scsi_cmd_priv(needle); - dprintk(NDEBUG_ABORT, "scsi%d : abort called\n", instance->host_no); - dprintk(NDEBUG_ABORT, " basr 0x%X, sr 0x%X\n", NCR5380_read(BUS_AND_STATUS_REG), NCR5380_read(STATUS_REG)); + list_del(&ncmd->list); + return true; + } + return false; +} -#if 0 -/* - * Case 1 : If the command is the currently executing command, - * we'll set the aborted flag and return control so that - * information transfer routine can exit cleanly. +/** + * NCR5380_abort - scsi host eh_abort_handler() method + * @cmd: the command to be aborted + * + * Try to abort a given command by removing it from queues and/or sending + * the target an abort message. This may not succeed in causing a target + * to abort the command. Nonetheless, the low-level driver must forget about + * the command because the mid-layer reclaims it and it may be re-issued. + * + * The normal path taken by a command is as follows. For EH we trace this + * same path to locate and abort the command. + * + * unissued -> selecting -> [unissued -> selecting ->]... connected -> + * [disconnected -> connected ->]... + * [autosense -> connected ->] done + * + * If cmd is unissued then just remove it. + * If cmd is disconnected, try to select the target. + * If cmd is connected, try to send an abort message. 
+ * If cmd is waiting for autosense, give it a chance to complete but check + * that it isn't left connected. + * If cmd was not found at all then presumably it has already been completed, + * in which case return SUCCESS to try to avoid further EH measures. + * If the command has not completed yet, we must not fail to find it. */ - if (hostdata->connected == cmd) { - dprintk(NDEBUG_ABORT, "scsi%d : aborting connected command\n", instance->host_no); - hostdata->aborted = 1; -/* - * We should perform BSY checking, and make sure we haven't slipped - * into BUS FREE. - */ +static int NCR5380_abort(struct scsi_cmnd *cmd) +{ + struct Scsi_Host *instance = cmd->device->host; + struct NCR5380_hostdata *hostdata = shost_priv(instance); + unsigned long flags; + int result = SUCCESS; - NCR5380_write(INITIATOR_COMMAND_REG, ICR_ASSERT_ATN); -/* - * Since we can't change phases until we've completed the current - * handshake, we have to source or sink a byte of data if the current - * phase is not MSGOUT. - */ + spin_lock_irqsave(&hostdata->lock, flags); -/* - * Return control to the executing NCR drive so we can clear the - * aborted flag and get back into our main loop. - */ +#if (NDEBUG & NDEBUG_ANY) + scmd_printk(KERN_INFO, cmd, __func__); +#endif + NCR5380_dprint(NDEBUG_ANY, instance); + NCR5380_dprint_phase(NDEBUG_ANY, instance); - return SUCCESS; + if (list_del_cmd(&hostdata->unissued, cmd)) { + dsprintk(NDEBUG_ABORT, instance, + "abort: removed %p from issue queue\n", cmd); + cmd->result = DID_ABORT << 16; + cmd->scsi_done(cmd); /* No tag or busy flag to worry about */ } -#endif -/* - * Case 2 : If the command hasn't been issued yet, we simply remove it - * from the issue queue. - */ - - dprintk(NDEBUG_ABORT, "scsi%d : abort going into loop.\n", instance->host_no); - for (prev = (struct scsi_cmnd **) &(hostdata->issue_queue), tmp = (struct scsi_cmnd *) hostdata->issue_queue; tmp; prev = (struct scsi_cmnd **) &(tmp->host_scribble), tmp = (struct scsi_cmnd *) tmp->host_scribble) - if (cmd == tmp) { - REMOVE(5, *prev, tmp, tmp->host_scribble); - (*prev) = (struct scsi_cmnd *) tmp->host_scribble; - tmp->host_scribble = NULL; - tmp->result = DID_ABORT << 16; - dprintk(NDEBUG_ABORT, "scsi%d : abort removed command from issue queue.\n", instance->host_no); - tmp->scsi_done(tmp); - return SUCCESS; + if (hostdata->selecting == cmd) { + dsprintk(NDEBUG_ABORT, instance, + "abort: cmd %p == selecting\n", cmd); + hostdata->selecting = NULL; + cmd->result = DID_ABORT << 16; + complete_cmd(instance, cmd); + goto out; + } + + if (list_del_cmd(&hostdata->disconnected, cmd)) { + dsprintk(NDEBUG_ABORT, instance, + "abort: removed %p from disconnected list\n", cmd); + cmd->result = DID_ERROR << 16; + if (!hostdata->connected) + NCR5380_select(instance, cmd); + if (hostdata->connected != cmd) { + complete_cmd(instance, cmd); + result = FAILED; + goto out; + } + } + + if (hostdata->connected == cmd) { + dsprintk(NDEBUG_ABORT, instance, "abort: cmd %p is connected\n", cmd); + hostdata->connected = NULL; + if (do_abort(instance)) { + set_host_byte(cmd, DID_ERROR); + complete_cmd(instance, cmd); + result = FAILED; + goto out; } -#if (NDEBUG & NDEBUG_ABORT) - /* KLL */ - else if (prev == tmp) - printk(KERN_ERR "scsi%d : LOOP\n", instance->host_no); + set_host_byte(cmd, DID_ABORT); +#ifdef REAL_DMA + hostdata->dma_len = 0; #endif + if (cmd->cmnd[0] == REQUEST_SENSE) + complete_cmd(instance, cmd); + else { + struct NCR5380_cmd *ncmd = scsi_cmd_priv(cmd); -/* - * Case 3 : If any commands are connected, we're going to fail 
the abort - * and let the high level SCSI driver retry at a later time or - * issue a reset. - * - * Timeouts, and therefore aborted commands, will be highly unlikely - * and handling them cleanly in this situation would make the common - * case of noresets less efficient, and would pollute our code. So, - * we fail. - */ + /* Perform autosense for this command */ + list_add(&ncmd->list, &hostdata->autosense); + } + } - if (hostdata->connected) { - dprintk(NDEBUG_ABORT, "scsi%d : abort failed, command connected.\n", instance->host_no); - return FAILED; + if (list_find_cmd(&hostdata->autosense, cmd)) { + dsprintk(NDEBUG_ABORT, instance, + "abort: found %p on sense queue\n", cmd); + spin_unlock_irqrestore(&hostdata->lock, flags); + queue_work(hostdata->work_q, &hostdata->main_task); + msleep(1000); + spin_lock_irqsave(&hostdata->lock, flags); + if (list_del_cmd(&hostdata->autosense, cmd)) { + dsprintk(NDEBUG_ABORT, instance, + "abort: removed %p from sense queue\n", cmd); + set_host_byte(cmd, DID_ABORT); + complete_cmd(instance, cmd); + goto out; + } } -/* - * Case 4: If the command is currently disconnected from the bus, and - * there are no connected commands, we reconnect the I_T_L or - * I_T_L_Q nexus associated with it, go into message out, and send - * an abort message. - * - * This case is especially ugly. In order to reestablish the nexus, we - * need to call NCR5380_select(). The easiest way to implement this - * function was to abort if the bus was busy, and let the interrupt - * handler triggered on the SEL for reselect take care of lost arbitrations - * where necessary, meaning interrupts need to be enabled. - * - * When interrupts are enabled, the queues may change - so we - * can't remove it from the disconnected queue before selecting it - * because that could cause a failure in hashing the nexus if that - * device reselected. - * - * Since the queues may change, we can't use the pointers from when we - * first locate it. - * - * So, we must first locate the command, and if NCR5380_select() - * succeeds, then issue the abort, relocate the command and remove - * it from the disconnected queue. 
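[Annotation] The hand-rolled host_scribble walks being deleted in these cases are what list_find_cmd() and list_del_cmd() (added above) now express in one step each, since deletion doubles as the membership test. For instance, the unissued case reduces to:

    if (list_del_cmd(&hostdata->unissued, cmd)) {
        /* never selected: no tag or busy bit to clean up */
        cmd->result = DID_ABORT << 16;
        cmd->scsi_done(cmd);
    }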
- */ - for (tmp = (struct scsi_cmnd *) hostdata->disconnected_queue; tmp; tmp = (struct scsi_cmnd *) tmp->host_scribble) - if (cmd == tmp) { - dprintk(NDEBUG_ABORT, "scsi%d : aborting disconnected command.\n", instance->host_no); + if (hostdata->connected == cmd) { + dsprintk(NDEBUG_ABORT, instance, "abort: cmd %p is connected\n", cmd); + hostdata->connected = NULL; + if (do_abort(instance)) { + set_host_byte(cmd, DID_ERROR); + complete_cmd(instance, cmd); + result = FAILED; + goto out; + } + set_host_byte(cmd, DID_ABORT); +#ifdef REAL_DMA + hostdata->dma_len = 0; +#endif + complete_cmd(instance, cmd); + } - if (NCR5380_select(instance, cmd)) - return FAILED; - dprintk(NDEBUG_ABORT, "scsi%d : nexus reestablished.\n", instance->host_no); +out: + if (result == FAILED) + dsprintk(NDEBUG_ABORT, instance, "abort: failed to abort %p\n", cmd); + else + dsprintk(NDEBUG_ABORT, instance, "abort: successfully aborted %p\n", cmd); - do_abort(instance); + queue_work(hostdata->work_q, &hostdata->main_task); + spin_unlock_irqrestore(&hostdata->lock, flags); - for (prev = (struct scsi_cmnd **) &(hostdata->disconnected_queue), tmp = (struct scsi_cmnd *) hostdata->disconnected_queue; tmp; prev = (struct scsi_cmnd **) &(tmp->host_scribble), tmp = (struct scsi_cmnd *) tmp->host_scribble) - if (cmd == tmp) { - REMOVE(5, *prev, tmp, tmp->host_scribble); - *prev = (struct scsi_cmnd *) tmp->host_scribble; - tmp->host_scribble = NULL; - tmp->result = DID_ABORT << 16; - tmp->scsi_done(tmp); - return SUCCESS; - } - } -/* - * Case 5 : If we reached this point, the command was not found in any of - * the queues. - * - * We probably reached this point because of an unlikely race condition - * between the command completing successfully and the abortion code, - * so we won't panic, but we will notify the user in case something really - * broke. - */ - printk(KERN_WARNING "scsi%d : warning : SCSI command probably completed successfully\n" - " before abortion\n", instance->host_no); - return FAILED; + return result; } -/* - * Function : int NCR5380_bus_reset (struct scsi_cmnd *cmd) - * - * Purpose : reset the SCSI bus. - * - * Returns : SUCCESS +/** + * NCR5380_bus_reset - reset the SCSI bus + * @cmd: SCSI command undergoing EH * - * Locks: host lock taken by caller + * Returns SUCCESS */ static int NCR5380_bus_reset(struct scsi_cmnd *cmd) { struct Scsi_Host *instance = cmd->device->host; + struct NCR5380_hostdata *hostdata = shost_priv(instance); + int i; + unsigned long flags; + struct NCR5380_cmd *ncmd; - NCR5380_local_declare(); - NCR5380_setup(instance); - NCR5380_print_status(instance); + spin_lock_irqsave(&hostdata->lock, flags); + +#if (NDEBUG & NDEBUG_ANY) + scmd_printk(KERN_INFO, cmd, __func__); +#endif + NCR5380_dprint(NDEBUG_ANY, instance); + NCR5380_dprint_phase(NDEBUG_ANY, instance); - spin_lock_irq(instance->host_lock); do_reset(instance); - spin_unlock_irq(instance->host_lock); + + /* reset NCR registers */ + NCR5380_write(MODE_REG, MR_BASE); + NCR5380_write(TARGET_COMMAND_REG, 0); + NCR5380_write(SELECT_ENABLE_REG, 0); + + /* After the reset, there are no more connected or disconnected commands + * and no busy units; so clear the low-level status here to avoid + * conflicts when the mid-level code tries to wake up the affected + * commands! 
+ */ + + hostdata->selecting = NULL; + + list_for_each_entry(ncmd, &hostdata->disconnected, list) { + struct scsi_cmnd *cmd = NCR5380_to_scmd(ncmd); + + set_host_byte(cmd, DID_RESET); + cmd->scsi_done(cmd); + } + + list_for_each_entry(ncmd, &hostdata->autosense, list) { + struct scsi_cmnd *cmd = NCR5380_to_scmd(ncmd); + + set_host_byte(cmd, DID_RESET); + cmd->scsi_done(cmd); + } + + if (hostdata->connected) { + set_host_byte(hostdata->connected, DID_RESET); + complete_cmd(instance, hostdata->connected); + hostdata->connected = NULL; + } + + if (hostdata->sensing) { + set_host_byte(hostdata->connected, DID_RESET); + complete_cmd(instance, hostdata->sensing); + hostdata->sensing = NULL; + } + + for (i = 0; i < 8; ++i) + hostdata->busy[i] = 0; +#ifdef REAL_DMA + hostdata->dma_len = 0; +#endif + + queue_work(hostdata->work_q, &hostdata->main_task); + spin_unlock_irqrestore(&hostdata->lock, flags); return SUCCESS; } diff --git a/drivers/scsi/NCR5380.h b/drivers/scsi/NCR5380.h index 162112dd1bf8..a79288682a74 100644 --- a/drivers/scsi/NCR5380.h +++ b/drivers/scsi/NCR5380.h @@ -22,8 +22,13 @@ #ifndef NCR5380_H #define NCR5380_H +#include <linux/delay.h> #include <linux/interrupt.h> +#include <linux/list.h> +#include <linux/workqueue.h> +#include <scsi/scsi_dbg.h> #include <scsi/scsi_eh.h> +#include <scsi/scsi_transport_spi.h> #define NDEBUG_ARBITRATION 0x1 #define NDEBUG_AUTOSENSE 0x2 @@ -158,8 +163,7 @@ /* Write any value to this register to start an ini mode DMA receive */ #define START_DMA_INITIATOR_RECEIVE_REG 7 /* wo */ -#define C400_CONTROL_STATUS_REG NCR53C400_register_offset-8 /* rw */ - +/* NCR 53C400(A) Control Status Register bits: */ #define CSR_RESET 0x80 /* wo Resets 53c400 */ #define CSR_53C80_REG 0x80 /* ro 5380 registers busy */ #define CSR_TRANS_DIR 0x40 /* rw Data transfer direction */ @@ -176,16 +180,6 @@ #define CSR_BASE CSR_53C80_INTR #endif -/* Number of 128-byte blocks to be transferred */ -#define C400_BLOCK_COUNTER_REG NCR53C400_register_offset-7 /* rw */ - -/* Resume transfer after disconnect */ -#define C400_RESUME_TRANSFER_REG NCR53C400_register_offset-6 /* wo */ - -/* Access to host buffer stack */ -#define C400_HOST_BUFFER NCR53C400_register_offset-4 /* rw */ - - /* Note : PHASE_* macros are based on the values of the STATUS register */ #define PHASE_MASK (SR_MSG | SR_CD | SR_IO) @@ -205,16 +199,6 @@ #define PHASE_SR_TO_TCR(phase) ((phase) >> 2) -/* - * The internal should_disconnect() function returns these based on the - * expected length of a disconnect if a device supports disconnect/ - * reconnect. - */ - -#define DISCONNECT_NONE 0 -#define DISCONNECT_TIME_TO_DATA 1 -#define DISCONNECT_LONG 2 - /* * "Special" value for the (unsigned char) command tag, to indicate * I_T_L nexus instead of I_T_L_Q. 
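[Annotation] The queue membership that makes the EH rework possible comes from the per-command struct introduced in the next hunks: the host templates pass .cmd_size = NCR5380_CMD_SIZE, so the midlayer allocates a struct NCR5380_cmd directly behind every struct scsi_cmnd, and both conversions are plain pointer arithmetic. A throwaway sketch of the round trip:

    static inline void ncr5380_priv_roundtrip(struct scsi_cmnd *cmd)
    {
        struct NCR5380_cmd *ncmd = scsi_cmd_priv(cmd);  /* just past cmd */

        /* NCR5380_to_scmd() steps back over the scsi_cmnd: */
        BUG_ON(NCR5380_to_scmd(ncmd) != cmd);
    }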
@@ -236,15 +220,11 @@ #define NO_IRQ 0 #endif -#define FLAG_HAS_LAST_BYTE_SENT 1 /* NCR53c81 or better */ -#define FLAG_CHECK_LAST_BYTE_SENT 2 /* Only test once */ -#define FLAG_NCR53C400 4 /* NCR53c400 */ +#define FLAG_NO_DMA_FIXUP 1 /* No DMA errata workarounds */ #define FLAG_NO_PSEUDO_DMA 8 /* Inhibit DMA */ -#define FLAG_DTC3181E 16 /* DTC3181E */ #define FLAG_LATE_DMA_SETUP 32 /* Setup NCR before DMA H/W */ #define FLAG_TAGGED_QUEUING 64 /* as X3T9.2 spelled it */ - -#ifndef ASM +#define FLAG_TOSHIBA_DELAY 128 /* Allow for borken CD-ROMs */ #ifdef SUPPORT_TAGS struct tag_alloc { @@ -258,33 +238,24 @@ struct NCR5380_hostdata { NCR5380_implementation_fields; /* implementation specific */ struct Scsi_Host *host; /* Host backpointer */ unsigned char id_mask, id_higher_mask; /* 1 << id, all bits greater */ - unsigned char targets_present; /* targets we have connected - to, so we can call a select - failure a retryable condition */ - volatile unsigned char busy[8]; /* index = target, bit = lun */ + unsigned char busy[8]; /* index = target, bit = lun */ #if defined(REAL_DMA) || defined(REAL_DMA_POLL) - volatile int dma_len; /* requested length of DMA */ + int dma_len; /* requested length of DMA */ #endif - volatile unsigned char last_message; /* last message OUT */ - volatile struct scsi_cmnd *connected; /* currently connected command */ - volatile struct scsi_cmnd *issue_queue; /* waiting to be issued */ - volatile struct scsi_cmnd *disconnected_queue; /* waiting for reconnect */ - volatile int restart_select; /* we have disconnected, - used to restart - NCR5380_select() */ - volatile unsigned aborted:1; /* flag, says aborted */ + unsigned char last_message; /* last message OUT */ + struct scsi_cmnd *connected; /* currently connected cmnd */ + struct scsi_cmnd *selecting; /* cmnd to be connected */ + struct list_head unissued; /* waiting to be issued */ + struct list_head autosense; /* priority issue queue */ + struct list_head disconnected; /* waiting for reconnect */ + spinlock_t lock; /* protects this struct */ int flags; - unsigned long time_expires; /* in jiffies, set prior to sleeping */ - int select_time; /* timer in select for target response */ - volatile struct scsi_cmnd *selecting; - struct delayed_work coroutine; /* our co-routine */ struct scsi_eh_save ses; + struct scsi_cmnd *sensing; char info[256]; int read_overruns; /* number of bytes to cut from a * transfer to handle chip overruns */ - int retain_dma_intr; struct work_struct main_task; - volatile int main_running; #ifdef SUPPORT_TAGS struct tag_alloc TagAlloc[8][8]; /* 8 targets and 8 LUNs */ #endif @@ -292,10 +263,23 @@ struct NCR5380_hostdata { unsigned spin_max_r; unsigned spin_max_w; #endif + struct workqueue_struct *work_q; + unsigned long accesses_per_ms; /* chip register accesses per ms */ }; #ifdef __KERNEL__ +struct NCR5380_cmd { + struct list_head list; +}; + +#define NCR5380_CMD_SIZE (sizeof(struct NCR5380_cmd)) + +static inline struct scsi_cmnd *NCR5380_to_scmd(struct NCR5380_cmd *ncmd_ptr) +{ + return ((struct scsi_cmnd *)ncmd_ptr) - 1; +} + #ifndef NDEBUG #define NDEBUG (0) #endif @@ -304,6 +288,11 @@ struct NCR5380_hostdata { do { if ((NDEBUG) & (flg)) \ printk(KERN_DEBUG fmt, ## __VA_ARGS__); } while (0) +#define dsprintk(flg, host, fmt, ...) 
\ + do { if ((NDEBUG) & (flg)) \ + shost_printk(KERN_DEBUG, host, fmt, ## __VA_ARGS__); \ + } while (0) + #if NDEBUG #define NCR5380_dprint(flg, arg) \ do { if ((NDEBUG) & (flg)) NCR5380_print(arg); } while (0) @@ -320,6 +309,7 @@ static void NCR5380_print(struct Scsi_Host *instance); static int NCR5380_probe_irq(struct Scsi_Host *instance, int possible); #endif static int NCR5380_init(struct Scsi_Host *instance, int flags); +static int NCR5380_maybe_reset_bus(struct Scsi_Host *); static void NCR5380_exit(struct Scsi_Host *instance); static void NCR5380_information_transfer(struct Scsi_Host *instance); #ifndef DONT_USE_INTR @@ -328,7 +318,7 @@ static irqreturn_t NCR5380_intr(int irq, void *dev_id); static void NCR5380_main(struct work_struct *work); static const char *NCR5380_info(struct Scsi_Host *instance); static void NCR5380_reselect(struct Scsi_Host *instance); -static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd); +static struct scsi_cmnd *NCR5380_select(struct Scsi_Host *, struct scsi_cmnd *); #if defined(PSEUDO_DMA) || defined(REAL_DMA) || defined(REAL_DMA_POLL) static int NCR5380_transfer_dma(struct Scsi_Host *instance, unsigned char *phase, int *count, unsigned char **data); #endif @@ -443,5 +433,4 @@ static __inline__ int NCR5380_pc_dma_residual(struct Scsi_Host *instance) #endif /* defined(i386) || defined(__alpha__) */ #endif /* defined(REAL_DMA) */ #endif /* __KERNEL__ */ -#endif /* ndef ASM */ #endif /* NCR5380_H */ diff --git a/drivers/scsi/arm/cumana_1.c b/drivers/scsi/arm/cumana_1.c index d28d6c0f18c0..221f18c5df93 100644 --- a/drivers/scsi/arm/cumana_1.c +++ b/drivers/scsi/arm/cumana_1.c @@ -4,9 +4,7 @@ * Copyright 1995-2002, Russell King */ #include <linux/module.h> -#include <linux/signal.h> #include <linux/ioport.h> -#include <linux/delay.h> #include <linux/blkdev.h> #include <linux/init.h> @@ -15,15 +13,14 @@ #include <scsi/scsi_host.h> -#include <scsi/scsicam.h> - #define PSEUDO_DMA #define priv(host) ((struct NCR5380_hostdata *)(host)->hostdata) -#define NCR5380_local_declare() struct Scsi_Host *_instance -#define NCR5380_setup(instance) _instance = instance -#define NCR5380_read(reg) cumanascsi_read(_instance, reg) -#define NCR5380_write(reg, value) cumanascsi_write(_instance, reg, value) +#define NCR5380_read(reg) cumanascsi_read(instance, reg) +#define NCR5380_write(reg, value) cumanascsi_write(instance, reg, value) + +#define NCR5380_dma_xfer_len(instance, cmd, phase) (cmd->transfersize) + #define NCR5380_intr cumanascsi_intr #define NCR5380_queue_command cumanascsi_queue_command #define NCR5380_info cumanascsi_info @@ -211,6 +208,8 @@ static struct scsi_host_template cumanascsi_template = { .cmd_per_lun = 2, .use_clustering = DISABLE_CLUSTERING, .proc_name = "CumanaSCSI-1", + .cmd_size = NCR5380_CMD_SIZE, + .max_sectors = 128, }; static int cumanascsi1_probe(struct expansion_card *ec, @@ -240,23 +239,21 @@ static int cumanascsi1_probe(struct expansion_card *ec, host->irq = ec->irq; - NCR5380_init(host, 0); + ret = NCR5380_init(host, 0); + if (ret) + goto out_unmap; + + NCR5380_maybe_reset_bus(host); priv(host)->ctrl = 0; writeb(0, priv(host)->base + CTRL); - host->n_io_port = 255; - if (!(request_region(host->io_port, host->n_io_port, "CumanaSCSI-1"))) { - ret = -EBUSY; - goto out_unmap; - } - ret = request_irq(host->irq, cumanascsi_intr, 0, "CumanaSCSI-1", host); if (ret) { printk("scsi%d: IRQ%d not free: %d\n", host->host_no, host->irq, ret); - goto out_unmap; + goto out_exit; } ret = scsi_add_host(host, &ec->dev); @@ -268,6 +265,8 @@ 
static int cumanascsi1_probe(struct expansion_card *ec, out_free_irq: free_irq(host->irq, host); + out_exit: + NCR5380_exit(host); out_unmap: iounmap(priv(host)->base); iounmap(priv(host)->dma); diff --git a/drivers/scsi/arm/oak.c b/drivers/scsi/arm/oak.c index 7c6fa1479c9c..1fab1d1896b1 100644 --- a/drivers/scsi/arm/oak.c +++ b/drivers/scsi/arm/oak.c @@ -5,9 +5,7 @@ */ #include <linux/module.h> -#include <linux/signal.h> #include <linux/ioport.h> -#include <linux/delay.h> #include <linux/blkdev.h> #include <linux/init.h> @@ -20,14 +18,16 @@ #define DONT_USE_INTR #define priv(host) ((struct NCR5380_hostdata *)(host)->hostdata) -#define NCR5380_local_declare() void __iomem *_base -#define NCR5380_setup(host) _base = priv(host)->base -#define NCR5380_read(reg) readb(_base + ((reg) << 2)) -#define NCR5380_write(reg, value) writeb(value, _base + ((reg) << 2)) +#define NCR5380_read(reg) \ + readb(priv(instance)->base + ((reg) << 2)) +#define NCR5380_write(reg, value) \ + writeb(value, priv(instance)->base + ((reg) << 2)) + +#define NCR5380_dma_xfer_len(instance, cmd, phase) (cmd->transfersize) + #define NCR5380_queue_command oakscsi_queue_command #define NCR5380_info oakscsi_info -#define NCR5380_show_info oakscsi_show_info #define NCR5380_implementation_fields \ void __iomem *base @@ -103,7 +103,6 @@ printk("reading %p len %d\n", addr, len); static struct scsi_host_template oakscsi_template = { .module = THIS_MODULE, - .show_info = oakscsi_show_info, .name = "Oak 16-bit SCSI", .info = oakscsi_info, .queuecommand = oakscsi_queue_command, @@ -115,6 +114,8 @@ static struct scsi_host_template oakscsi_template = { .cmd_per_lun = 2, .use_clustering = DISABLE_CLUSTERING, .proc_name = "oakscsi", + .cmd_size = NCR5380_CMD_SIZE, + .max_sectors = 128, }; static int oakscsi_probe(struct expansion_card *ec, const struct ecard_id *id) @@ -142,15 +143,21 @@ static int oakscsi_probe(struct expansion_card *ec, const struct ecard_id *id) host->irq = NO_IRQ; host->n_io_port = 255; - NCR5380_init(host, 0); + ret = NCR5380_init(host, 0); + if (ret) + goto out_unmap; + + NCR5380_maybe_reset_bus(host); ret = scsi_add_host(host, &ec->dev); if (ret) - goto out_unmap; + goto out_exit; scsi_scan_host(host); goto out; + out_exit: + NCR5380_exit(host); out_unmap: iounmap(priv(host)->base); unreg: diff --git a/drivers/scsi/atari_NCR5380.c b/drivers/scsi/atari_NCR5380.c index db87ece6edb2..e65478651ca9 100644 --- a/drivers/scsi/atari_NCR5380.c +++ b/drivers/scsi/atari_NCR5380.c @@ -1,15 +1,15 @@ /* * NCR 5380 generic driver routines. These should make it *trivial* - * to implement 5380 SCSI drivers under Linux with a non-trantor - * architecture. + * to implement 5380 SCSI drivers under Linux with a non-trantor + * architecture. * - * Note that these routines also work with NR53c400 family chips. + * Note that these routines also work with NR53c400 family chips. * * Copyright 1993, Drew Eckhardt - * Visionary Computing - * (Unix and Linux consulting and custom programming) - * drew@colorado.edu - * +1 (303) 666-5836 + * Visionary Computing + * (Unix and Linux consulting and custom programming) + * drew@colorado.edu + * +1 (303) 666-5836 * * For more information, please consult * @@ -24,84 +24,10 @@ * 1+ (800) 334-5454 */ -/* - * ++roman: To port the 5380 driver to the Atari, I had to do some changes in - * this file, too: - * - * - Some of the debug statements were incorrect (undefined variables and the - * like). I fixed that. - * - * - In information_transfer(), I think a #ifdef was wrong. 
Looking at the - * possible DMA transfer size should also happen for REAL_DMA. I added this - * in the #if statement. - * - * - When using real DMA, information_transfer() should return in a DATAOUT - * phase after starting the DMA. It has nothing more to do. - * - * - The interrupt service routine should run main after end of DMA, too (not - * only after RESELECTION interrupts). Additionally, it should _not_ test - * for more interrupts after running main, since a DMA process may have - * been started and interrupts are turned on now. The new int could happen - * inside the execution of NCR5380_intr(), leading to recursive - * calls. - * - * - I've added a function merge_contiguous_buffers() that tries to - * merge scatter-gather buffers that are located at contiguous - * physical addresses and can be processed with the same DMA setup. - * Since most scatter-gather operations work on a page (4K) of - * 4 buffers (1K), in more than 90% of all cases three interrupts and - * DMA setup actions are saved. - * - * - I've deleted all the stuff for AUTOPROBE_IRQ, REAL_DMA_POLL, PSEUDO_DMA - * and USLEEP, because these were messing up readability and will never be - * needed for Atari SCSI. - * - * - I've revised the NCR5380_main() calling scheme (relax the 'main_running' - * stuff), and 'main' is executed in a bottom half if awoken by an - * interrupt. - * - * - The code was quite cluttered up by "#if (NDEBUG & NDEBUG_*) printk..." - * constructs. In my eyes, this made the source rather unreadable, so I - * finally replaced that by the *_PRINTK() macros. - * - */ - -/* - * Further development / testing that should be done : - * 1. Test linked command handling code after Eric is ready with - * the high level code. - */ +/* Ported to Atari by Roman Hodek and others. */ /* Adapted for the sun3 by Sam Creasey. */ -#include <scsi/scsi_dbg.h> -#include <scsi/scsi_transport_spi.h> - -#if (NDEBUG & NDEBUG_LISTS) -#define LIST(x, y) \ - do { \ - printk("LINE:%d Adding %p to %p\n", \ - __LINE__, (void*)(x), (void*)(y)); \ - if ((x) == (y)) \ - udelay(5); \ - } while (0) -#define REMOVE(w, x, y, z) \ - do { \ - printk("LINE:%d Removing: %p->%p %p->%p \n", \ - __LINE__, (void*)(w), (void*)(x), \ - (void*)(y), (void*)(z)); \ - if ((x) == (y)) \ - udelay(5); \ - } while (0) -#else -#define LIST(x,y) -#define REMOVE(w,x,y,z) -#endif - -#ifndef notyet -#undef LINKED -#endif - /* * Design * @@ -126,17 +52,7 @@ * piece of hardware that requires you to sit in a loop polling for * the REQ signal as long as you are connected. Some devices are * brain dead (ie, many TEXEL CD ROM drives) and won't disconnect - * while doing long seek operations. - * - * The workaround for this is to keep track of devices that have - * disconnected. If the device hasn't disconnected, for commands that - * should disconnect, we do something like - * - * while (!REQ is asserted) { sleep for N usecs; poll for M usecs } - * - * Some tweaking of N and M needs to be done. An algorithm based - * on "time to data" would give the best results as long as short time - * to datas (ie, on the same track) were considered, however these + * while doing long seek operations. [...] These * broken devices are the exception rather than the rule and I'd rather * spend my time optimizing for the normal case. * @@ -177,12 +93,10 @@ * * These macros control options : * AUTOSENSE - if defined, REQUEST SENSE will be performed automatically - * for commands that return with a CHECK CONDITION status. + * for commands that return with a CHECK CONDITION status. 
* * DIFFERENTIAL - if defined, NCR53c81 chips will use external differential - * transceivers. - * - * LINKED - if defined, linked commands are supported. + * transceivers. * * REAL_DMA - if defined, REAL DMA is used during the data transfer phases. * @@ -195,17 +109,17 @@ * NCR5380_write(register, value) - write to the specific register * * NCR5380_implementation_fields - additional fields needed for this - * specific implementation of the NCR5380 + * specific implementation of the NCR5380 * * Either real DMA *or* pseudo DMA may be implemented * REAL functions : * NCR5380_REAL_DMA should be defined if real DMA is to be used. * Note that the DMA setup functions should return the number of bytes - * that they were able to program the controller for. + * that they were able to program the controller for. * * Also note that generic i386/PC versions of these macros are - * available as NCR5380_i386_dma_write_setup, - * NCR5380_i386_dma_read_setup, and NCR5380_i386_dma_residual. + * available as NCR5380_i386_dma_write_setup, + * NCR5380_i386_dma_read_setup, and NCR5380_i386_dma_residual. * * NCR5380_dma_write_setup(instance, src, count) - initialize * NCR5380_dma_read_setup(instance, dst, count) - initialize @@ -221,18 +135,8 @@ * possible) function may be used. */ -/* Macros ease life... :-) */ -#define SETUP_HOSTDATA(in) \ - struct NCR5380_hostdata *hostdata = \ - (struct NCR5380_hostdata *)(in)->hostdata -#define HOSTDATA(in) ((struct NCR5380_hostdata *)(in)->hostdata) - -#define NEXT(cmd) ((struct scsi_cmnd *)(cmd)->host_scribble) -#define SET_NEXT(cmd,next) ((cmd)->host_scribble = (void *)(next)) -#define NEXTADDR(cmd) ((struct scsi_cmnd **)&(cmd)->host_scribble) - -#define HOSTNO instance->host_no -#define H_NO(cmd) (cmd)->device->host->host_no +static int do_abort(struct Scsi_Host *); +static void do_reset(struct Scsi_Host *); #ifdef SUPPORT_TAGS @@ -251,9 +155,7 @@ * cannot know it in advance :-( We just see a QUEUE_FULL status being * returned. So, in this case, the driver internal queue size assumption is * reduced to the number of active tags if QUEUE_FULL is returned by the - * target. The command is returned to the mid-level, but with status changed - * to BUSY, since --as I've seen-- the mid-level can't handle QUEUE_FULL - * correctly. + * target. * * We're also not allowed running tagged commands as long as an untagged * command is active. 
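[Annotation] A sketch of the QUEUE_FULL adjustment the comment above describes; the handler itself is outside this excerpt, so the shape below is assumed, using only the tag_alloc fields visible in this file:

    static void example_handle_queue_full(struct NCR5380_hostdata *hostdata,
                                          struct scsi_cmnd *cmd)
    {
        u8 lun = cmd->device->lun;
        struct tag_alloc *ta = &hostdata->TagAlloc[scmd_id(cmd)][lun];

        /* clamp the per-nexus depth to what the target proved to accept */
        if (ta->queue_size > ta->nr_allocated)
            ta->queue_size = ta->nr_allocated;
    }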
And REQUEST SENSE commands after a contingent allegiance @@ -304,7 +206,8 @@ static void __init init_tags(struct NCR5380_hostdata *hostdata) static int is_lun_busy(struct scsi_cmnd *cmd, int should_be_tagged) { u8 lun = cmd->device->lun; - SETUP_HOSTDATA(cmd->device->host); + struct Scsi_Host *instance = cmd->device->host; + struct NCR5380_hostdata *hostdata = shost_priv(instance); if (hostdata->busy[cmd->device->id] & (1 << lun)) return 1; @@ -314,8 +217,8 @@ static int is_lun_busy(struct scsi_cmnd *cmd, int should_be_tagged) return 0; if (hostdata->TagAlloc[scmd_id(cmd)][lun].nr_allocated >= hostdata->TagAlloc[scmd_id(cmd)][lun].queue_size) { - dprintk(NDEBUG_TAGS, "scsi%d: target %d lun %d: no free tags\n", - H_NO(cmd), cmd->device->id, lun); + dsprintk(NDEBUG_TAGS, instance, "target %d lun %d: no free tags\n", + scmd_id(cmd), lun); return 1; } return 0; @@ -330,7 +233,8 @@ static int is_lun_busy(struct scsi_cmnd *cmd, int should_be_tagged) static void cmd_get_tag(struct scsi_cmnd *cmd, int should_be_tagged) { u8 lun = cmd->device->lun; - SETUP_HOSTDATA(cmd->device->host); + struct Scsi_Host *instance = cmd->device->host; + struct NCR5380_hostdata *hostdata = shost_priv(instance); /* If we or the target don't support tagged queuing, allocate the LUN for * an untagged command. @@ -340,18 +244,16 @@ static void cmd_get_tag(struct scsi_cmnd *cmd, int should_be_tagged) !cmd->device->tagged_supported) { cmd->tag = TAG_NONE; hostdata->busy[cmd->device->id] |= (1 << lun); - dprintk(NDEBUG_TAGS, "scsi%d: target %d lun %d now allocated by untagged " - "command\n", H_NO(cmd), cmd->device->id, lun); + dsprintk(NDEBUG_TAGS, instance, "target %d lun %d now allocated by untagged command\n", + scmd_id(cmd), lun); } else { struct tag_alloc *ta = &hostdata->TagAlloc[scmd_id(cmd)][lun]; cmd->tag = find_first_zero_bit(ta->allocated, MAX_TAGS); set_bit(cmd->tag, ta->allocated); ta->nr_allocated++; - dprintk(NDEBUG_TAGS, "scsi%d: using tag %d for target %d lun %d " - "(now %d tags in use)\n", - H_NO(cmd), cmd->tag, cmd->device->id, - lun, ta->nr_allocated); + dsprintk(NDEBUG_TAGS, instance, "using tag %d for target %d lun %d (%d tags allocated)\n", + cmd->tag, scmd_id(cmd), lun, ta->nr_allocated); } } @@ -363,21 +265,22 @@ static void cmd_get_tag(struct scsi_cmnd *cmd, int should_be_tagged) static void cmd_free_tag(struct scsi_cmnd *cmd) { u8 lun = cmd->device->lun; - SETUP_HOSTDATA(cmd->device->host); + struct Scsi_Host *instance = cmd->device->host; + struct NCR5380_hostdata *hostdata = shost_priv(instance); if (cmd->tag == TAG_NONE) { hostdata->busy[cmd->device->id] &= ~(1 << lun); - dprintk(NDEBUG_TAGS, "scsi%d: target %d lun %d untagged cmd finished\n", - H_NO(cmd), cmd->device->id, lun); + dsprintk(NDEBUG_TAGS, instance, "target %d lun %d untagged cmd freed\n", + scmd_id(cmd), lun); } else if (cmd->tag >= MAX_TAGS) { - printk(KERN_NOTICE "scsi%d: trying to free bad tag %d!\n", - H_NO(cmd), cmd->tag); + shost_printk(KERN_NOTICE, instance, + "trying to free bad tag %d!\n", cmd->tag); } else { struct tag_alloc *ta = &hostdata->TagAlloc[scmd_id(cmd)][lun]; clear_bit(cmd->tag, ta->allocated); ta->nr_allocated--; - dprintk(NDEBUG_TAGS, "scsi%d: freed tag %d for target %d lun %d\n", - H_NO(cmd), cmd->tag, cmd->device->id, lun); + dsprintk(NDEBUG_TAGS, instance, "freed tag %d for target %d lun %d\n", + cmd->tag, scmd_id(cmd), lun); } } @@ -401,17 +304,15 @@ static void free_all_tags(struct NCR5380_hostdata *hostdata) #endif /* SUPPORT_TAGS */ - -/* - * Function: void merge_contiguous_buffers( struct scsi_cmnd 
*cmd ) - * - * Purpose: Try to merge several scatter-gather requests into one DMA - * transfer. This is possible if the scatter buffers lie on - * physical contiguous addresses. - * - * Parameters: struct scsi_cmnd *cmd - * The command to work on. The first scatter buffer's data are - * assumed to be already transferred into ptr/this_residual. +/** + * merge_contiguous_buffers - coalesce scatter-gather list entries + * @cmd: command requesting IO + * + * Try to merge several scatter-gather buffers into one DMA transfer. + * This is possible if the scatter buffers lie on physically + * contiguous addresses. The first scatter-gather buffer's data are + * assumed to be already transferred into cmd->SCp.this_residual. + * Every buffer merged avoids an interrupt and a DMA setup operation. */ static void merge_contiguous_buffers(struct scsi_cmnd *cmd) @@ -463,9 +364,7 @@ static inline void initialize_SCp(struct scsi_cmnd *cmd) cmd->SCp.buffers_residual = scsi_sg_count(cmd) - 1; cmd->SCp.ptr = sg_virt(cmd->SCp.buffer); cmd->SCp.this_residual = cmd->SCp.buffer->length; - /* ++roman: Try to merge some scatter-buffers if they are at - * contiguous physical addresses. - */ + merge_contiguous_buffers(cmd); } else { cmd->SCp.buffer = NULL; @@ -473,31 +372,110 @@ static inline void initialize_SCp(struct scsi_cmnd *cmd) cmd->SCp.ptr = NULL; cmd->SCp.this_residual = 0; } + + cmd->SCp.Status = 0; + cmd->SCp.Message = 0; +} + +/** + * NCR5380_poll_politely2 - wait for two chip register values + * @instance: controller to poll + * @reg1: 5380 register to poll + * @bit1: Bitmask to check + * @val1: Expected value + * @reg2: Second 5380 register to poll + * @bit2: Second bitmask to check + * @val2: Second expected value + * @wait: Time-out in jiffies + * + * Polls the chip in a reasonably efficient manner waiting for an + * event to occur. After a short quick poll we begin to yield the CPU + * (if possible). In irq contexts the time-out is arbitrarily limited. + * Callers may hold locks as long as they are held in irq mode. + * + * Returns 0 if either or both event(s) occurred otherwise -ETIMEDOUT. 
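[Annotation] A typical call site for the helper documented above, matching the ones elsewhere in this patch: wait up to a second for the target to assert REQ, spinning only briefly before yielding the CPU:

    if (NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, HZ) < 0)
        do_abort(instance);     /* target wedged; give up on this nexus */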
+ */ + +static int NCR5380_poll_politely2(struct Scsi_Host *instance, + int reg1, int bit1, int val1, + int reg2, int bit2, int val2, int wait) +{ + struct NCR5380_hostdata *hostdata = shost_priv(instance); + unsigned long deadline = jiffies + wait; + unsigned long n; + + /* Busy-wait for up to 10 ms */ + n = min(10000U, jiffies_to_usecs(wait)); + n *= hostdata->accesses_per_ms; + n /= 2000; + do { + if ((NCR5380_read(reg1) & bit1) == val1) + return 0; + if ((NCR5380_read(reg2) & bit2) == val2) + return 0; + cpu_relax(); + } while (n--); + + if (irqs_disabled() || in_interrupt()) + return -ETIMEDOUT; + + /* Repeatedly sleep for 1 ms until deadline */ + while (time_is_after_jiffies(deadline)) { + schedule_timeout_uninterruptible(1); + if ((NCR5380_read(reg1) & bit1) == val1) + return 0; + if ((NCR5380_read(reg2) & bit2) == val2) + return 0; + } + + return -ETIMEDOUT; } -#include <linux/delay.h> +static inline int NCR5380_poll_politely(struct Scsi_Host *instance, + int reg, int bit, int val, int wait) +{ + return NCR5380_poll_politely2(instance, reg, bit, val, + reg, bit, val, wait); +} #if NDEBUG static struct { unsigned char mask; const char *name; } signals[] = { - { SR_DBP, "PARITY"}, { SR_RST, "RST" }, { SR_BSY, "BSY" }, - { SR_REQ, "REQ" }, { SR_MSG, "MSG" }, { SR_CD, "CD" }, { SR_IO, "IO" }, - { SR_SEL, "SEL" }, {0, NULL} -}, basrs[] = { - {BASR_ATN, "ATN"}, {BASR_ACK, "ACK"}, {0, NULL} -}, icrs[] = { - {ICR_ASSERT_RST, "ASSERT RST"},{ICR_ASSERT_ACK, "ASSERT ACK"}, - {ICR_ASSERT_BSY, "ASSERT BSY"}, {ICR_ASSERT_SEL, "ASSERT SEL"}, - {ICR_ASSERT_ATN, "ASSERT ATN"}, {ICR_ASSERT_DATA, "ASSERT DATA"}, + {SR_DBP, "PARITY"}, + {SR_RST, "RST"}, + {SR_BSY, "BSY"}, + {SR_REQ, "REQ"}, + {SR_MSG, "MSG"}, + {SR_CD, "CD"}, + {SR_IO, "IO"}, + {SR_SEL, "SEL"}, + {0, NULL} +}, +basrs[] = { + {BASR_ATN, "ATN"}, + {BASR_ACK, "ACK"}, {0, NULL} -}, mrs[] = { - {MR_BLOCK_DMA_MODE, "MODE BLOCK DMA"}, {MR_TARGET, "MODE TARGET"}, - {MR_ENABLE_PAR_CHECK, "MODE PARITY CHECK"}, {MR_ENABLE_PAR_INTR, - "MODE PARITY INTR"}, {MR_ENABLE_EOP_INTR,"MODE EOP INTR"}, +}, +icrs[] = { + {ICR_ASSERT_RST, "ASSERT RST"}, + {ICR_ASSERT_ACK, "ASSERT ACK"}, + {ICR_ASSERT_BSY, "ASSERT BSY"}, + {ICR_ASSERT_SEL, "ASSERT SEL"}, + {ICR_ASSERT_ATN, "ASSERT ATN"}, + {ICR_ASSERT_DATA, "ASSERT DATA"}, + {0, NULL} +}, +mrs[] = { + {MR_BLOCK_DMA_MODE, "MODE BLOCK DMA"}, + {MR_TARGET, "MODE TARGET"}, + {MR_ENABLE_PAR_CHECK, "MODE PARITY CHECK"}, + {MR_ENABLE_PAR_INTR, "MODE PARITY INTR"}, + {MR_ENABLE_EOP_INTR, "MODE EOP INTR"}, {MR_MONITOR_BSY, "MODE MONITOR BSY"}, - {MR_DMA_MODE, "MODE DMA"}, {MR_ARBITRATE, "MODE ARBITRATION"}, + {MR_DMA_MODE, "MODE DMA"}, + {MR_ARBITRATE, "MODE ARBITRATION"}, {0, NULL} }; @@ -511,15 +489,13 @@ static struct { static void NCR5380_print(struct Scsi_Host *instance) { unsigned char status, data, basr, mr, icr, i; - unsigned long flags; - local_irq_save(flags); data = NCR5380_read(CURRENT_SCSI_DATA_REG); status = NCR5380_read(STATUS_REG); mr = NCR5380_read(MODE_REG); icr = NCR5380_read(INITIATOR_COMMAND_REG); basr = NCR5380_read(BUS_AND_STATUS_REG); - local_irq_restore(flags); + printk("STATUS_REG: %02x ", status); for (i = 0; signals[i].mask; ++i) if (status & signals[i].mask) @@ -543,8 +519,12 @@ static struct { unsigned char value; const char *name; } phases[] = { - {PHASE_DATAOUT, "DATAOUT"}, {PHASE_DATAIN, "DATAIN"}, {PHASE_CMDOUT, "CMDOUT"}, - {PHASE_STATIN, "STATIN"}, {PHASE_MSGOUT, "MSGOUT"}, {PHASE_MSGIN, "MSGIN"}, + {PHASE_DATAOUT, "DATAOUT"}, + {PHASE_DATAIN, "DATAIN"}, + {PHASE_CMDOUT, "CMDOUT"}, 
+ {PHASE_STATIN, "STATIN"}, + {PHASE_MSGOUT, "MSGOUT"}, + {PHASE_MSGIN, "MSGIN"}, {PHASE_UNKNOWN, "UNKNOWN"} }; @@ -553,8 +533,6 @@ static struct { * @instance: adapter to dump * * Print the current SCSI phase for debugging purposes - * - * Locks: none */ static void NCR5380_print_phase(struct Scsi_Host *instance) @@ -564,54 +542,21 @@ static void NCR5380_print_phase(struct Scsi_Host *instance) status = NCR5380_read(STATUS_REG); if (!(status & SR_REQ)) - printk(KERN_DEBUG "scsi%d: REQ not asserted, phase unknown.\n", HOSTNO); + shost_printk(KERN_DEBUG, instance, "REQ not asserted, phase unknown.\n"); else { for (i = 0; (phases[i].value != PHASE_UNKNOWN) && (phases[i].value != (status & PHASE_MASK)); ++i) ; - printk(KERN_DEBUG "scsi%d: phase %s\n", HOSTNO, phases[i].name); + shost_printk(KERN_DEBUG, instance, "phase %s\n", phases[i].name); } } - #endif -/* - * ++roman: New scheme of calling NCR5380_main() - * - * If we're not in an interrupt, we can call our main directly, it cannot be - * already running. Else, we queue it on a task queue, if not 'main_running' - * tells us that a lower level is already executing it. This way, - * 'main_running' needs not be protected in a special way. - * - * queue_main() is a utility function for putting our main onto the task - * queue, if main_running is false. It should be called only from a - * interrupt or bottom half. - */ - -#include <linux/gfp.h> -#include <linux/workqueue.h> -#include <linux/interrupt.h> - -static inline void queue_main(struct NCR5380_hostdata *hostdata) -{ - if (!hostdata->main_running) { - /* If in interrupt and NCR5380_main() not already running, - queue it on the 'immediate' task queue, to be processed - immediately after the current interrupt processing has - finished. */ - schedule_work(&hostdata->main_task); - } - /* else: nothing to do: the running NCR5380_main() will pick up - any newly queued command. */ -} - /** * NCR58380_info - report driver and host information * @instance: relevant scsi host instance * * For use as the host template info() handler. - * - * Locks: none */ static const char *NCR5380_info(struct Scsi_Host *instance) @@ -630,13 +575,14 @@ static void prepare_info(struct Scsi_Host *instance) "base 0x%lx, irq %d, " "can_queue %d, cmd_per_lun %d, " "sg_tablesize %d, this_id %d, " - "flags { %s}, " + "flags { %s%s}, " "options { %s} ", instance->hostt->name, instance->io_port, instance->n_io_port, instance->base, instance->irq, instance->can_queue, instance->cmd_per_lun, instance->sg_tablesize, instance->this_id, hostdata->flags & FLAG_TAGGED_QUEUING ? "TAGGED_QUEUING " : "", + hostdata->flags & FLAG_TOSHIBA_DELAY ? "TOSHIBA_DELAY " : "", #ifdef DIFFERENTIAL "DIFFERENTIAL " #endif @@ -653,102 +599,6 @@ static void prepare_info(struct Scsi_Host *instance) } /** - * NCR5380_print_status - dump controller info - * @instance: controller to dump - * - * Print commands in the various queues, called from NCR5380_abort - * to aid debugging. 
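[Annotation] These queue dumps go away without a direct replacement; debugging output now goes through the dsprintk() macro added to NCR5380.h above, which is compiled out unless the matching NDEBUG bit is set and tags each line with the host. Typical usage from earlier in this patch:

    dsprintk(NDEBUG_QUEUES, instance,
             "COMMAND COMPLETE %p target %d lun %llu\n",
             cmd, scmd_id(cmd), cmd->device->lun);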
- */ - -static void lprint_Scsi_Cmnd(struct scsi_cmnd *cmd) -{ - int i, s; - unsigned char *command; - printk("scsi%d: destination target %d, lun %llu\n", - H_NO(cmd), cmd->device->id, cmd->device->lun); - printk(KERN_CONT " command = "); - command = cmd->cmnd; - printk(KERN_CONT "%2d (0x%02x)", command[0], command[0]); - for (i = 1, s = COMMAND_SIZE(command[0]); i < s; ++i) - printk(KERN_CONT " %02x", command[i]); - printk("\n"); -} - -static void NCR5380_print_status(struct Scsi_Host *instance) -{ - struct NCR5380_hostdata *hostdata; - struct scsi_cmnd *ptr; - unsigned long flags; - - NCR5380_dprint(NDEBUG_ANY, instance); - NCR5380_dprint_phase(NDEBUG_ANY, instance); - - hostdata = (struct NCR5380_hostdata *)instance->hostdata; - - local_irq_save(flags); - printk("NCR5380: coroutine is%s running.\n", - hostdata->main_running ? "" : "n't"); - if (!hostdata->connected) - printk("scsi%d: no currently connected command\n", HOSTNO); - else - lprint_Scsi_Cmnd((struct scsi_cmnd *) hostdata->connected); - printk("scsi%d: issue_queue\n", HOSTNO); - for (ptr = (struct scsi_cmnd *)hostdata->issue_queue; ptr; ptr = NEXT(ptr)) - lprint_Scsi_Cmnd(ptr); - - printk("scsi%d: disconnected_queue\n", HOSTNO); - for (ptr = (struct scsi_cmnd *) hostdata->disconnected_queue; ptr; - ptr = NEXT(ptr)) - lprint_Scsi_Cmnd(ptr); - - local_irq_restore(flags); - printk("\n"); -} - -static void show_Scsi_Cmnd(struct scsi_cmnd *cmd, struct seq_file *m) -{ - int i, s; - unsigned char *command; - seq_printf(m, "scsi%d: destination target %d, lun %llu\n", - H_NO(cmd), cmd->device->id, cmd->device->lun); - seq_puts(m, " command = "); - command = cmd->cmnd; - seq_printf(m, "%2d (0x%02x)", command[0], command[0]); - for (i = 1, s = COMMAND_SIZE(command[0]); i < s; ++i) - seq_printf(m, " %02x", command[i]); - seq_putc(m, '\n'); -} - -static int __maybe_unused NCR5380_show_info(struct seq_file *m, - struct Scsi_Host *instance) -{ - struct NCR5380_hostdata *hostdata; - struct scsi_cmnd *ptr; - unsigned long flags; - - hostdata = (struct NCR5380_hostdata *)instance->hostdata; - - local_irq_save(flags); - seq_printf(m, "NCR5380: coroutine is%s running.\n", - hostdata->main_running ? 
"" : "n't"); - if (!hostdata->connected) - seq_printf(m, "scsi%d: no currently connected command\n", HOSTNO); - else - show_Scsi_Cmnd((struct scsi_cmnd *) hostdata->connected, m); - seq_printf(m, "scsi%d: issue_queue\n", HOSTNO); - for (ptr = (struct scsi_cmnd *)hostdata->issue_queue; ptr; ptr = NEXT(ptr)) - show_Scsi_Cmnd(ptr, m); - - seq_printf(m, "scsi%d: disconnected_queue\n", HOSTNO); - for (ptr = (struct scsi_cmnd *) hostdata->disconnected_queue; ptr; - ptr = NEXT(ptr)) - show_Scsi_Cmnd(ptr, m); - - local_irq_restore(flags); - return 0; -} - -/** * NCR5380_init - initialise an NCR5380 * @instance: adapter to configure * @flags: control flags @@ -764,11 +614,11 @@ static int __maybe_unused NCR5380_show_info(struct seq_file *m, static int __init NCR5380_init(struct Scsi_Host *instance, int flags) { + struct NCR5380_hostdata *hostdata = shost_priv(instance); int i; - SETUP_HOSTDATA(instance); + unsigned long deadline; hostdata->host = instance; - hostdata->aborted = 0; hostdata->id_mask = 1 << instance->this_id; hostdata->id_higher_mask = 0; for (i = hostdata->id_mask; i <= 0x80; i <<= 1) @@ -782,13 +632,21 @@ static int __init NCR5380_init(struct Scsi_Host *instance, int flags) #if defined (REAL_DMA) hostdata->dma_len = 0; #endif - hostdata->targets_present = 0; + spin_lock_init(&hostdata->lock); hostdata->connected = NULL; - hostdata->issue_queue = NULL; - hostdata->disconnected_queue = NULL; + hostdata->sensing = NULL; + INIT_LIST_HEAD(&hostdata->autosense); + INIT_LIST_HEAD(&hostdata->unissued); + INIT_LIST_HEAD(&hostdata->disconnected); + hostdata->flags = flags; INIT_WORK(&hostdata->main_task, NCR5380_main); + hostdata->work_q = alloc_workqueue("ncr5380_%d", + WQ_UNBOUND | WQ_MEM_RECLAIM, + 1, instance->host_no); + if (!hostdata->work_q) + return -ENOMEM; prepare_info(instance); @@ -797,6 +655,72 @@ static int __init NCR5380_init(struct Scsi_Host *instance, int flags) NCR5380_write(TARGET_COMMAND_REG, 0); NCR5380_write(SELECT_ENABLE_REG, 0); + /* Calibrate register polling loop */ + i = 0; + deadline = jiffies + 1; + do { + cpu_relax(); + } while (time_is_after_jiffies(deadline)); + deadline += msecs_to_jiffies(256); + do { + NCR5380_read(STATUS_REG); + ++i; + cpu_relax(); + } while (time_is_after_jiffies(deadline)); + hostdata->accesses_per_ms = i / 256; + + return 0; +} + +/** + * NCR5380_maybe_reset_bus - Detect and correct bus wedge problems. + * @instance: adapter to check + * + * If the system crashed, it may have crashed with a connected target and + * the SCSI bus busy. Check for BUS FREE phase. If not, try to abort the + * currently established nexus, which we know nothing about. Failing that + * do a bus reset. + * + * Note that a bus reset will cause the chip to assert IRQ. + * + * Returns 0 if successful, otherwise -ENXIO. + */ + +static int NCR5380_maybe_reset_bus(struct Scsi_Host *instance) +{ + struct NCR5380_hostdata *hostdata = shost_priv(instance); + int pass; + + for (pass = 1; (NCR5380_read(STATUS_REG) & SR_BSY) && pass <= 6; ++pass) { + switch (pass) { + case 1: + case 3: + case 5: + shost_printk(KERN_ERR, instance, "SCSI bus busy, waiting up to five seconds\n"); + NCR5380_poll_politely(instance, + STATUS_REG, SR_BSY, 0, 5 * HZ); + break; + case 2: + shost_printk(KERN_ERR, instance, "bus busy, attempting abort\n"); + do_abort(instance); + break; + case 4: + shost_printk(KERN_ERR, instance, "bus busy, attempting reset\n"); + do_reset(instance); + /* Wait after a reset; the SCSI standard calls for + * 250ms, we wait 500ms to be on the safe side. 
+ * But some Toshiba CD-ROMs need ten times that. + */ + if (hostdata->flags & FLAG_TOSHIBA_DELAY) + msleep(2500); + else + msleep(500); + break; + case 6: + shost_printk(KERN_ERR, instance, "bus locked solid\n"); + return -ENXIO; + } + } return 0; } @@ -812,6 +736,38 @@ static void NCR5380_exit(struct Scsi_Host *instance) struct NCR5380_hostdata *hostdata = shost_priv(instance); cancel_work_sync(&hostdata->main_task); + destroy_workqueue(hostdata->work_q); +} + +/** + * complete_cmd - finish processing a command and return it to the SCSI ML + * @instance: the host instance + * @cmd: command to complete + */ + +static void complete_cmd(struct Scsi_Host *instance, + struct scsi_cmnd *cmd) +{ + struct NCR5380_hostdata *hostdata = shost_priv(instance); + + dsprintk(NDEBUG_QUEUES, instance, "complete_cmd: cmd %p\n", cmd); + + if (hostdata->sensing == cmd) { + /* Autosense processing ends here */ + if ((cmd->result & 0xff) != SAM_STAT_GOOD) { + scsi_eh_restore_cmnd(cmd, &hostdata->ses); + set_host_byte(cmd, DID_ERROR); + } else + scsi_eh_restore_cmnd(cmd, &hostdata->ses); + hostdata->sensing = NULL; + } + +#ifdef SUPPORT_TAGS + cmd_free_tag(cmd); +#else + hostdata->busy[scmd_id(cmd)] &= ~(1 << cmd->device->lun); +#endif + cmd->scsi_done(cmd); } /** @@ -819,7 +775,7 @@ static void NCR5380_exit(struct Scsi_Host *instance) * @instance: the relevant SCSI adapter * @cmd: SCSI command * - * cmd is added to the per instance issue_queue, with minor + * cmd is added to the per-instance issue queue, with minor * twiddling done to the host specific fields of cmd. If the * main coroutine is not running, it is restarted. */ @@ -828,44 +784,23 @@ static int NCR5380_queue_command(struct Scsi_Host *instance, struct scsi_cmnd *cmd) { struct NCR5380_hostdata *hostdata = shost_priv(instance); - struct scsi_cmnd *tmp; + struct NCR5380_cmd *ncmd = scsi_cmd_priv(cmd); unsigned long flags; #if (NDEBUG & NDEBUG_NO_WRITE) switch (cmd->cmnd[0]) { case WRITE_6: case WRITE_10: - printk(KERN_NOTICE "scsi%d: WRITE attempted with NO_WRITE debugging flag set\n", - H_NO(cmd)); + shost_printk(KERN_DEBUG, instance, "WRITE attempted with NDEBUG_NO_WRITE set\n"); cmd->result = (DID_ERROR << 16); cmd->scsi_done(cmd); return 0; } #endif /* (NDEBUG & NDEBUG_NO_WRITE) */ - /* - * We use the host_scribble field as a pointer to the next command - * in a queue - */ - - SET_NEXT(cmd, NULL); cmd->result = 0; /* - * Insert the cmd into the issue queue. Note that REQUEST SENSE - * commands are added to the head of the queue since any command will - * clear the contingent allegiance condition that exists and the - * sense data is only guaranteed to be valid while the condition exists. - */ - - /* ++guenther: now that the issue queue is being set up, we can lock ST-DMA. - * Otherwise a running NCR5380_main may steal the lock. - * Lock before actually inserting due to fairness reasons explained in - * atari_scsi.c. If we insert first, then it's impossible for this driver - * to release the lock. - * Stop timer for this command while waiting for the lock, or timeouts - * may happen (and they really do), and it's no good if the command doesn't - * appear in any of the queues. * ++roman: Just disabling the NCR interrupt isn't sufficient here, * because also a timer int can trigger an abort or reset, which would * alter queues and touch the lock. 
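The queueing hunks around here replace the driver's old hand-rolled singly linked issue queue (threaded through host_scribble via NEXT()/SET_NEXT() and guarded by local_irq_save()) with list_head queues protected by a per-host spinlock, and they keep the old rule that REQUEST SENSE goes to the head of the queue while everything else goes to the tail. A minimal sketch of that pattern follows; it uses simplified stand-in types (toy_hostdata, toy_cmd and the toy_* functions are illustrative only, not the driver's real structures):

#include <linux/list.h>
#include <linux/spinlock.h>
#include <linux/types.h>

/* Illustrative stand-ins; the real driver uses struct NCR5380_hostdata
 * and struct NCR5380_cmd with many more fields.
 */
struct toy_hostdata {
	spinlock_t lock;		/* protects the unissued list */
	struct list_head unissued;	/* commands awaiting selection */
};

struct toy_cmd {
	struct list_head list;
	bool is_request_sense;
};

static void toy_init(struct toy_hostdata *hd)
{
	spin_lock_init(&hd->lock);
	INIT_LIST_HEAD(&hd->unissued);
}

static void toy_queue_command(struct toy_hostdata *hd, struct toy_cmd *cmd)
{
	unsigned long flags;

	/* One IRQ-safe lock replaces the old local_irq_save() regions */
	spin_lock_irqsave(&hd->lock, flags);
	if (cmd->is_request_sense)
		/* Head of queue: sense data is only valid while the
		 * contingent allegiance condition persists.
		 */
		list_add(&cmd->list, &hd->unissued);
	else
		list_add_tail(&cmd->list, &hd->unissued);
	spin_unlock_irqrestore(&hd->lock, flags);
}

Consumers (the counterpart of dequeue_next_cmd() in the patch) walk the list under the same lock, which is what lets the interrupt handler and the main coroutine share the queues safely.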
@@ -873,7 +808,7 @@ static int NCR5380_queue_command(struct Scsi_Host *instance, if (!NCR5380_acquire_dma_irq(instance)) return SCSI_MLQUEUE_HOST_BUSY; - local_irq_save(flags); + spin_lock_irqsave(&hostdata->lock, flags); /* * Insert the cmd into the issue queue. Note that REQUEST SENSE @@ -882,33 +817,18 @@ static int NCR5380_queue_command(struct Scsi_Host *instance, * sense data is only guaranteed to be valid while the condition exists. */ - if (!(hostdata->issue_queue) || (cmd->cmnd[0] == REQUEST_SENSE)) { - LIST(cmd, hostdata->issue_queue); - SET_NEXT(cmd, hostdata->issue_queue); - hostdata->issue_queue = cmd; - } else { - for (tmp = (struct scsi_cmnd *)hostdata->issue_queue; - NEXT(tmp); tmp = NEXT(tmp)) - ; - LIST(cmd, tmp); - SET_NEXT(tmp, cmd); - } - local_irq_restore(flags); + if (cmd->cmnd[0] == REQUEST_SENSE) + list_add(&ncmd->list, &hostdata->unissued); + else + list_add_tail(&ncmd->list, &hostdata->unissued); - dprintk(NDEBUG_QUEUES, "scsi%d: command added to %s of queue\n", H_NO(cmd), - (cmd->cmnd[0] == REQUEST_SENSE) ? "head" : "tail"); + spin_unlock_irqrestore(&hostdata->lock, flags); - /* If queue_command() is called from an interrupt (real one or bottom - * half), we let queue_main() do the job of taking care about main. If it - * is already running, this is a no-op, else main will be queued. - * - * If we're not in an interrupt, we can call NCR5380_main() - * unconditionally, because it cannot be already running. - */ - if (in_interrupt() || irqs_disabled()) - queue_main(hostdata); - else - NCR5380_main(&hostdata->main_task); + dsprintk(NDEBUG_QUEUES, instance, "command %p added to %s of queue\n", + cmd, (cmd->cmnd[0] == REQUEST_SENSE) ? "head" : "tail"); + + /* Kick off command processing */ + queue_work(hostdata->work_q, &hostdata->main_task); return 0; } @@ -917,22 +837,85 @@ static inline void maybe_release_dma_irq(struct Scsi_Host *instance) struct NCR5380_hostdata *hostdata = shost_priv(instance); /* Caller does the locking needed to set & test these data atomically */ - if (!hostdata->disconnected_queue && - !hostdata->issue_queue && + if (list_empty(&hostdata->disconnected) && + list_empty(&hostdata->unissued) && + list_empty(&hostdata->autosense) && !hostdata->connected && - !hostdata->retain_dma_intr) + !hostdata->selecting) NCR5380_release_dma_irq(instance); } /** + * dequeue_next_cmd - dequeue a command for processing + * @instance: the scsi host instance + * + * Priority is given to commands on the autosense queue. These commands + * need autosense because of a CHECK CONDITION result. + * + * Returns a command pointer if a command is found for a target that is + * not already busy. Otherwise returns NULL. 
+ */ + +static struct scsi_cmnd *dequeue_next_cmd(struct Scsi_Host *instance) +{ + struct NCR5380_hostdata *hostdata = shost_priv(instance); + struct NCR5380_cmd *ncmd; + struct scsi_cmnd *cmd; + + if (list_empty(&hostdata->autosense)) { + list_for_each_entry(ncmd, &hostdata->unissued, list) { + cmd = NCR5380_to_scmd(ncmd); + dsprintk(NDEBUG_QUEUES, instance, "dequeue: cmd=%p target=%d busy=0x%02x lun=%llu\n", + cmd, scmd_id(cmd), hostdata->busy[scmd_id(cmd)], cmd->device->lun); + + if ( +#ifdef SUPPORT_TAGS + !is_lun_busy(cmd, 1) +#else + !(hostdata->busy[scmd_id(cmd)] & (1 << cmd->device->lun)) +#endif + ) { + list_del(&ncmd->list); + dsprintk(NDEBUG_QUEUES, instance, + "dequeue: removed %p from issue queue\n", cmd); + return cmd; + } + } + } else { + /* Autosense processing begins here */ + ncmd = list_first_entry(&hostdata->autosense, + struct NCR5380_cmd, list); + list_del(&ncmd->list); + cmd = NCR5380_to_scmd(ncmd); + dsprintk(NDEBUG_QUEUES, instance, + "dequeue: removed %p from autosense queue\n", cmd); + scsi_eh_prep_cmnd(cmd, &hostdata->ses, NULL, 0, ~0); + hostdata->sensing = cmd; + return cmd; + } + return NULL; +} + +static void requeue_cmd(struct Scsi_Host *instance, struct scsi_cmnd *cmd) +{ + struct NCR5380_hostdata *hostdata = shost_priv(instance); + struct NCR5380_cmd *ncmd = scsi_cmd_priv(cmd); + + if (hostdata->sensing) { + scsi_eh_restore_cmnd(cmd, &hostdata->ses); + list_add(&ncmd->list, &hostdata->autosense); + hostdata->sensing = NULL; + } else + list_add(&ncmd->list, &hostdata->unissued); +} + +/** * NCR5380_main - NCR state machines * * NCR5380_main is a coroutine that runs as long as more work can * be done on the NCR5380 host adapters in a system. Both * NCR5380_queue_command() and NCR5380_intr() will try to start it * in case it is not running. - * - * Locks: called as its own thread with no locks held. */ static void NCR5380_main(struct work_struct *work) @@ -940,154 +923,69 @@ static void NCR5380_main(struct work_struct *work) struct NCR5380_hostdata *hostdata = container_of(work, struct NCR5380_hostdata, main_task); struct Scsi_Host *instance = hostdata->host; - struct scsi_cmnd *tmp, *prev; + struct scsi_cmnd *cmd; int done; - unsigned long flags; /* - * We run (with interrupts disabled) until we're sure that none of - * the host adapters have anything that can be done, at which point - * we set main_running to 0 and exit. - * - * Interrupts are enabled before doing various other internal - * instructions, after we've decided that we need to run through - * the loop again. - * - * this should prevent any race conditions. - * * ++roman: Just disabling the NCR interrupt isn't sufficient here, * because also a timer int can trigger an abort or reset, which can * alter queues and touch the Falcon lock. */ - /* Tell int handlers main() is now already executing. Note that - no races are possible here. If an int comes in before - 'main_running' is set here, and queues/executes main via the - task queue, it doesn't do any harm, just this instance of main - won't find any work left to do. */ - if (hostdata->main_running) - return; - hostdata->main_running = 1; - - local_save_flags(flags); do { - local_irq_disable(); /* Freeze request queues */ done = 1; - if (!hostdata->connected) { - dprintk(NDEBUG_MAIN, "scsi%d: not connected\n", HOSTNO); - /* - * Search through the issue_queue for a command destined - * for a target that's not busy. 
- */ -#if (NDEBUG & NDEBUG_LISTS) - for (tmp = (struct scsi_cmnd *) hostdata->issue_queue, prev = NULL; - tmp && (tmp != prev); prev = tmp, tmp = NEXT(tmp)) - ; - /*printk("%p ", tmp);*/ - if ((tmp == prev) && tmp) - printk(" LOOP\n"); - /* else printk("\n"); */ -#endif - for (tmp = (struct scsi_cmnd *) hostdata->issue_queue, - prev = NULL; tmp; prev = tmp, tmp = NEXT(tmp)) { - u8 lun = tmp->device->lun; - - dprintk(NDEBUG_LISTS, - "MAIN tmp=%p target=%d busy=%d lun=%d\n", - tmp, scmd_id(tmp), hostdata->busy[scmd_id(tmp)], - lun); - /* When we find one, remove it from the issue queue. */ - /* ++guenther: possible race with Falcon locking */ - if ( -#ifdef SUPPORT_TAGS - !is_lun_busy( tmp, tmp->cmnd[0] != REQUEST_SENSE) -#else - !(hostdata->busy[tmp->device->id] & (1 << lun)) -#endif - ) { - /* ++guenther: just to be sure, this must be atomic */ - local_irq_disable(); - if (prev) { - REMOVE(prev, NEXT(prev), tmp, NEXT(tmp)); - SET_NEXT(prev, NEXT(tmp)); - } else { - REMOVE(-1, hostdata->issue_queue, tmp, NEXT(tmp)); - hostdata->issue_queue = NEXT(tmp); - } - SET_NEXT(tmp, NULL); - hostdata->retain_dma_intr++; + spin_lock_irq(&hostdata->lock); + while (!hostdata->connected && + (cmd = dequeue_next_cmd(instance))) { - /* reenable interrupts after finding one */ - local_irq_restore(flags); + dsprintk(NDEBUG_MAIN, instance, "main: dequeued %p\n", cmd); - /* - * Attempt to establish an I_T_L nexus here. - * On success, instance->hostdata->connected is set. - * On failure, we must add the command back to the - * issue queue so we can keep trying. - */ - dprintk(NDEBUG_MAIN, "scsi%d: main(): command for target %d " - "lun %d removed from issue_queue\n", - HOSTNO, tmp->device->id, lun); - /* - * REQUEST SENSE commands are issued without tagged - * queueing, even on SCSI-II devices because the - * contingent allegiance condition exists for the - * entire unit. - */ - /* ++roman: ...and the standard also requires that - * REQUEST SENSE command are untagged. - */ + /* + * Attempt to establish an I_T_L nexus here. + * On success, instance->hostdata->connected is set. + * On failure, we must add the command back to the + * issue queue so we can keep trying. + */ + /* + * REQUEST SENSE commands are issued without tagged + * queueing, even on SCSI-II devices because the + * contingent allegiance condition exists for the + * entire unit. + */ + /* ++roman: ...and the standard also requires that + * REQUEST SENSE commands are untagged. + */ #ifdef SUPPORT_TAGS - cmd_get_tag(tmp, tmp->cmnd[0] != REQUEST_SENSE); + cmd_get_tag(cmd, cmd->cmnd[0] != REQUEST_SENSE); #endif - if (!NCR5380_select(instance, tmp)) { - local_irq_disable(); - hostdata->retain_dma_intr--; - /* release if target did not respond!
*/ - maybe_release_dma_irq(instance); - local_irq_restore(flags); - break; - } else { - local_irq_disable(); - LIST(tmp, hostdata->issue_queue); - SET_NEXT(tmp, hostdata->issue_queue); - hostdata->issue_queue = tmp; + cmd = NCR5380_select(instance, cmd); + if (!cmd) { + dsprintk(NDEBUG_MAIN, instance, "main: select complete\n"); + maybe_release_dma_irq(instance); + } else { + dsprintk(NDEBUG_MAIN | NDEBUG_QUEUES, instance, + "main: select failed, returning %p to queue\n", cmd); + requeue_cmd(instance, cmd); #ifdef SUPPORT_TAGS - cmd_free_tag(tmp); + cmd_free_tag(cmd); #endif - hostdata->retain_dma_intr--; - local_irq_restore(flags); - dprintk(NDEBUG_MAIN, "scsi%d: main(): select() failed, " - "returned to issue_queue\n", HOSTNO); - if (hostdata->connected) - break; - } - } /* if target/lun/target queue is not busy */ - } /* for issue_queue */ - } /* if (!hostdata->connected) */ - + } + } if (hostdata->connected #ifdef REAL_DMA && !hostdata->dma_len #endif ) { - local_irq_restore(flags); - dprintk(NDEBUG_MAIN, "scsi%d: main: performing information transfer\n", - HOSTNO); + dsprintk(NDEBUG_MAIN, instance, "main: performing information transfer\n"); NCR5380_information_transfer(instance); - dprintk(NDEBUG_MAIN, "scsi%d: main: done set false\n", HOSTNO); done = 0; } + spin_unlock_irq(&hostdata->lock); + if (!done) + cond_resched(); } while (!done); - - /* Better allow ints _after_ 'main_running' has been cleared, else - an interrupt could believe we'll pick up the work it left for - us, but we won't see it anymore here... */ - hostdata->main_running = 0; - local_irq_restore(flags); } @@ -1096,27 +994,20 @@ static void NCR5380_main(struct work_struct *work) * Function : void NCR5380_dma_complete (struct Scsi_Host *instance) * * Purpose : Called by interrupt handler when DMA finishes or a phase - * mismatch occurs (which would finish the DMA transfer). + * mismatch occurs (which would finish the DMA transfer). * * Inputs : instance - this instance of the NCR5380. 
- * */ static void NCR5380_dma_complete(struct Scsi_Host *instance) { - SETUP_HOSTDATA(instance); + struct NCR5380_hostdata *hostdata = shost_priv(instance); int transferred; unsigned char **data; - volatile int *count; + int *count; int saved_data = 0, overrun = 0; unsigned char p; - if (!hostdata->connected) { - printk(KERN_WARNING "scsi%d: received end of DMA interrupt with " - "no connected cmd\n", HOSTNO); - return; - } - if (hostdata->read_overruns) { p = hostdata->connected->SCp.phase; if (p & SR_IO) { @@ -1126,15 +1017,11 @@ static void NCR5380_dma_complete(struct Scsi_Host *instance) (BASR_PHASE_MATCH|BASR_ACK)) { saved_data = NCR5380_read(INPUT_DATA_REG); overrun = 1; - dprintk(NDEBUG_DMA, "scsi%d: read overrun handled\n", HOSTNO); + dsprintk(NDEBUG_DMA, instance, "read overrun handled\n"); } } } - dprintk(NDEBUG_DMA, "scsi%d: real DMA transfer complete, basr 0x%X, sr 0x%X\n", - HOSTNO, NCR5380_read(BUS_AND_STATUS_REG), - NCR5380_read(STATUS_REG)); - #if defined(CONFIG_SUN3) if ((sun3scsi_dma_finish(rq_data_dir(hostdata->connected->request)))) { pr_err("scsi%d: overrun in UDC counter -- not prepared to deal with this!\n", @@ -1153,9 +1040,9 @@ static void NCR5380_dma_complete(struct Scsi_Host *instance) } #endif - (void)NCR5380_read(RESET_PARITY_INTERRUPT_REG); NCR5380_write(MODE_REG, MR_BASE); NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); + NCR5380_read(RESET_PARITY_INTERRUPT_REG); transferred = hostdata->dma_len - NCR5380_dma_residual(instance); hostdata->dma_len = 0; @@ -1194,140 +1081,160 @@ static void NCR5380_dma_complete(struct Scsi_Host *instance) * Handle interrupts, reestablishing I_T_L or I_T_L_Q nexuses * from the disconnected queue, and restarting NCR5380_main() * as required. + * + * The chip can assert IRQ in any of six different conditions. The IRQ flag + * is then cleared by reading the Reset Parity/Interrupt Register (RPIR). + * Three of these six conditions are latched in the Bus and Status Register: + * - End of DMA (cleared by ending DMA Mode) + * - Parity error (cleared by reading RPIR) + * - Loss of BSY (cleared by reading RPIR) + * Two conditions have flag bits that are not latched: + * - Bus phase mismatch (non-maskable in DMA Mode, cleared by ending DMA Mode) + * - Bus reset (non-maskable) + * The remaining condition has no flag bit at all: + * - Selection/reselection + * + * Hence, establishing the cause(s) of any interrupt is partly guesswork. + * In "The DP8490 and DP5380 Comparison Guide", National Semiconductor + * claimed that "the design of the [DP8490] interrupt logic ensures + * interrupts will not be lost (they can be on the DP5380)." + * The L5380/53C80 datasheet from LOGIC Devices has more details. + * + * Checking for bus reset by reading RST is futile because of interrupt + * latency, but a bus reset will reset chip logic. Checking for parity error + * is unnecessary because that interrupt is never enabled. A Loss of BSY + * condition will clear DMA Mode. We can tell when this occurs because the + * Busy Monitor interrupt is enabled together with DMA Mode.
*/ static irqreturn_t NCR5380_intr(int irq, void *dev_id) { struct Scsi_Host *instance = dev_id; - int done = 1, handled = 0; + struct NCR5380_hostdata *hostdata = shost_priv(instance); + int handled = 0; unsigned char basr; + unsigned long flags; - dprintk(NDEBUG_INTR, "scsi%d: NCR5380 irq triggered\n", HOSTNO); + spin_lock_irqsave(&hostdata->lock, flags); - /* Look for pending interrupts */ basr = NCR5380_read(BUS_AND_STATUS_REG); - dprintk(NDEBUG_INTR, "scsi%d: BASR=%02x\n", HOSTNO, basr); - /* dispatch to appropriate routine if found and done=0 */ if (basr & BASR_IRQ) { - NCR5380_dprint(NDEBUG_INTR, instance); - if ((NCR5380_read(STATUS_REG) & (SR_SEL|SR_IO)) == (SR_SEL|SR_IO)) { - done = 0; - dprintk(NDEBUG_INTR, "scsi%d: SEL interrupt\n", HOSTNO); - NCR5380_reselect(instance); - (void)NCR5380_read(RESET_PARITY_INTERRUPT_REG); - } else if (basr & BASR_PARITY_ERROR) { - dprintk(NDEBUG_INTR, "scsi%d: PARITY interrupt\n", HOSTNO); - (void)NCR5380_read(RESET_PARITY_INTERRUPT_REG); - } else if ((NCR5380_read(STATUS_REG) & SR_RST) == SR_RST) { - dprintk(NDEBUG_INTR, "scsi%d: RESET interrupt\n", HOSTNO); - (void)NCR5380_read(RESET_PARITY_INTERRUPT_REG); - } else { - /* - * The rest of the interrupt conditions can occur only during a - * DMA transfer - */ + unsigned char mr = NCR5380_read(MODE_REG); + unsigned char sr = NCR5380_read(STATUS_REG); + + dsprintk(NDEBUG_INTR, instance, "IRQ %d, BASR 0x%02x, SR 0x%02x, MR 0x%02x\n", + irq, basr, sr, mr); #if defined(REAL_DMA) - /* - * We should only get PHASE MISMATCH and EOP interrupts if we have - * DMA enabled, so do a sanity check based on the current setting - * of the MODE register. + if ((mr & MR_DMA_MODE) || (mr & MR_MONITOR_BSY)) { + /* Probably End of DMA, Phase Mismatch or Loss of BSY. + * We ack IRQ after clearing Mode Register. Workarounds + * for End of DMA errata need to happen in DMA Mode. 
*/ - if ((NCR5380_read(MODE_REG) & MR_DMA_MODE) && - ((basr & BASR_END_DMA_TRANSFER) || - !(basr & BASR_PHASE_MATCH))) { + dsprintk(NDEBUG_INTR, instance, "interrupt in DMA mode\n"); - dprintk(NDEBUG_INTR, "scsi%d: PHASE MISM or EOP interrupt\n", HOSTNO); - NCR5380_dma_complete( instance ); - done = 0; - } else + if (hostdata->connected) { + NCR5380_dma_complete(instance); + queue_work(hostdata->work_q, &hostdata->main_task); + } else { + NCR5380_write(MODE_REG, MR_BASE); + NCR5380_read(RESET_PARITY_INTERRUPT_REG); + } + } else #endif /* REAL_DMA */ - { -/* MS: Ignore unknown phase mismatch interrupts (caused by EOP interrupt) */ - if (basr & BASR_PHASE_MATCH) - dprintk(NDEBUG_INTR, "scsi%d: unknown interrupt, " - "BASR 0x%x, MR 0x%x, SR 0x%x\n", - HOSTNO, basr, NCR5380_read(MODE_REG), - NCR5380_read(STATUS_REG)); - (void)NCR5380_read(RESET_PARITY_INTERRUPT_REG); + if ((NCR5380_read(CURRENT_SCSI_DATA_REG) & hostdata->id_mask) && + (sr & (SR_SEL | SR_IO | SR_BSY | SR_RST)) == (SR_SEL | SR_IO)) { + /* Probably reselected */ + NCR5380_write(SELECT_ENABLE_REG, 0); + NCR5380_read(RESET_PARITY_INTERRUPT_REG); + + dsprintk(NDEBUG_INTR, instance, "interrupt with SEL and IO\n"); + + if (!hostdata->connected) { + NCR5380_reselect(instance); + queue_work(hostdata->work_q, &hostdata->main_task); + } + if (!hostdata->connected) + NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); + } else { + /* Probably Bus Reset */ + NCR5380_read(RESET_PARITY_INTERRUPT_REG); + + dsprintk(NDEBUG_INTR, instance, "unknown interrupt\n"); #ifdef SUN3_SCSI_VME - dregs->csr |= CSR_DMA_ENABLE; + dregs->csr |= CSR_DMA_ENABLE; #endif - } - } /* if !(SELECTION || PARITY) */ + } handled = 1; - } /* BASR & IRQ */ else { - printk(KERN_NOTICE "scsi%d: interrupt without IRQ bit set in BASR, " - "BASR 0x%X, MR 0x%X, SR 0x%x\n", HOSTNO, basr, - NCR5380_read(MODE_REG), NCR5380_read(STATUS_REG)); - (void)NCR5380_read(RESET_PARITY_INTERRUPT_REG); + } else { + shost_printk(KERN_NOTICE, instance, "interrupt without IRQ bit\n"); #ifdef SUN3_SCSI_VME dregs->csr |= CSR_DMA_ENABLE; #endif } - if (!done) { - dprintk(NDEBUG_INTR, "scsi%d: in int routine, calling main\n", HOSTNO); - /* Put a call to NCR5380_main() on the queue... */ - queue_main(shost_priv(instance)); - } + spin_unlock_irqrestore(&hostdata->lock, flags); + return IRQ_RETVAL(handled); } /* * Function : int NCR5380_select(struct Scsi_Host *instance, - * struct scsi_cmnd *cmd) + * struct scsi_cmnd *cmd) * * Purpose : establishes I_T_L or I_T_L_Q nexus for new or existing command, - * including ARBITRATION, SELECTION, and initial message out for - * IDENTIFY and queue messages. + * including ARBITRATION, SELECTION, and initial message out for + * IDENTIFY and queue messages. * * Inputs : instance - instantiation of the 5380 driver on which this - * target lives, cmd - SCSI command to execute. + * target lives, cmd - SCSI command to execute. * - * Returns : -1 if selection could not execute for some reason, - * 0 if selection succeeded or failed because the target - * did not respond. + * Returns cmd if selection failed but should be retried, + * NULL if selection failed and should not be retried, or + * NULL if selection succeeded (hostdata->connected == cmd). * * Side effects : - * If bus busy, arbitration failed, etc, NCR5380_select() will exit - * with registers as they should have been on entry - ie - * SELECT_ENABLE will be set appropriately, the NCR5380 - * will cease to drive any SCSI bus signals. 
+ * If bus busy, arbitration failed, etc, NCR5380_select() will exit + * with registers as they should have been on entry - ie + * SELECT_ENABLE will be set appropriately, the NCR5380 + * will cease to drive any SCSI bus signals. * - * If successful : I_T_L or I_T_L_Q nexus will be established, - * instance->connected will be set to cmd. - * SELECT interrupt will be disabled. + * If successful : I_T_L or I_T_L_Q nexus will be established, + * instance->connected will be set to cmd. + * SELECT interrupt will be disabled. * - * If failed (no target) : cmd->scsi_done() will be called, and the - * cmd->result host byte set to DID_BAD_TARGET. + * If failed (no target) : cmd->scsi_done() will be called, and the + * cmd->result host byte set to DID_BAD_TARGET. */ -static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd) +static struct scsi_cmnd *NCR5380_select(struct Scsi_Host *instance, + struct scsi_cmnd *cmd) { - SETUP_HOSTDATA(instance); + struct NCR5380_hostdata *hostdata = shost_priv(instance); unsigned char tmp[3], phase; unsigned char *data; int len; - unsigned long timeout; - unsigned long flags; + int err; - hostdata->restart_select = 0; NCR5380_dprint(NDEBUG_ARBITRATION, instance); - dprintk(NDEBUG_ARBITRATION, "scsi%d: starting arbitration, id = %d\n", HOSTNO, - instance->this_id); + dsprintk(NDEBUG_ARBITRATION, instance, "starting arbitration, id = %d\n", + instance->this_id); + + /* + * Arbitration and selection phases are slow and involve dropping the + * lock, so we have to watch out for EH. An exception handler may + * change 'selecting' to NULL. This function will then return NULL + * so that the caller will forget about 'cmd'. (During information + * transfer phases, EH may change 'connected' to NULL.) + */ + hostdata->selecting = cmd; /* * Set the phase bits to 0, otherwise the NCR5380 won't drive the * data bus during SELECTION. */ - local_irq_save(flags); - if (hostdata->connected) { - local_irq_restore(flags); - return -1; - } NCR5380_write(TARGET_COMMAND_REG, 0); /* @@ -1337,96 +1244,77 @@ static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd) NCR5380_write(OUTPUT_DATA_REG, hostdata->id_mask); NCR5380_write(MODE_REG, MR_ARBITRATE); - local_irq_restore(flags); - - /* Wait for arbitration logic to complete */ -#if defined(NCR_TIMEOUT) - { - unsigned long timeout = jiffies + 2*NCR_TIMEOUT; + /* The chip now waits for BUS FREE phase. Then after the 800 ns + * Bus Free Delay, arbitration will begin. 
+ */ - while (!(NCR5380_read(INITIATOR_COMMAND_REG) & ICR_ARBITRATION_PROGRESS) && - time_before(jiffies, timeout) && !hostdata->connected) - ; - if (time_after_eq(jiffies, timeout)) { - printk("scsi : arbitration timeout at %d\n", __LINE__); - NCR5380_write(MODE_REG, MR_BASE); - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - return -1; - } + spin_unlock_irq(&hostdata->lock); + err = NCR5380_poll_politely2(instance, MODE_REG, MR_ARBITRATE, 0, + INITIATOR_COMMAND_REG, ICR_ARBITRATION_PROGRESS, + ICR_ARBITRATION_PROGRESS, HZ); + spin_lock_irq(&hostdata->lock); + if (!(NCR5380_read(MODE_REG) & MR_ARBITRATE)) { + /* Reselection interrupt */ + goto out; } -#else /* NCR_TIMEOUT */ - while (!(NCR5380_read(INITIATOR_COMMAND_REG) & ICR_ARBITRATION_PROGRESS) && - !hostdata->connected) - ; -#endif - - dprintk(NDEBUG_ARBITRATION, "scsi%d: arbitration complete\n", HOSTNO); - - if (hostdata->connected) { + if (err < 0) { NCR5380_write(MODE_REG, MR_BASE); - return -1; + shost_printk(KERN_ERR, instance, + "select: arbitration timeout\n"); + goto out; } - /* - * The arbitration delay is 2.2us, but this is a minimum and there is - * no maximum so we can safely sleep for ceil(2.2) usecs to accommodate - * the integral nature of udelay(). - * - */ + spin_unlock_irq(&hostdata->lock); + /* The SCSI-2 arbitration delay is 2.4 us */ udelay(3); /* Check for lost arbitration */ if ((NCR5380_read(INITIATOR_COMMAND_REG) & ICR_ARBITRATION_LOST) || (NCR5380_read(CURRENT_SCSI_DATA_REG) & hostdata->id_higher_mask) || - (NCR5380_read(INITIATOR_COMMAND_REG) & ICR_ARBITRATION_LOST) || - hostdata->connected) { + (NCR5380_read(INITIATOR_COMMAND_REG) & ICR_ARBITRATION_LOST)) { NCR5380_write(MODE_REG, MR_BASE); - dprintk(NDEBUG_ARBITRATION, "scsi%d: lost arbitration, deasserting MR_ARBITRATE\n", - HOSTNO); - return -1; + dsprintk(NDEBUG_ARBITRATION, instance, "lost arbitration, deasserting MR_ARBITRATE\n"); + spin_lock_irq(&hostdata->lock); + goto out; } - /* after/during arbitration, BSY should be asserted. - IBM DPES-31080 Version S31Q works now */ - /* Tnx to Thomas_Roesch@m2.maus.de for finding this! (Roman) */ + /* After/during arbitration, BSY should be asserted. + * IBM DPES-31080 Version S31Q works now + * Tnx to Thomas_Roesch@m2.maus.de for finding this! (Roman) + */ NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_SEL | ICR_ASSERT_BSY); - if ((NCR5380_read(INITIATOR_COMMAND_REG) & ICR_ARBITRATION_LOST) || - hostdata->connected) { - NCR5380_write(MODE_REG, MR_BASE); - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - dprintk(NDEBUG_ARBITRATION, "scsi%d: lost arbitration, deasserting ICR_ASSERT_SEL\n", - HOSTNO); - return -1; - } - /* * Again, bus clear + bus settle time is 1.2us, however, this is * a minimum so we'll udelay ceil(1.2) */ -#ifdef CONFIG_ATARI_SCSI_TOSHIBA_DELAY - /* ++roman: But some targets (see above :-) seem to need a bit more... 
*/ - udelay(15); -#else - udelay(2); -#endif + if (hostdata->flags & FLAG_TOSHIBA_DELAY) + udelay(15); + else + udelay(2); - if (hostdata->connected) { + spin_lock_irq(&hostdata->lock); + + /* NCR5380_reselect() clears MODE_REG after a reselection interrupt */ + if (!(NCR5380_read(MODE_REG) & MR_ARBITRATE)) + goto out; + + if (!hostdata->selecting) { NCR5380_write(MODE_REG, MR_BASE); NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - return -1; + goto out; } - dprintk(NDEBUG_ARBITRATION, "scsi%d: won arbitration\n", HOSTNO); + dsprintk(NDEBUG_ARBITRATION, instance, "won arbitration\n"); /* * Now that we have won arbitration, start Selection process, asserting * the host and target ID's on the SCSI bus. */ - NCR5380_write(OUTPUT_DATA_REG, (hostdata->id_mask | (1 << cmd->device->id))); + NCR5380_write(OUTPUT_DATA_REG, hostdata->id_mask | (1 << scmd_id(cmd))); /* * Raise ATN while SEL is true before BSY goes false from arbitration, @@ -1434,22 +1322,18 @@ static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd) * phase immediately after selection. */ - NCR5380_write(INITIATOR_COMMAND_REG, (ICR_BASE | ICR_ASSERT_BSY | - ICR_ASSERT_DATA | ICR_ASSERT_ATN | ICR_ASSERT_SEL )); + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_BSY | + ICR_ASSERT_DATA | ICR_ASSERT_ATN | ICR_ASSERT_SEL); NCR5380_write(MODE_REG, MR_BASE); /* * Reselect interrupts must be turned off prior to the dropping of BSY, * otherwise we will trigger an interrupt. */ - - if (hostdata->connected) { - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - return -1; - } - NCR5380_write(SELECT_ENABLE_REG, 0); + spin_unlock_irq(&hostdata->lock); + /* * The initiator shall then wait at least two deskew delays and release * the BSY signal. @@ -1457,8 +1341,8 @@ static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd) udelay(1); /* wingel -- wait two bus deskew delay >2*45ns */ /* Reset BSY */ - NCR5380_write(INITIATOR_COMMAND_REG, (ICR_BASE | ICR_ASSERT_DATA | - ICR_ASSERT_ATN | ICR_ASSERT_SEL)); + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_DATA | + ICR_ASSERT_ATN | ICR_ASSERT_SEL); /* * Something weird happens when we cease to drive BSY - looks @@ -1479,45 +1363,39 @@ static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd) udelay(1); - dprintk(NDEBUG_SELECTION, "scsi%d: selecting target %d\n", HOSTNO, cmd->device->id); + dsprintk(NDEBUG_SELECTION, instance, "selecting target %d\n", scmd_id(cmd)); /* * The SCSI specification calls for a 250 ms timeout for the actual * selection. */ - timeout = jiffies + msecs_to_jiffies(250); - - /* - * XXX very interesting - we're seeing a bounce where the BSY we - * asserted is being reflected / still asserted (propagation delay?) - * and it's detecting as true. Sigh. - */ - -#if 0 - /* ++roman: If a target conformed to the SCSI standard, it wouldn't assert - * IO while SEL is true. But again, there are some disks out there in the - * world that do that nevertheless. (Somebody claimed that this announces - * reselection capability of the target.) So we better skip that test and - * only wait for BSY...
(Famous German words: Der Klügere gibt nach - the wiser one gives in :-) - */ - - while (time_before(jiffies, timeout) && - !(NCR5380_read(STATUS_REG) & (SR_BSY | SR_IO))) - ; + err = NCR5380_poll_politely(instance, STATUS_REG, SR_BSY, SR_BSY, + msecs_to_jiffies(250)); if ((NCR5380_read(STATUS_REG) & (SR_SEL | SR_IO)) == (SR_SEL | SR_IO)) { + spin_lock_irq(&hostdata->lock); NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); NCR5380_reselect(instance); - printk(KERN_ERR "scsi%d: reselection after won arbitration?\n", - HOSTNO); + if (!hostdata->connected) + NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); + shost_printk(KERN_ERR, instance, "reselection after won arbitration?\n"); + goto out; + } + + if (err < 0) { + spin_lock_irq(&hostdata->lock); + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - return -1; + /* Can't touch cmd if it has been reclaimed by the scsi ML */ + if (hostdata->selecting) { + cmd->result = DID_BAD_TARGET << 16; + complete_cmd(instance, cmd); + dsprintk(NDEBUG_SELECTION, instance, "target did not respond within 250ms\n"); + cmd = NULL; + } + goto out; } -#else - while (time_before(jiffies, timeout) && !(NCR5380_read(STATUS_REG) & SR_BSY)) - ; -#endif /* * No less than two deskew delays after the initiator detects the @@ -1529,29 +1407,6 @@ static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd) NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN); - if (!(NCR5380_read(STATUS_REG) & SR_BSY)) { - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - if (hostdata->targets_present & (1 << cmd->device->id)) { - printk(KERN_ERR "scsi%d: weirdness\n", HOSTNO); - if (hostdata->restart_select) - printk(KERN_NOTICE "\trestart select\n"); - NCR5380_dprint(NDEBUG_ANY, instance); - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - return -1; - } - cmd->result = DID_BAD_TARGET << 16; -#ifdef SUPPORT_TAGS - cmd_free_tag(cmd); -#endif - cmd->scsi_done(cmd); - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - dprintk(NDEBUG_SELECTION, "scsi%d: target did not respond within 250ms\n", HOSTNO); - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - return 0; - } - - hostdata->targets_present |= (1 << cmd->device->id); - /* * Since we followed the SCSI spec, and raised ATN while SEL * was true but before BSY was false during selection, the information @@ -1563,16 +1418,27 @@ static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd) * until it wraps back to 0. * * XXX - it turns out that there are some broken SCSI-II devices, - * which claim to support tagged queuing but fail when more than - * some number of commands are issued at once. + * which claim to support tagged queuing but fail when more than + * some number of commands are issued at once.
*/ /* Wait for start of REQ/ACK handshake */ - while (!(NCR5380_read(STATUS_REG) & SR_REQ)) - ; - dprintk(NDEBUG_SELECTION, "scsi%d: target %d selected, going into MESSAGE OUT phase.\n", - HOSTNO, cmd->device->id); + err = NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, HZ); + spin_lock_irq(&hostdata->lock); + if (err < 0) { + shost_printk(KERN_ERR, instance, "select: REQ timeout\n"); + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); + NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); + goto out; + } + if (!hostdata->selecting) { + do_abort(instance); + goto out; + } + + dsprintk(NDEBUG_SELECTION, instance, "target %d selected, going into MESSAGE OUT phase.\n", + scmd_id(cmd)); tmp[0] = IDENTIFY(1, cmd->device->lun); #ifdef SUPPORT_TAGS @@ -1591,11 +1457,12 @@ static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd) data = tmp; phase = PHASE_MSGOUT; NCR5380_transfer_pio(instance, &phase, &len, &data); - dprintk(NDEBUG_SELECTION, "scsi%d: nexus established.\n", HOSTNO); + dsprintk(NDEBUG_SELECTION, instance, "nexus established.\n"); /* XXX need to handle errors here */ + hostdata->connected = cmd; #ifndef SUPPORT_TAGS - hostdata->busy[cmd->device->id] |= (1 << cmd->device->lun); + hostdata->busy[cmd->device->id] |= 1 << cmd->device->lun; #endif #ifdef SUN3_SCSI_VME dregs->csr |= CSR_INTR; @@ -1603,24 +1470,30 @@ static int NCR5380_select(struct Scsi_Host *instance, struct scsi_cmnd *cmd) initialize_SCp(cmd); - return 0; + cmd = NULL; + +out: + if (!hostdata->selecting) + return NULL; + hostdata->selecting = NULL; + return cmd; } /* * Function : int NCR5380_transfer_pio (struct Scsi_Host *instance, - * unsigned char *phase, int *count, unsigned char **data) + * unsigned char *phase, int *count, unsigned char **data) * * Purpose : transfers data in given phase using polled I/O * * Inputs : instance - instance of driver, *phase - pointer to - * what phase is expected, *count - pointer to number of - * bytes to transfer, **data - pointer to data pointer. + * what phase is expected, *count - pointer to number of + * bytes to transfer, **data - pointer to data pointer. * * Returns : -1 when different phase is entered without transferring - * maximum number of bytes, 0 if all bytes are transferred or exit - * is in same phase. + * maximum number of bytes, 0 if all bytes are transferred or exit + * is in same phase. * - * Also, *phase, *count, *data are modified in place. + * Also, *phase, *count, *data are modified in place. * * XXX Note : handling for bus free may be useful. 
*/ @@ -1635,9 +1508,9 @@ static int NCR5380_transfer_pio(struct Scsi_Host *instance, unsigned char *phase, int *count, unsigned char **data) { - register unsigned char p = *phase, tmp; - register int c = *count; - register unsigned char *d = *data; + unsigned char p = *phase, tmp; + int c = *count; + unsigned char *d = *data; /* * The NCR5380 chip will only drive the SCSI bus when the @@ -1652,14 +1525,15 @@ static int NCR5380_transfer_pio(struct Scsi_Host *instance, * Wait for assertion of REQ, after which the phase bits will be * valid */ - while (!((tmp = NCR5380_read(STATUS_REG)) & SR_REQ)) - ; - dprintk(NDEBUG_HANDSHAKE, "scsi%d: REQ detected\n", HOSTNO); + if (NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, HZ) < 0) + break; + + dsprintk(NDEBUG_HANDSHAKE, instance, "REQ asserted\n"); /* Check for phase mismatch */ - if ((tmp & PHASE_MASK) != p) { - dprintk(NDEBUG_PIO, "scsi%d: phase mismatch\n", HOSTNO); + if ((NCR5380_read(STATUS_REG) & PHASE_MASK) != p) { + dsprintk(NDEBUG_PIO, instance, "phase mismatch\n"); NCR5380_dprint_phase(NDEBUG_PIO, instance); break; } @@ -1684,35 +1558,36 @@ static int NCR5380_transfer_pio(struct Scsi_Host *instance, NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_DATA); NCR5380_dprint(NDEBUG_PIO, instance); NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | - ICR_ASSERT_DATA | ICR_ASSERT_ACK); + ICR_ASSERT_DATA | ICR_ASSERT_ACK); } else { NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | - ICR_ASSERT_DATA | ICR_ASSERT_ATN); + ICR_ASSERT_DATA | ICR_ASSERT_ATN); NCR5380_dprint(NDEBUG_PIO, instance); NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | - ICR_ASSERT_DATA | ICR_ASSERT_ATN | ICR_ASSERT_ACK); + ICR_ASSERT_DATA | ICR_ASSERT_ATN | ICR_ASSERT_ACK); } } else { NCR5380_dprint(NDEBUG_PIO, instance); NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ACK); } - while (NCR5380_read(STATUS_REG) & SR_REQ) - ; + if (NCR5380_poll_politely(instance, + STATUS_REG, SR_REQ, 0, 5 * HZ) < 0) + break; - dprintk(NDEBUG_HANDSHAKE, "scsi%d: req false, handshake complete\n", HOSTNO); + dsprintk(NDEBUG_HANDSHAKE, instance, "REQ negated, handshake complete\n"); - /* - * We have several special cases to consider during REQ/ACK handshaking : - * 1. We were in MSGOUT phase, and we are on the last byte of the - * message. ATN must be dropped as ACK is dropped. - * - * 2. We are in a MSGIN phase, and we are on the last byte of the - * message. We must exit with ACK asserted, so that the calling - * code may raise ATN before dropping ACK to reject the message. - * - * 3. ACK and ATN are clear and the target may proceed as normal. - */ +/* + * We have several special cases to consider during REQ/ACK handshaking : + * 1. We were in MSGOUT phase, and we are on the last byte of the + * message. ATN must be dropped as ACK is dropped. + * + * 2. We are in a MSGIN phase, and we are on the last byte of the + * message. We must exit with ACK asserted, so that the calling + * code may raise ATN before dropping ACK to reject the message. + * + * 3. ACK and ATN are clear and the target may proceed as normal. 
+ */ if (!(p == PHASE_MSGIN && c == 1)) { if (p == PHASE_MSGOUT && c > 1) NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN); @@ -1721,16 +1596,16 @@ static int NCR5380_transfer_pio(struct Scsi_Host *instance, } } while (--c); - dprintk(NDEBUG_PIO, "scsi%d: residual %d\n", HOSTNO, c); + dsprintk(NDEBUG_PIO, instance, "residual %d\n", c); *count = c; *data = d; tmp = NCR5380_read(STATUS_REG); /* The phase read from the bus is valid if either REQ is (already) - * asserted or if ACK hasn't been released yet. The latter is the case if - * we're in MSGIN and all wanted bytes have been received. + * asserted or if ACK hasn't been released yet. The latter applies if + * we're in MSG IN, DATA IN or STATUS and all bytes have been received. */ - if ((tmp & SR_REQ) || (p == PHASE_MSGIN && c == 0)) + if ((tmp & SR_REQ) || ((tmp & SR_IO) && c == 0)) *phase = tmp & PHASE_MASK; else *phase = PHASE_UNKNOWN; @@ -1741,19 +1616,45 @@ static int NCR5380_transfer_pio(struct Scsi_Host *instance, return -1; } -/* - * Function : do_abort (Scsi_Host *host) +/** + * do_reset - issue a reset command + * @instance: adapter to reset * - * Purpose : abort the currently established nexus. Should only be - * called from a routine which can drop into a + * Issue a reset sequence to the NCR5380 and try and get the bus + * back into sane shape. * - * Returns : 0 on success, -1 on failure. + * This clears the reset interrupt flag because there may be no handler for + * it. When the driver is initialized, the NCR5380_intr() handler has not yet + * been installed. And when in EH we may have released the ST DMA interrupt. + */ + +static void do_reset(struct Scsi_Host *instance) +{ + unsigned long flags; + + local_irq_save(flags); + NCR5380_write(TARGET_COMMAND_REG, + PHASE_SR_TO_TCR(NCR5380_read(STATUS_REG) & PHASE_MASK)); + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_RST); + udelay(50); + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); + (void)NCR5380_read(RESET_PARITY_INTERRUPT_REG); + local_irq_restore(flags); +} + +/** + * do_abort - abort the currently established nexus by going to + * MESSAGE OUT phase and sending an ABORT message. + * @instance: relevant scsi host instance + * + * Returns 0 on success, -1 on failure. */ static int do_abort(struct Scsi_Host *instance) { - unsigned char tmp, *msgptr, phase; + unsigned char *msgptr, phase, tmp; int len; + int rc; /* Request message out phase */ NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN); @@ -1768,16 +1669,20 @@ static int do_abort(struct Scsi_Host *instance) * the target sees, so we just handshake. */ - while (!((tmp = NCR5380_read(STATUS_REG)) & SR_REQ)) - ; + rc = NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, 10 * HZ); + if (rc < 0) + goto timeout; + + tmp = NCR5380_read(STATUS_REG) & PHASE_MASK; NCR5380_write(TARGET_COMMAND_REG, PHASE_SR_TO_TCR(tmp)); - if ((tmp & PHASE_MASK) != PHASE_MSGOUT) { - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN | - ICR_ASSERT_ACK); - while (NCR5380_read(STATUS_REG) & SR_REQ) - ; + if (tmp != PHASE_MSGOUT) { + NCR5380_write(INITIATOR_COMMAND_REG, + ICR_BASE | ICR_ASSERT_ATN | ICR_ASSERT_ACK); + rc = NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, 0, 3 * HZ); + if (rc < 0) + goto timeout; NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN); } @@ -1793,26 +1698,29 @@ static int do_abort(struct Scsi_Host *instance) */ return len ? 
-1 : 0; + +timeout: + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); + return -1; } #if defined(REAL_DMA) /* * Function : int NCR5380_transfer_dma (struct Scsi_Host *instance, - * unsigned char *phase, int *count, unsigned char **data) + * unsigned char *phase, int *count, unsigned char **data) * * Purpose : transfers data in given phase using either real - * or pseudo DMA. + * or pseudo DMA. * * Inputs : instance - instance of driver, *phase - pointer to - * what phase is expected, *count - pointer to number of - * bytes to transfer, **data - pointer to data pointer. + * what phase is expected, *count - pointer to number of - * bytes to transfer, **data - pointer to data pointer. + * bytes to transfer, **data - pointer to data pointer. * * Returns : -1 when different phase is entered without transferring - * maximum number of bytes, 0 if all bytes or transferred or exit - * is in same phase. - * - * Also, *phase, *count, *data are modified in place. + * maximum number of bytes, 0 if all bytes are transferred or exit + * is in same phase. * + * Also, *phase, *count, *data are modified in place. */ @@ -1820,10 +1728,9 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, unsigned char *phase, int *count, unsigned char **data) { - SETUP_HOSTDATA(instance); + struct NCR5380_hostdata *hostdata = shost_priv(instance); register int c = *count; register unsigned char p = *phase; - unsigned long flags; #if defined(CONFIG_SUN3) /* sanity check */ if (!hostdata->dma_len) { @@ -1834,29 +1741,22 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, } hostdata->dma_len = c; - dprintk(NDEBUG_DMA, "scsi%d: initializing DMA for %s, %d bytes %s %p\n", - instance->host_no, (p & SR_IO) ? "reading" : "writing", - c, (p & SR_IO) ? "to" : "from", *data); + dsprintk(NDEBUG_DMA, instance, "initializing DMA %s: length %d, address %p\n", + (p & SR_IO) ? "receive" : "send", c, *data); /* netbsd turns off ints here, why not be safe and do it too */ - local_irq_save(flags); /* send start chain */ sun3scsi_dma_start(c, *data); + NCR5380_write(TARGET_COMMAND_REG, PHASE_SR_TO_TCR(p)); + NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE | MR_MONITOR_BSY | + MR_ENABLE_EOP_INTR); if (p & SR_IO) { - NCR5380_write(TARGET_COMMAND_REG, 1); - NCR5380_read(RESET_PARITY_INTERRUPT_REG); NCR5380_write(INITIATOR_COMMAND_REG, 0); - NCR5380_write(MODE_REG, - (NCR5380_read(MODE_REG) | MR_DMA_MODE | MR_ENABLE_EOP_INTR)); NCR5380_write(START_DMA_INITIATOR_RECEIVE_REG, 0); } else { - NCR5380_write(TARGET_COMMAND_REG, 0); - NCR5380_read(RESET_PARITY_INTERRUPT_REG); NCR5380_write(INITIATOR_COMMAND_REG, ICR_ASSERT_DATA); - NCR5380_write(MODE_REG, - (NCR5380_read(MODE_REG) | MR_DMA_MODE | MR_ENABLE_EOP_INTR)); NCR5380_write(START_DMA_SEND_REG, 0); } @@ -1864,8 +1764,6 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, dregs->csr |= CSR_DMA_ENABLE; #endif - local_irq_restore(flags); - sun3_dma_active = 1; #else /* !defined(CONFIG_SUN3) */ @@ -1880,25 +1778,20 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, if (hostdata->read_overruns && (p & SR_IO)) c -= hostdata->read_overruns; - dprintk(NDEBUG_DMA, "scsi%d: initializing DMA for %s, %d bytes %s %p\n", - HOSTNO, (p & SR_IO) ? "reading" : "writing", - c, (p & SR_IO) ? "to" : "from", d); + dsprintk(NDEBUG_DMA, instance, "initializing DMA %s: length %d, address %p\n", + (p & SR_IO) ?
"receive" : "send", c, d); NCR5380_write(TARGET_COMMAND_REG, PHASE_SR_TO_TCR(p)); - -#ifdef REAL_DMA - NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE | MR_ENABLE_EOP_INTR | MR_MONITOR_BSY); -#endif /* def REAL_DMA */ + NCR5380_write(MODE_REG, MR_BASE | MR_DMA_MODE | MR_MONITOR_BSY | + MR_ENABLE_EOP_INTR); if (!(hostdata->flags & FLAG_LATE_DMA_SETUP)) { /* On the Medusa, it is a must to initialize the DMA before * starting the NCR. This is also the cleaner way for the TT. */ - local_irq_save(flags); hostdata->dma_len = (p & SR_IO) ? NCR5380_dma_read_setup(instance, d, c) : NCR5380_dma_write_setup(instance, d, c); - local_irq_restore(flags); } if (p & SR_IO) @@ -1912,11 +1805,9 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, /* On the Falcon, the DMA setup must be done after the last */ /* NCR access, else the DMA setup gets trashed! */ - local_irq_save(flags); hostdata->dma_len = (p & SR_IO) ? NCR5380_dma_read_setup(instance, d, c) : NCR5380_dma_write_setup(instance, d, c); - local_irq_restore(flags); } #endif /* !defined(CONFIG_SUN3) */ @@ -1928,23 +1819,22 @@ static int NCR5380_transfer_dma(struct Scsi_Host *instance, * Function : NCR5380_information_transfer (struct Scsi_Host *instance) * * Purpose : run through the various SCSI phases and do as the target - * directs us to. Operates on the currently connected command, - * instance->connected. + * directs us to. Operates on the currently connected command, + * instance->connected. * * Inputs : instance, instance for which we are doing commands * * Side effects : SCSI things happen, the disconnected queue will be - * modified if a command disconnects, *instance->connected will - * change. + * modified if a command disconnects, *instance->connected will + * change. * * XXX Note : we need to watch for bus free or a reset condition here - * to recover from an unexpected bus free condition. + * to recover from an unexpected bus free condition. */ static void NCR5380_information_transfer(struct Scsi_Host *instance) { - SETUP_HOSTDATA(instance); - unsigned long flags; + struct NCR5380_hostdata *hostdata = shost_priv(instance); unsigned char msgout = NOP; int sink = 0; int len; @@ -1953,13 +1843,15 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) #endif unsigned char *data; unsigned char phase, tmp, extended_msg[10], old_phase = 0xff; - struct scsi_cmnd *cmd = (struct scsi_cmnd *) hostdata->connected; + struct scsi_cmnd *cmd; #ifdef SUN3_SCSI_VME dregs->csr |= CSR_INTR; #endif - while (1) { + while ((cmd = hostdata->connected)) { + struct NCR5380_cmd *ncmd = scsi_cmd_priv(cmd); + tmp = NCR5380_read(STATUS_REG); /* We only have a valid SCSI phase when REQ is asserted */ if (tmp & SR_REQ) { @@ -1984,7 +1876,7 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) /* this command setup for dma yet? 
*/ if ((count >= DMA_MIN_SIZE) && (sun3_dma_setup_done != cmd)) { if (cmd->request->cmd_type == REQ_TYPE_FS) { - sun3scsi_dma_setup(d, count, + sun3scsi_dma_setup(instance, d, count, rq_data_dir(cmd->request)); sun3_dma_setup_done = cmd; } @@ -2000,11 +1892,11 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) NCR5380_write(TARGET_COMMAND_REG, PHASE_SR_TO_TCR(tmp)); NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_ATN | - ICR_ASSERT_ACK); + ICR_ASSERT_ACK); while (NCR5380_read(STATUS_REG) & SR_REQ) ; NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | - ICR_ASSERT_ATN); + ICR_ASSERT_ATN); sink = 0; continue; } @@ -2012,12 +1904,11 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) switch (phase) { case PHASE_DATAOUT: #if (NDEBUG & NDEBUG_NO_DATAOUT) - printk("scsi%d: NDEBUG_NO_DATAOUT set, attempted DATAOUT " - "aborted\n", HOSTNO); + shost_printk(KERN_DEBUG, instance, "NDEBUG_NO_DATAOUT set, attempted DATAOUT aborted\n"); sink = 1; do_abort(instance); cmd->result = DID_ERROR << 16; - cmd->scsi_done(cmd); + complete_cmd(instance, cmd); return; #endif case PHASE_DATAIN: @@ -2031,13 +1922,10 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) --cmd->SCp.buffers_residual; cmd->SCp.this_residual = cmd->SCp.buffer->length; cmd->SCp.ptr = sg_virt(cmd->SCp.buffer); - /* ++roman: Try to merge some scatter-buffers if - * they are at contiguous physical addresses. - */ merge_contiguous_buffers(cmd); - dprintk(NDEBUG_INFORMATION, "scsi%d: %d bytes and %d buffers left\n", - HOSTNO, cmd->SCp.this_residual, - cmd->SCp.buffers_residual); + dsprintk(NDEBUG_INFORMATION, instance, "%d bytes and %d buffers left\n", + cmd->SCp.this_residual, + cmd->SCp.buffers_residual); } /* @@ -2051,16 +1939,18 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) */ /* ++roman: I suggest, this should be - * #if def(REAL_DMA) + * #if def(REAL_DMA) * instead of leaving REAL_DMA out. */ #if defined(REAL_DMA) - if ( #if !defined(CONFIG_SUN3) - !cmd->device->borken && + transfersize = 0; + if (!cmd->device->borken) #endif - (transfersize = NCR5380_dma_xfer_len(instance, cmd, phase)) >= DMA_MIN_SIZE) { + transfersize = NCR5380_dma_xfer_len(instance, cmd, phase); + + if (transfersize >= DMA_MIN_SIZE) { len = transfersize; cmd->SCp.phase = phase; if (NCR5380_transfer_dma(instance, &phase, @@ -2068,16 +1958,15 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) /* * If the watchdog timer fires, all future * accesses to this device will use the - * polled-IO. */ + * polled-IO. 
+ */ scmd_printk(KERN_INFO, cmd, "switching to slow handshake\n"); cmd->device->borken = 1; - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | - ICR_ASSERT_ATN); sink = 1; do_abort(instance); cmd->result = DID_ERROR << 16; - cmd->scsi_done(cmd); + complete_cmd(instance, cmd); /* XXX - need to source or sink data here, as appropriate */ } else { #ifdef REAL_DMA @@ -2093,9 +1982,13 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) } } else #endif /* defined(REAL_DMA) */ + { + spin_unlock_irq(&hostdata->lock); NCR5380_transfer_pio(instance, &phase, - (int *)&cmd->SCp.this_residual, - (unsigned char **)&cmd->SCp.ptr); + (int *)&cmd->SCp.this_residual, + (unsigned char **)&cmd->SCp.ptr); + spin_lock_irq(&hostdata->lock); + } #if defined(CONFIG_SUN3) && defined(REAL_DMA) /* if we had intended to dma that command clear it */ if (sun3_dma_setup_done == cmd) @@ -2105,162 +1998,64 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) case PHASE_MSGIN: len = 1; data = &tmp; - NCR5380_write(SELECT_ENABLE_REG, 0); /* disable reselects */ NCR5380_transfer_pio(instance, &phase, &len, &data); cmd->SCp.Message = tmp; switch (tmp) { - /* - * Linking lets us reduce the time required to get the - * next command out to the device, hopefully this will - * mean we don't waste another revolution due to the delays - * required by ARBITRATION and another SELECTION. - * - * In the current implementation proposal, low level drivers - * merely have to start the next command, pointed to by - * next_link, done() is called as with unlinked commands. - */ -#ifdef LINKED - case LINKED_CMD_COMPLETE: - case LINKED_FLG_CMD_COMPLETE: - /* Accept message by clearing ACK */ - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - - dprintk(NDEBUG_LINKED, "scsi%d: target %d lun %llu linked command " - "complete.\n", HOSTNO, cmd->device->id, cmd->device->lun); - - /* Enable reselect interrupts */ - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - /* - * Sanity check : A linked command should only terminate - * with one of these messages if there are more linked - * commands available. - */ - - if (!cmd->next_link) { - printk(KERN_NOTICE "scsi%d: target %d lun %llu " - "linked command complete, no next_link\n", - HOSTNO, cmd->device->id, cmd->device->lun); - sink = 1; - do_abort(instance); - return; - } - - initialize_SCp(cmd->next_link); - /* The next command is still part of this process; copy it - * and don't free it! */ - cmd->next_link->tag = cmd->tag; - cmd->result = cmd->SCp.Status | (cmd->SCp.Message << 8); - dprintk(NDEBUG_LINKED, "scsi%d: target %d lun %llu linked request " - "done, calling scsi_done().\n", - HOSTNO, cmd->device->id, cmd->device->lun); - cmd->scsi_done(cmd); - cmd = hostdata->connected; - break; -#endif /* def LINKED */ case ABORT: case COMMAND_COMPLETE: /* Accept message by clearing ACK */ NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - dprintk(NDEBUG_QUEUES, "scsi%d: command for target %d, lun %llu " - "completed\n", HOSTNO, cmd->device->id, cmd->device->lun); + dsprintk(NDEBUG_QUEUES, instance, + "COMMAND COMPLETE %p target %d lun %llu\n", + cmd, scmd_id(cmd), cmd->device->lun); - local_irq_save(flags); - hostdata->retain_dma_intr++; hostdata->connected = NULL; #ifdef SUPPORT_TAGS cmd_free_tag(cmd); if (status_byte(cmd->SCp.Status) == QUEUE_FULL) { - /* Turn a QUEUE FULL status into BUSY, I think the - * mid level cannot handle QUEUE FULL :-( (The - * command is retried after BUSY). 
Also update our - * queue size to the number of currently issued - * commands now. - */ - /* ++Andreas: the mid level code knows about - QUEUE_FULL now. */ - struct tag_alloc *ta = &hostdata->TagAlloc[scmd_id(cmd)][cmd->device->lun]; - dprintk(NDEBUG_TAGS, "scsi%d: target %d lun %llu returned " - "QUEUE_FULL after %d commands\n", - HOSTNO, cmd->device->id, cmd->device->lun, - ta->nr_allocated); + u8 lun = cmd->device->lun; + struct tag_alloc *ta = &hostdata->TagAlloc[scmd_id(cmd)][lun]; + + dsprintk(NDEBUG_TAGS, instance, + "QUEUE_FULL %p target %d lun %d nr_allocated %d\n", + cmd, scmd_id(cmd), lun, ta->nr_allocated); if (ta->queue_size > ta->nr_allocated) - ta->nr_allocated = ta->queue_size; + ta->queue_size = ta->nr_allocated; } -#else - hostdata->busy[cmd->device->id] &= ~(1 << cmd->device->lun); #endif - /* Enable reselect interrupts */ - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - - /* - * I'm not sure what the correct thing to do here is : - * - * If the command that just executed is NOT a request - * sense, the obvious thing to do is to set the result - * code to the values of the stored parameters. - * - * If it was a REQUEST SENSE command, we need some way to - * differentiate between the failure code of the original - * and the failure code of the REQUEST sense - the obvious - * case is success, where we fall through and leave the - * result code unchanged. - * - * The non-obvious place is where the REQUEST SENSE failed - */ - - if (cmd->cmnd[0] != REQUEST_SENSE) - cmd->result = cmd->SCp.Status | (cmd->SCp.Message << 8); - else if (status_byte(cmd->SCp.Status) != GOOD) - cmd->result = (cmd->result & 0x00ffff) | (DID_ERROR << 16); - - if ((cmd->cmnd[0] == REQUEST_SENSE) && - hostdata->ses.cmd_len) { - scsi_eh_restore_cmnd(cmd, &hostdata->ses); - hostdata->ses.cmd_len = 0 ; - } - - if ((cmd->cmnd[0] != REQUEST_SENSE) && - (status_byte(cmd->SCp.Status) == CHECK_CONDITION)) { - scsi_eh_prep_cmnd(cmd, &hostdata->ses, NULL, 0, ~0); - - dprintk(NDEBUG_AUTOSENSE, "scsi%d: performing request sense\n", HOSTNO); - LIST(cmd,hostdata->issue_queue); - SET_NEXT(cmd, hostdata->issue_queue); - hostdata->issue_queue = (struct scsi_cmnd *) cmd; - dprintk(NDEBUG_QUEUES, "scsi%d: REQUEST SENSE added to head of " - "issue queue\n", H_NO(cmd)); - } else { - cmd->scsi_done(cmd); + cmd->result &= ~0xffff; + cmd->result |= cmd->SCp.Status; + cmd->result |= cmd->SCp.Message << 8; + + if (cmd->cmnd[0] == REQUEST_SENSE) + complete_cmd(instance, cmd); + else { + if (cmd->SCp.Status == SAM_STAT_CHECK_CONDITION || + cmd->SCp.Status == SAM_STAT_COMMAND_TERMINATED) { + dsprintk(NDEBUG_QUEUES, instance, "autosense: adding cmd %p to tail of autosense queue\n", + cmd); + list_add_tail(&ncmd->list, + &hostdata->autosense); + } else + complete_cmd(instance, cmd); } - local_irq_restore(flags); - - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); /* * Restore phase bits to 0 so an interrupted selection, * arbitration can resume. */ NCR5380_write(TARGET_COMMAND_REG, 0); - while ((NCR5380_read(STATUS_REG) & SR_BSY) && !hostdata->connected) - barrier(); + /* Enable reselect interrupts */ + NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - local_irq_save(flags); - hostdata->retain_dma_intr--; - /* ++roman: For Falcon SCSI, release the lock on the - * ST-DMA here if no other commands are waiting on the - * disconnected queue. 
- */ maybe_release_dma_irq(instance); - local_irq_restore(flags); return; case MESSAGE_REJECT: /* Accept message by clearing ACK */ NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - /* Enable reselect interrupts */ - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); switch (hostdata->last_message) { case HEAD_OF_QUEUE_TAG: case ORDERED_QUEUE_TAG: @@ -2274,27 +2069,20 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) cmd->device->tagged_supported = 0; hostdata->busy[cmd->device->id] |= (1 << cmd->device->lun); cmd->tag = TAG_NONE; - dprintk(NDEBUG_TAGS, "scsi%d: target %d lun %llu rejected " - "QUEUE_TAG message; tagged queuing " - "disabled\n", - HOSTNO, cmd->device->id, cmd->device->lun); + dsprintk(NDEBUG_TAGS, instance, "target %d lun %llu rejected QUEUE_TAG message; tagged queuing disabled\n", + scmd_id(cmd), cmd->device->lun); break; } break; case DISCONNECT: /* Accept message by clearing ACK */ NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - local_irq_save(flags); - cmd->device->disconnect = 1; - LIST(cmd,hostdata->disconnected_queue); - SET_NEXT(cmd, hostdata->disconnected_queue); hostdata->connected = NULL; - hostdata->disconnected_queue = cmd; - local_irq_restore(flags); - dprintk(NDEBUG_QUEUES, "scsi%d: command for target %d lun %llu was " - "moved from connected to the " - "disconnected_queue\n", HOSTNO, - cmd->device->id, cmd->device->lun); + list_add(&ncmd->list, &hostdata->disconnected); + dsprintk(NDEBUG_INFORMATION | NDEBUG_QUEUES, + instance, "connected command %p for target %d lun %llu moved to disconnected queue\n", + cmd, scmd_id(cmd), cmd->device->lun); + /* * Restore phase bits to 0 so an interrupted selection, * arbitration can resume. @@ -2303,9 +2091,6 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) /* Enable reselect interrupts */ NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); - /* Wait for bus free to avoid nasty timeouts */ - while ((NCR5380_read(STATUS_REG) & SR_BSY) && !hostdata->connected) - barrier(); #ifdef SUN3_SCSI_VME dregs->csr |= CSR_DMA_ENABLE; #endif @@ -2324,37 +2109,30 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) case RESTORE_POINTERS: /* Accept message by clearing ACK */ NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - /* Enable reselect interrupts */ - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); break; case EXTENDED_MESSAGE: /* - * Extended messages are sent in the following format : - * Byte - * 0 EXTENDED_MESSAGE == 1 - * 1 length (includes one byte for code, doesn't - * include first two bytes) - * 2 code - * 3..length+1 arguments - * - * Start the extended message buffer with the EXTENDED_MESSAGE + * Start the message buffer with the EXTENDED_MESSAGE * byte, since spi_print_msg() wants the whole thing. 
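* (a length byte and a code byte follow on the wire; the transfers below read them into extended_msg[1] and extended_msg[2] before fetching the remaining arguments)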
*/ extended_msg[0] = EXTENDED_MESSAGE; /* Accept first byte by clearing ACK */ NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - dprintk(NDEBUG_EXTENDED, "scsi%d: receiving extended message\n", HOSTNO); + spin_unlock_irq(&hostdata->lock); + + dsprintk(NDEBUG_EXTENDED, instance, "receiving extended message\n"); len = 2; data = extended_msg + 1; phase = PHASE_MSGIN; NCR5380_transfer_pio(instance, &phase, &len, &data); - dprintk(NDEBUG_EXTENDED, "scsi%d: length=%d, code=0x%02x\n", HOSTNO, - (int)extended_msg[1], (int)extended_msg[2]); + dsprintk(NDEBUG_EXTENDED, instance, "length %d, code 0x%02x\n", + (int)extended_msg[1], + (int)extended_msg[2]); - if (!len && extended_msg[1] <= - (sizeof(extended_msg) - 1)) { + if (!len && extended_msg[1] > 0 && + extended_msg[1] <= sizeof(extended_msg) - 2) { /* Accept third byte by clearing ACK */ NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); len = extended_msg[1] - 1; @@ -2362,8 +2140,8 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) phase = PHASE_MSGIN; NCR5380_transfer_pio(instance, &phase, &len, &data); - dprintk(NDEBUG_EXTENDED, "scsi%d: message received, residual %d\n", - HOSTNO, len); + dsprintk(NDEBUG_EXTENDED, instance, "message received, residual %d\n", + len); switch (extended_msg[2]) { case EXTENDED_SDTR: @@ -2373,15 +2151,18 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) tmp = 0; } } else if (len) { - printk(KERN_NOTICE "scsi%d: error receiving " - "extended message\n", HOSTNO); + shost_printk(KERN_ERR, instance, "error receiving extended message\n"); tmp = 0; } else { - printk(KERN_NOTICE "scsi%d: extended message " - "code %02x length %d is too long\n", - HOSTNO, extended_msg[2], extended_msg[1]); + shost_printk(KERN_NOTICE, instance, "extended message code %02x length %d is too long\n", + extended_msg[2], extended_msg[1]); tmp = 0; } + + spin_lock_irq(&hostdata->lock); + if (!hostdata->connected) + return; + /* Fall through to reject message */ /* @@ -2390,8 +2171,7 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) */ default: if (!tmp) { - printk(KERN_INFO "scsi%d: rejecting message ", - instance->host_no); + shost_printk(KERN_ERR, instance, "rejecting message "); spi_print_msg(extended_msg); printk("\n"); } else if (tmp != EXTENDED_MESSAGE) @@ -2414,18 +2194,11 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) hostdata->last_message = msgout; NCR5380_transfer_pio(instance, &phase, &len, &data); if (msgout == ABORT) { - local_irq_save(flags); -#ifdef SUPPORT_TAGS - cmd_free_tag(cmd); -#else - hostdata->busy[cmd->device->id] &= ~(1 << cmd->device->lun); -#endif hostdata->connected = NULL; cmd->result = DID_ERROR << 16; - NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); + complete_cmd(instance, cmd); maybe_release_dma_irq(instance); - local_irq_restore(flags); - cmd->scsi_done(cmd); + NCR5380_write(SELECT_ENABLE_REG, hostdata->id_mask); return; } msgout = NOP; @@ -2447,22 +2220,25 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) cmd->SCp.Status = tmp; break; default: - printk("scsi%d: unknown phase\n", HOSTNO); + shost_printk(KERN_ERR, instance, "unknown phase\n"); NCR5380_dprint(NDEBUG_ANY, instance); } /* switch(phase) */ - } /* if (tmp * SR_REQ) */ - } /* while (1) */ + } else { + spin_unlock_irq(&hostdata->lock); + NCR5380_poll_politely(instance, STATUS_REG, SR_REQ, SR_REQ, HZ); + spin_lock_irq(&hostdata->lock); + } + } } /* * Function : void NCR5380_reselect (struct Scsi_Host *instance) * * Purpose : does 
reselection, initializing the instance->connected - * field to point to the scsi_cmnd for which the I_T_L or I_T_L_Q - * nexus has been reestablished, + * field to point to the scsi_cmnd for which the I_T_L or I_T_L_Q + * nexus has been reestablished, * * Inputs : instance - this instance of the NCR5380. - * */ @@ -2471,7 +2247,7 @@ static void NCR5380_information_transfer(struct Scsi_Host *instance) static void NCR5380_reselect(struct Scsi_Host *instance) { - SETUP_HOSTDATA(instance); + struct NCR5380_hostdata *hostdata = shost_priv(instance); unsigned char target_mask; unsigned char lun; #ifdef SUPPORT_TAGS @@ -2480,7 +2256,8 @@ static void NCR5380_reselect(struct Scsi_Host *instance) unsigned char msg[3]; int __maybe_unused len; unsigned char __maybe_unused *data, __maybe_unused phase; - struct scsi_cmnd *tmp = NULL, *prev; + struct NCR5380_cmd *ncmd; + struct scsi_cmnd *tmp; /* * Disable arbitration, etc. since the host adapter obviously @@ -2488,11 +2265,10 @@ static void NCR5380_reselect(struct Scsi_Host *instance) */ NCR5380_write(MODE_REG, MR_BASE); - hostdata->restart_select = 1; target_mask = NCR5380_read(CURRENT_SCSI_DATA_REG) & ~(hostdata->id_mask); - dprintk(NDEBUG_RESELECTION, "scsi%d: reselect\n", HOSTNO); + dsprintk(NDEBUG_RESELECTION, instance, "reselect\n"); /* * At this point, we have detected that our SCSI ID is on the bus, @@ -2504,17 +2280,22 @@ static void NCR5380_reselect(struct Scsi_Host *instance) */ NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_BSY); - - while (NCR5380_read(STATUS_REG) & SR_SEL) - ; + if (NCR5380_poll_politely(instance, + STATUS_REG, SR_SEL, 0, 2 * HZ) < 0) { + NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); + return; + } NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); /* * Wait for target to go into MSGIN. */ - while (!(NCR5380_read(STATUS_REG) & SR_REQ)) - ; + if (NCR5380_poll_politely(instance, + STATUS_REG, SR_REQ, SR_REQ, 2 * HZ) < 0) { + do_abort(instance); + return; + } #if defined(CONFIG_SUN3) && defined(REAL_DMA) /* acknowledge toggle to MSGIN */ @@ -2527,15 +2308,21 @@ static void NCR5380_reselect(struct Scsi_Host *instance) data = msg; phase = PHASE_MSGIN; NCR5380_transfer_pio(instance, &phase, &len, &data); + + if (len) { + do_abort(instance); + return; + } #endif if (!(msg[0] & 0x80)) { - printk(KERN_DEBUG "scsi%d: expecting IDENTIFY message, got ", HOSTNO); + shost_printk(KERN_ERR, instance, "expecting IDENTIFY message, got "); spi_print_msg(msg); + printk("\n"); do_abort(instance); return; } - lun = (msg[0] & 0x07); + lun = msg[0] & 0x07; #if defined(SUPPORT_TAGS) && !defined(CONFIG_SUN3) /* If the phase is still MSGIN, the target wants to send some more @@ -2551,8 +2338,8 @@ static void NCR5380_reselect(struct Scsi_Host *instance) if (!NCR5380_transfer_pio(instance, &phase, &len, &data) && msg[1] == SIMPLE_QUEUE_TAG) tag = msg[2]; - dprintk(NDEBUG_TAGS, "scsi%d: target mask %02x, lun %d sent tag %d at " - "reselection\n", HOSTNO, target_mask, lun, tag); + dsprintk(NDEBUG_TAGS, instance, "reselect: target mask %02x, lun %d sent tag %d\n", + target_mask, lun, tag); } #endif @@ -2561,36 +2348,34 @@ static void NCR5380_reselect(struct Scsi_Host *instance) * just reestablished, and remove it from the disconnected queue. 
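* The list search below matches on target mask and LUN, and also on tag when SUPPORT_TAGS is enabled.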
*/ - for (tmp = (struct scsi_cmnd *) hostdata->disconnected_queue, prev = NULL; - tmp; prev = tmp, tmp = NEXT(tmp)) { - if ((target_mask == (1 << tmp->device->id)) && (lun == tmp->device->lun + tmp = NULL; + list_for_each_entry(ncmd, &hostdata->disconnected, list) { + struct scsi_cmnd *cmd = NCR5380_to_scmd(ncmd); + + if (target_mask == (1 << scmd_id(cmd)) && + lun == (u8)cmd->device->lun #ifdef SUPPORT_TAGS - && (tag == cmd->tag) #endif ) { - if (prev) { - REMOVE(prev, NEXT(prev), tmp, NEXT(tmp)); - SET_NEXT(prev, NEXT(tmp)); - } else { - REMOVE(-1, hostdata->disconnected_queue, tmp, NEXT(tmp)); - hostdata->disconnected_queue = NEXT(tmp); - } - SET_NEXT(tmp, NULL); + list_del(&ncmd->list); + tmp = cmd; break; } } - if (!tmp) { - printk(KERN_WARNING "scsi%d: warning: target bitmask %02x lun %d " -#ifdef SUPPORT_TAGS - "tag %d " -#endif - "not in disconnected_queue.\n", - HOSTNO, target_mask, lun + if (tmp) { + dsprintk(NDEBUG_RESELECTION | NDEBUG_QUEUES, instance, + "reselect: removed %p from disconnected queue\n", tmp); + } else { + #ifdef SUPPORT_TAGS - , tag + shost_printk(KERN_ERR, instance, "target bitmask 0x%02x lun %d tag %d not in disconnected queue.\n", + target_mask, lun, tag); +#else + shost_printk(KERN_ERR, instance, "target bitmask 0x%02x lun %d not in disconnected queue.\n", + target_mask, lun); #endif - ); /* * Since we have an established nexus that we can't do anything * with, we must abort it. @@ -2614,7 +2399,8 @@ static void NCR5380_reselect(struct Scsi_Host *instance) } /* setup this command for dma if not already */ if ((count >= DMA_MIN_SIZE) && (sun3_dma_setup_done != tmp)) { - sun3scsi_dma_setup(d, count, rq_data_dir(tmp->request)); + sun3scsi_dma_setup(instance, d, count, + rq_data_dir(tmp->request)); sun3_dma_setup_done = tmp; } } @@ -2639,235 +2425,196 @@ static void NCR5380_reselect(struct Scsi_Host *instance) if (!NCR5380_transfer_pio(instance, &phase, &len, &data) && msg[1] == SIMPLE_QUEUE_TAG) tag = msg[2]; - dprintk(NDEBUG_TAGS, "scsi%d: target mask %02x, lun %d sent tag %d at reselection\n", HOSTNO, target_mask, lun, tag); + dsprintk(NDEBUG_TAGS, instance, "reselect: target mask %02x, lun %d sent tag %d\n", + target_mask, lun, tag); } #endif hostdata->connected = tmp; - dprintk(NDEBUG_RESELECTION, "scsi%d: nexus established, target = %d, lun = %llu, tag = %d\n", - HOSTNO, tmp->device->id, tmp->device->lun, tmp->tag); + dsprintk(NDEBUG_RESELECTION, instance, "nexus established, target %d, lun %llu, tag %d\n", + scmd_id(tmp), tmp->device->lun, tmp->tag); } -/* - * Function : int NCR5380_abort (struct scsi_cmnd *cmd) - * - * Purpose : abort a command - * - * Inputs : cmd - the scsi_cmnd to abort, code - code to set the - * host byte of the result field to, if zero DID_ABORTED is - * used. - * - * Returns : SUCCESS - success, FAILED on failure. - * - * XXX - there is no way to abort the command that is currently - * connected, you have to wait for it to complete. If this is - * a problem, we could implement longjmp() / setjmp(), setjmp() - * called where the loop started in NCR5380_main(). 
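The list helpers introduced below rely on the kernel's intrusive-list pattern: a struct list_head is embedded in the per-command private data (struct NCR5380_cmd, reserved via cmd_size in the host template), and NCR5380_to_scmd() uses container_of() to recover the enclosing scsi_cmnd. The following is a minimal stand-alone sketch of that pattern, not part of the patch; find_cmd, struct cmd_priv and the simplified struct scsi_cmnd are hypothetical stand-ins for the driver's real definitions.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

/* Minimal stand-ins for the kernel's list_head and container_of(). */
struct list_head { struct list_head *next, *prev; };

#define container_of(ptr, type, member) \
	((type *)((char *)(ptr) - offsetof(type, member)))

static void INIT_LIST_HEAD(struct list_head *h) { h->next = h->prev = h; }

static void list_add_tail(struct list_head *n, struct list_head *h)
{
	n->prev = h->prev;
	n->next = h;
	h->prev->next = n;
	h->prev = n;
}

/* Hypothetical per-command private data with an embedded list node. */
struct cmd_priv { struct list_head list; };

/* Simplified command structure standing in for struct scsi_cmnd. */
struct scsi_cmnd { int tag; struct cmd_priv priv; };

/* Same shape as the driver's list_find_cmd(): walk the intrusive list
 * and compare each embedded node against the needle's node. */
static bool find_cmd(struct list_head *haystack, struct scsi_cmnd *needle)
{
	struct list_head *pos;

	for (pos = haystack->next; pos != haystack; pos = pos->next)
		if (container_of(pos, struct cmd_priv, list) == &needle->priv)
			return true;
	return false;
}

int main(void)
{
	struct list_head disconnected;
	struct scsi_cmnd a = { .tag = 1 }, b = { .tag = 2 };

	INIT_LIST_HEAD(&disconnected);
	list_add_tail(&a.priv.list, &disconnected);
	printf("a queued: %d, b queued: %d\n",
	       find_cmd(&disconnected, &a), find_cmd(&disconnected, &b));
	return 0;
}

Keeping the node inside the command means queue membership costs no allocation and removal is O(1) via list_del(), which is what lets this patch drop the hand-rolled NEXT()/SET_NEXT() pointer chains.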
+/** + * list_find_cmd - test for presence of a command in a linked list + * @haystack: list of commands + * @needle: command to search for */ -static -int NCR5380_abort(struct scsi_cmnd *cmd) +static bool list_find_cmd(struct list_head *haystack, + struct scsi_cmnd *needle) { - struct Scsi_Host *instance = cmd->device->host; - SETUP_HOSTDATA(instance); - struct scsi_cmnd *tmp, **prev; - unsigned long flags; + struct NCR5380_cmd *ncmd; - scmd_printk(KERN_NOTICE, cmd, "aborting command\n"); + list_for_each_entry(ncmd, haystack, list) + if (NCR5380_to_scmd(ncmd) == needle) + return true; + return false; +} - NCR5380_print_status(instance); +/** + * list_del_cmd - remove a command from linked list + * @haystack: list of commands + * @needle: command to remove + */ - local_irq_save(flags); +static bool list_del_cmd(struct list_head *haystack, + struct scsi_cmnd *needle) +{ + if (list_find_cmd(haystack, needle)) { + struct NCR5380_cmd *ncmd = scsi_cmd_priv(needle); - dprintk(NDEBUG_ABORT, "scsi%d: abort called basr 0x%02x, sr 0x%02x\n", HOSTNO, - NCR5380_read(BUS_AND_STATUS_REG), - NCR5380_read(STATUS_REG)); + list_del(&ncmd->list); + return true; + } + return false; +} -#if 1 - /* - * Case 1 : If the command is the currently executing command, - * we'll set the aborted flag and return control so that - * information transfer routine can exit cleanly. - */ +/** + * NCR5380_abort - scsi host eh_abort_handler() method + * @cmd: the command to be aborted + * + * Try to abort a given command by removing it from queues and/or sending + * the target an abort message. This may not succeed in causing a target + * to abort the command. Nonetheless, the low-level driver must forget about + * the command because the mid-layer reclaims it and it may be re-issued. + * + * The normal path taken by a command is as follows. For EH we trace this + * same path to locate and abort the command. + * + * unissued -> selecting -> [unissued -> selecting ->]... connected -> + * [disconnected -> connected ->]... + * [autosense -> connected ->] done + * + * If cmd is unissued then just remove it. + * If cmd is disconnected, try to select the target. + * If cmd is connected, try to send an abort message. + * If cmd is waiting for autosense, give it a chance to complete but check + * that it isn't left connected. + * If cmd was not found at all then presumably it has already been completed, + * in which case return SUCCESS to try to avoid further EH measures. + * If the command has not completed yet, we must not fail to find it. + */ - if (hostdata->connected == cmd) { +static int NCR5380_abort(struct scsi_cmnd *cmd) +{ + struct Scsi_Host *instance = cmd->device->host; + struct NCR5380_hostdata *hostdata = shost_priv(instance); + unsigned long flags; + int result = SUCCESS; - dprintk(NDEBUG_ABORT, "scsi%d: aborting connected command\n", HOSTNO); - /* - * We should perform BSY checking, and make sure we haven't slipped - * into BUS FREE. - */ + spin_lock_irqsave(&hostdata->lock, flags); - /* NCR5380_write(INITIATOR_COMMAND_REG, ICR_ASSERT_ATN); */ - /* - * Since we can't change phases until we've completed the current - * handshake, we have to source or sink a byte of data if the current - * phase is not MSGOUT. - */ +#if (NDEBUG & NDEBUG_ANY) + scmd_printk(KERN_INFO, cmd, __func__); +#endif + NCR5380_dprint(NDEBUG_ANY, instance); + NCR5380_dprint_phase(NDEBUG_ANY, instance); - /* - * Return control to the executing NCR drive so we can clear the - * aborted flag and get back into our main loop. 
- */ + if (list_del_cmd(&hostdata->unissued, cmd)) { + dsprintk(NDEBUG_ABORT, instance, + "abort: removed %p from issue queue\n", cmd); + cmd->result = DID_ABORT << 16; + cmd->scsi_done(cmd); /* No tag or busy flag to worry about */ + } - if (do_abort(instance) == 0) { - hostdata->aborted = 1; - hostdata->connected = NULL; - cmd->result = DID_ABORT << 16; -#ifdef SUPPORT_TAGS - cmd_free_tag(cmd); -#else - hostdata->busy[cmd->device->id] &= ~(1 << cmd->device->lun); -#endif - maybe_release_dma_irq(instance); - local_irq_restore(flags); - cmd->scsi_done(cmd); - return SUCCESS; - } else { - local_irq_restore(flags); - printk("scsi%d: abort of connected command failed!\n", HOSTNO); - return FAILED; - } + if (hostdata->selecting == cmd) { + dsprintk(NDEBUG_ABORT, instance, + "abort: cmd %p == selecting\n", cmd); + hostdata->selecting = NULL; + cmd->result = DID_ABORT << 16; + complete_cmd(instance, cmd); + goto out; } -#endif - /* - * Case 2 : If the command hasn't been issued yet, we simply remove it - * from the issue queue. - */ - for (prev = (struct scsi_cmnd **)&(hostdata->issue_queue), - tmp = (struct scsi_cmnd *)hostdata->issue_queue; - tmp; prev = NEXTADDR(tmp), tmp = NEXT(tmp)) { - if (cmd == tmp) { - REMOVE(5, *prev, tmp, NEXT(tmp)); - (*prev) = NEXT(tmp); - SET_NEXT(tmp, NULL); - tmp->result = DID_ABORT << 16; - maybe_release_dma_irq(instance); - local_irq_restore(flags); - dprintk(NDEBUG_ABORT, "scsi%d: abort removed command from issue queue.\n", - HOSTNO); - /* Tagged queuing note: no tag to free here, hasn't been assigned - * yet... */ - tmp->scsi_done(tmp); - return SUCCESS; + if (list_del_cmd(&hostdata->disconnected, cmd)) { + dsprintk(NDEBUG_ABORT, instance, + "abort: removed %p from disconnected list\n", cmd); + cmd->result = DID_ERROR << 16; + if (!hostdata->connected) + NCR5380_select(instance, cmd); + if (hostdata->connected != cmd) { + complete_cmd(instance, cmd); + result = FAILED; + goto out; } } - /* - * Case 3 : If any commands are connected, we're going to fail the abort - * and let the high level SCSI driver retry at a later time or - * issue a reset. - * - * Timeouts, and therefore aborted commands, will be highly unlikely - * and handling them cleanly in this situation would make the common - * case of noresets less efficient, and would pollute our code. So, - * we fail. - */ + if (hostdata->connected == cmd) { + dsprintk(NDEBUG_ABORT, instance, "abort: cmd %p is connected\n", cmd); + hostdata->connected = NULL; + if (do_abort(instance)) { + set_host_byte(cmd, DID_ERROR); + complete_cmd(instance, cmd); + result = FAILED; + goto out; + } + set_host_byte(cmd, DID_ABORT); +#ifdef REAL_DMA + hostdata->dma_len = 0; +#endif + if (cmd->cmnd[0] == REQUEST_SENSE) + complete_cmd(instance, cmd); + else { + struct NCR5380_cmd *ncmd = scsi_cmd_priv(cmd); - if (hostdata->connected) { - local_irq_restore(flags); - dprintk(NDEBUG_ABORT, "scsi%d: abort failed, command connected.\n", HOSTNO); - return FAILED; + /* Perform autosense for this command */ + list_add(&ncmd->list, &hostdata->autosense); + } } - /* - * Case 4: If the command is currently disconnected from the bus, and - * there are no connected commands, we reconnect the I_T_L or - * I_T_L_Q nexus associated with it, go into message out, and send - * an abort message. - * - * This case is especially ugly. In order to reestablish the nexus, we - * need to call NCR5380_select(). 
The easiest way to implement this - * function was to abort if the bus was busy, and let the interrupt - * handler triggered on the SEL for reselect take care of lost arbitrations - * where necessary, meaning interrupts need to be enabled. - * - * When interrupts are enabled, the queues may change - so we - * can't remove it from the disconnected queue before selecting it - * because that could cause a failure in hashing the nexus if that - * device reselected. - * - * Since the queues may change, we can't use the pointers from when we - * first locate it. - * - * So, we must first locate the command, and if NCR5380_select() - * succeeds, then issue the abort, relocate the command and remove - * it from the disconnected queue. - */ - - for (tmp = (struct scsi_cmnd *) hostdata->disconnected_queue; tmp; - tmp = NEXT(tmp)) { - if (cmd == tmp) { - local_irq_restore(flags); - dprintk(NDEBUG_ABORT, "scsi%d: aborting disconnected command.\n", HOSTNO); - - if (NCR5380_select(instance, cmd)) - return FAILED; - - dprintk(NDEBUG_ABORT, "scsi%d: nexus reestablished.\n", HOSTNO); - - do_abort(instance); - - local_irq_save(flags); - for (prev = (struct scsi_cmnd **)&(hostdata->disconnected_queue), - tmp = (struct scsi_cmnd *)hostdata->disconnected_queue; - tmp; prev = NEXTADDR(tmp), tmp = NEXT(tmp)) { - if (cmd == tmp) { - REMOVE(5, *prev, tmp, NEXT(tmp)); - *prev = NEXT(tmp); - SET_NEXT(tmp, NULL); - tmp->result = DID_ABORT << 16; - /* We must unlock the tag/LUN immediately here, since the - * target goes to BUS FREE and doesn't send us another - * message (COMMAND_COMPLETE or the like) - */ -#ifdef SUPPORT_TAGS - cmd_free_tag(tmp); -#else - hostdata->busy[cmd->device->id] &= ~(1 << cmd->device->lun); -#endif - maybe_release_dma_irq(instance); - local_irq_restore(flags); - tmp->scsi_done(tmp); - return SUCCESS; - } - } + if (list_find_cmd(&hostdata->autosense, cmd)) { + dsprintk(NDEBUG_ABORT, instance, + "abort: found %p on sense queue\n", cmd); + spin_unlock_irqrestore(&hostdata->lock, flags); + queue_work(hostdata->work_q, &hostdata->main_task); + msleep(1000); + spin_lock_irqsave(&hostdata->lock, flags); + if (list_del_cmd(&hostdata->autosense, cmd)) { + dsprintk(NDEBUG_ABORT, instance, + "abort: removed %p from sense queue\n", cmd); + set_host_byte(cmd, DID_ABORT); + complete_cmd(instance, cmd); + goto out; } } - /* Maybe it is sufficient just to release the ST-DMA lock... (if - * possible at all) At least, we should check if the lock could be - * released after the abort, in case it is kept due to some bug. - */ - maybe_release_dma_irq(instance); - local_irq_restore(flags); + if (hostdata->connected == cmd) { + dsprintk(NDEBUG_ABORT, instance, "abort: cmd %p is connected\n", cmd); + hostdata->connected = NULL; + if (do_abort(instance)) { + set_host_byte(cmd, DID_ERROR); + complete_cmd(instance, cmd); + result = FAILED; + goto out; + } + set_host_byte(cmd, DID_ABORT); +#ifdef REAL_DMA + hostdata->dma_len = 0; +#endif + complete_cmd(instance, cmd); + } - /* - * Case 5 : If we reached this point, the command was not found in any of - * the queues. - * - * We probably reached this point because of an unlikely race condition - * between the command completing successfully and the abortion code, - * so we won't panic, but we will notify the user in case something really - * broke. 
- */ +out: + if (result == FAILED) + dsprintk(NDEBUG_ABORT, instance, "abort: failed to abort %p\n", cmd); + else + dsprintk(NDEBUG_ABORT, instance, "abort: successfully aborted %p\n", cmd); - printk(KERN_INFO "scsi%d: warning : SCSI command probably completed successfully before abortion\n", HOSTNO); + queue_work(hostdata->work_q, &hostdata->main_task); + maybe_release_dma_irq(instance); + spin_unlock_irqrestore(&hostdata->lock, flags); - return FAILED; + return result; } -/* - * Function : int NCR5380_reset (struct scsi_cmnd *cmd) - * - * Purpose : reset the SCSI bus. - * - * Returns : SUCCESS or FAILURE +/** + * NCR5380_bus_reset - reset the SCSI bus + * @cmd: SCSI command undergoing EH * + * Returns SUCCESS */ static int NCR5380_bus_reset(struct scsi_cmnd *cmd) @@ -2876,23 +2623,22 @@ static int NCR5380_bus_reset(struct scsi_cmnd *cmd) struct NCR5380_hostdata *hostdata = shost_priv(instance); int i; unsigned long flags; + struct NCR5380_cmd *ncmd; - NCR5380_print_status(instance); + spin_lock_irqsave(&hostdata->lock, flags); + +#if (NDEBUG & NDEBUG_ANY) + scmd_printk(KERN_INFO, cmd, __func__); +#endif + NCR5380_dprint(NDEBUG_ANY, instance); + NCR5380_dprint_phase(NDEBUG_ANY, instance); + + do_reset(instance); - /* get in phase */ - NCR5380_write(TARGET_COMMAND_REG, - PHASE_SR_TO_TCR(NCR5380_read(STATUS_REG))); - /* assert RST */ - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_RST); - udelay(40); /* reset NCR registers */ - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); NCR5380_write(MODE_REG, MR_BASE); NCR5380_write(TARGET_COMMAND_REG, 0); NCR5380_write(SELECT_ENABLE_REG, 0); - /* ++roman: reset interrupt condition! otherwise no interrupts don't get - * through anymore ... */ - (void)NCR5380_read(RESET_PARITY_INTERRUPT_REG); /* After the reset, there are no more connected or disconnected commands * and no busy units; so clear the low-level status here to avoid @@ -2900,17 +2646,34 @@ static int NCR5380_bus_reset(struct scsi_cmnd *cmd) * commands! 
*/ - if (hostdata->issue_queue) - dprintk(NDEBUG_ABORT, "scsi%d: reset aborted issued command(s)\n", H_NO(cmd)); - if (hostdata->connected) - dprintk(NDEBUG_ABORT, "scsi%d: reset aborted a connected command\n", H_NO(cmd)); - if (hostdata->disconnected_queue) - dprintk(NDEBUG_ABORT, "scsi%d: reset aborted disconnected command(s)\n", H_NO(cmd)); + hostdata->selecting = NULL; + + list_for_each_entry(ncmd, &hostdata->disconnected, list) { + struct scsi_cmnd *cmd = NCR5380_to_scmd(ncmd); + + set_host_byte(cmd, DID_RESET); + cmd->scsi_done(cmd); + } + + list_for_each_entry(ncmd, &hostdata->autosense, list) { + struct scsi_cmnd *cmd = NCR5380_to_scmd(ncmd); + + set_host_byte(cmd, DID_RESET); + cmd->scsi_done(cmd); + } + + if (hostdata->connected) { + set_host_byte(hostdata->connected, DID_RESET); + complete_cmd(instance, hostdata->connected); + hostdata->connected = NULL; + } + + if (hostdata->sensing) { + set_host_byte(hostdata->sensing, DID_RESET); + complete_cmd(instance, hostdata->sensing); + hostdata->sensing = NULL; + } - local_irq_save(flags); - hostdata->issue_queue = NULL; - hostdata->connected = NULL; - hostdata->disconnected_queue = NULL; #ifdef SUPPORT_TAGS free_all_tags(hostdata); #endif @@ -2920,8 +2683,9 @@ static int NCR5380_bus_reset(struct scsi_cmnd *cmd) hostdata->dma_len = 0; #endif + queue_work(hostdata->work_q, &hostdata->main_task); maybe_release_dma_irq(instance); - local_irq_restore(flags); + spin_unlock_irqrestore(&hostdata->lock, flags); return SUCCESS; } diff --git a/drivers/scsi/atari_scsi.c b/drivers/scsi/atari_scsi.c index 5ede3daa93dc..78d1b2963f2c 100644 --- a/drivers/scsi/atari_scsi.c +++ b/drivers/scsi/atari_scsi.c @@ -66,7 +66,6 @@ #include <linux/module.h> #include <linux/types.h> -#include <linux/delay.h> #include <linux/blkdev.h> #include <linux/interrupt.h> #include <linux/init.h> @@ -98,7 +97,6 @@ #define NCR5380_queue_command atari_scsi_queue_command #define NCR5380_abort atari_scsi_abort -#define NCR5380_show_info atari_scsi_show_info #define NCR5380_info atari_scsi_info #define NCR5380_dma_read_setup(instance, data, count) \ @@ -161,23 +159,10 @@ static inline unsigned long SCSI_DMA_GETADR(void) return adr; } -#define HOSTDATA_DMALEN (((struct NCR5380_hostdata *) \ - (atari_scsi_host->hostdata))->dma_len) - -/* Time (in jiffies) to wait after a reset; the SCSI standard calls for 250ms, - * we usually do 0.5s to be on the safe side. But Toshiba CD-ROMs once more - * need ten times the standard value... 
*/ -#ifndef CONFIG_ATARI_SCSI_TOSHIBA_DELAY -#define AFTER_RESET_DELAY (HZ/2) -#else -#define AFTER_RESET_DELAY (5*HZ/2) -#endif - #ifdef REAL_DMA static void atari_scsi_fetch_restbytes(void); #endif -static struct Scsi_Host *atari_scsi_host; static unsigned char (*atari_scsi_reg_read)(unsigned char reg); static void (*atari_scsi_reg_write)(unsigned char reg, unsigned char value); @@ -208,12 +193,12 @@ static int setup_cmd_per_lun = -1; module_param(setup_cmd_per_lun, int, 0); static int setup_sg_tablesize = -1; module_param(setup_sg_tablesize, int, 0); -#ifdef SUPPORT_TAGS static int setup_use_tagged_queuing = -1; module_param(setup_use_tagged_queuing, int, 0); -#endif static int setup_hostid = -1; module_param(setup_hostid, int, 0); +static int setup_toshiba_delay = -1; +module_param(setup_toshiba_delay, int, 0); #if defined(REAL_DMA) @@ -273,15 +258,17 @@ static void scsi_dma_buserr(int irq, void *dummy) #endif -static irqreturn_t scsi_tt_intr(int irq, void *dummy) +static irqreturn_t scsi_tt_intr(int irq, void *dev) { #ifdef REAL_DMA + struct Scsi_Host *instance = dev; + struct NCR5380_hostdata *hostdata = shost_priv(instance); int dma_stat; dma_stat = tt_scsi_dma.dma_ctrl; - dprintk(NDEBUG_INTR, "scsi%d: NCR5380 interrupt, DMA status = %02x\n", - atari_scsi_host->host_no, dma_stat & 0xff); + dsprintk(NDEBUG_INTR, instance, "NCR5380 interrupt, DMA status = %02x\n", + dma_stat & 0xff); /* Look if it was the DMA that has interrupted: First possibility * is that a bus error occurred... @@ -304,7 +291,8 @@ static irqreturn_t scsi_tt_intr(int irq, void *dummy) * data reg! */ if ((dma_stat & 0x02) && !(dma_stat & 0x40)) { - atari_dma_residual = HOSTDATA_DMALEN - (SCSI_DMA_READ_P(dma_addr) - atari_dma_startaddr); + atari_dma_residual = hostdata->dma_len - + (SCSI_DMA_READ_P(dma_addr) - atari_dma_startaddr); dprintk(NDEBUG_DMA, "SCSI DMA: There are %ld residual bytes.\n", atari_dma_residual); @@ -356,15 +344,17 @@ static irqreturn_t scsi_tt_intr(int irq, void *dummy) #endif /* REAL_DMA */ - NCR5380_intr(irq, dummy); + NCR5380_intr(irq, dev); return IRQ_HANDLED; } -static irqreturn_t scsi_falcon_intr(int irq, void *dummy) +static irqreturn_t scsi_falcon_intr(int irq, void *dev) { #ifdef REAL_DMA + struct Scsi_Host *instance = dev; + struct NCR5380_hostdata *hostdata = shost_priv(instance); int dma_stat; /* Turn off DMA and select sector counter register before @@ -399,7 +389,7 @@ static irqreturn_t scsi_falcon_intr(int irq, void *dummy) printk(KERN_ERR "SCSI DMA error: %ld bytes lost in " "ST-DMA fifo\n", transferred & 15); - atari_dma_residual = HOSTDATA_DMALEN - transferred; + atari_dma_residual = hostdata->dma_len - transferred; dprintk(NDEBUG_DMA, "SCSI DMA: There are %ld residual bytes.\n", atari_dma_residual); } else @@ -411,13 +401,14 @@ static irqreturn_t scsi_falcon_intr(int irq, void *dummy) * data to the original destination address. */ memcpy(atari_dma_orig_addr, phys_to_virt(atari_dma_startaddr), - HOSTDATA_DMALEN - atari_dma_residual); + hostdata->dma_len - atari_dma_residual); atari_dma_orig_addr = NULL; } #endif /* REAL_DMA */ - NCR5380_intr(irq, dummy); + NCR5380_intr(irq, dev); + return IRQ_HANDLED; } @@ -488,7 +479,7 @@ static int __init atari_scsi_setup(char *str) * Defaults depend on TT or Falcon, determined at run time. * Negative values mean don't change. 
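* Option order: can_queue, cmd_per_lun, sg_tablesize, hostid, tagged queuing, use_pdma (ignored here), toshiba delay.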
*/ - int ints[6]; + int ints[8]; get_options(str, ARRAY_SIZE(ints), ints); @@ -504,10 +495,11 @@ static int __init atari_scsi_setup(char *str) setup_sg_tablesize = ints[3]; if (ints[0] >= 4) setup_hostid = ints[4]; -#ifdef SUPPORT_TAGS if (ints[0] >= 5) setup_use_tagged_queuing = ints[5]; -#endif + /* ints[6] (use_pdma) is ignored */ + if (ints[0] >= 7) + setup_toshiba_delay = ints[7]; return 1; } @@ -516,38 +508,6 @@ __setup("atascsi=", atari_scsi_setup); #endif /* !MODULE */ -#ifdef CONFIG_ATARI_SCSI_RESET_BOOT -static void __init atari_scsi_reset_boot(void) -{ - unsigned long end; - - /* - * Do a SCSI reset to clean up the bus during initialization. No messing - * with the queues, interrupts, or locks necessary here. - */ - - printk("Atari SCSI: resetting the SCSI bus..."); - - /* get in phase */ - NCR5380_write(TARGET_COMMAND_REG, - PHASE_SR_TO_TCR(NCR5380_read(STATUS_REG))); - - /* assert RST */ - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_RST); - /* The min. reset hold time is 25us, so 40us should be enough */ - udelay(50); - /* reset RST and interrupt */ - NCR5380_write(INITIATOR_COMMAND_REG, ICR_BASE); - NCR5380_read(RESET_PARITY_INTERRUPT_REG); - - end = jiffies + AFTER_RESET_DELAY; - while (time_before(jiffies, end)) - barrier(); - - printk(" done\n"); -} -#endif - #if defined(REAL_DMA) static unsigned long atari_scsi_dma_setup(struct Scsi_Host *instance, @@ -815,14 +775,14 @@ static int atari_scsi_bus_reset(struct scsi_cmnd *cmd) static struct scsi_host_template atari_scsi_template = { .module = THIS_MODULE, .proc_name = DRV_MODULE_NAME, - .show_info = atari_scsi_show_info, .name = "Atari native SCSI", .info = atari_scsi_info, .queuecommand = atari_scsi_queue_command, .eh_abort_handler = atari_scsi_abort, .eh_bus_reset_handler = atari_scsi_bus_reset, .this_id = 7, - .use_clustering = DISABLE_CLUSTERING + .use_clustering = DISABLE_CLUSTERING, + .cmd_size = NCR5380_CMD_SIZE, }; static int __init atari_scsi_probe(struct platform_device *pdev) @@ -880,7 +840,7 @@ static int __init atari_scsi_probe(struct platform_device *pdev) } else { /* Test if a host id is set in the NVRam */ if (ATARIHW_PRESENT(TT_CLK) && nvram_check_checksum()) { - unsigned char b = nvram_read_byte(14); + unsigned char b = nvram_read_byte(16); /* Arbitration enabled? (for TOS) * If yes, use configured host ID @@ -915,21 +875,18 @@ static int __init atari_scsi_probe(struct platform_device *pdev) error = -ENOMEM; goto fail_alloc; } - atari_scsi_host = instance; - -#ifdef CONFIG_ATARI_SCSI_RESET_BOOT - atari_scsi_reset_boot(); -#endif instance->irq = irq->start; host_flags |= IS_A_TT() ? 0 : FLAG_LATE_DMA_SETUP; - #ifdef SUPPORT_TAGS host_flags |= setup_use_tagged_queuing > 0 ? FLAG_TAGGED_QUEUING : 0; #endif + host_flags |= setup_toshiba_delay > 0 ? 
FLAG_TOSHIBA_DELAY : 0; - NCR5380_init(instance, host_flags); + error = NCR5380_init(instance, host_flags); + if (error) + goto fail_init; if (IS_A_TT()) { error = request_irq(instance->irq, scsi_tt_intr, 0, @@ -975,6 +932,8 @@ static int __init atari_scsi_probe(struct platform_device *pdev) #endif } + NCR5380_maybe_reset_bus(instance); + error = scsi_add_host(instance, NULL); if (error) goto fail_host; @@ -989,6 +948,7 @@ fail_host: free_irq(instance->irq, instance); fail_irq: NCR5380_exit(instance); +fail_init: scsi_host_put(instance); fail_alloc: if (atari_dma_buffer) diff --git a/drivers/scsi/be2iscsi/Kconfig b/drivers/scsi/be2iscsi/Kconfig index 4e7cad272469..bad5f32e1f67 100644 --- a/drivers/scsi/be2iscsi/Kconfig +++ b/drivers/scsi/be2iscsi/Kconfig @@ -3,6 +3,7 @@ config BE2ISCSI depends on PCI && SCSI && NET select SCSI_ISCSI_ATTRS select ISCSI_BOOT_SYSFS + select IRQ_POLL help This driver implements the iSCSI functionality for Emulex diff --git a/drivers/scsi/be2iscsi/be.h b/drivers/scsi/be2iscsi/be.h index 77f992e74726..a41c6432f444 100644 --- a/drivers/scsi/be2iscsi/be.h +++ b/drivers/scsi/be2iscsi/be.h @@ -20,7 +20,7 @@ #include <linux/pci.h> #include <linux/if_vlan.h> -#include <linux/blk-iopoll.h> +#include <linux/irq_poll.h> #define FW_VER_LEN 32 #define MCC_Q_LEN 128 #define MCC_CQ_LEN 256 @@ -101,7 +101,7 @@ struct be_eq_obj { struct beiscsi_hba *phba; struct be_queue_info *cq; struct work_struct work_cqs; /* Work Item */ - struct blk_iopoll iopoll; + struct irq_poll iopoll; }; struct be_mcc_obj { diff --git a/drivers/scsi/be2iscsi/be_iscsi.c b/drivers/scsi/be2iscsi/be_iscsi.c index b7087ba69d8d..022e87b62e40 100644 --- a/drivers/scsi/be2iscsi/be_iscsi.c +++ b/drivers/scsi/be2iscsi/be_iscsi.c @@ -1292,9 +1292,9 @@ static void beiscsi_flush_cq(struct beiscsi_hba *phba) for (i = 0; i < phba->num_cpus; i++) { pbe_eq = &phwi_context->be_eq[i]; - blk_iopoll_disable(&pbe_eq->iopoll); + irq_poll_disable(&pbe_eq->iopoll); beiscsi_process_cq(pbe_eq); - blk_iopoll_enable(&pbe_eq->iopoll); + irq_poll_enable(&pbe_eq->iopoll); } } diff --git a/drivers/scsi/be2iscsi/be_main.c b/drivers/scsi/be2iscsi/be_main.c index fe0c5143f8e6..cb9072a841be 100644 --- a/drivers/scsi/be2iscsi/be_main.c +++ b/drivers/scsi/be2iscsi/be_main.c @@ -910,8 +910,7 @@ static irqreturn_t be_isr_msix(int irq, void *dev_id) num_eq_processed = 0; while (eqe->dw[offsetof(struct amap_eq_entry, valid) / 32] & EQE_VALID_MASK) { - if (!blk_iopoll_sched_prep(&pbe_eq->iopoll)) - blk_iopoll_sched(&pbe_eq->iopoll); + irq_poll_sched(&pbe_eq->iopoll); AMAP_SET_BITS(struct amap_eq_entry, valid, eqe, 0); queue_tail_inc(eq); @@ -972,8 +971,7 @@ static irqreturn_t be_isr(int irq, void *dev_id) spin_unlock_irqrestore(&phba->isr_lock, flags); num_mcceq_processed++; } else { - if (!blk_iopoll_sched_prep(&pbe_eq->iopoll)) - blk_iopoll_sched(&pbe_eq->iopoll); + irq_poll_sched(&pbe_eq->iopoll); num_ioeq_processed++; } AMAP_SET_BITS(struct amap_eq_entry, valid, eqe, 0); @@ -2295,7 +2293,7 @@ void beiscsi_process_all_cqs(struct work_struct *work) hwi_ring_eq_db(phba, pbe_eq->q.id, 0, 0, 1, 1); } -static int be_iopoll(struct blk_iopoll *iop, int budget) +static int be_iopoll(struct irq_poll *iop, int budget) { unsigned int ret; struct beiscsi_hba *phba; @@ -2306,7 +2304,7 @@ static int be_iopoll(struct blk_iopoll *iop, int budget) pbe_eq->cq_count += ret; if (ret < budget) { phba = pbe_eq->phba; - blk_iopoll_complete(iop); + irq_poll_complete(iop); beiscsi_log(phba, KERN_INFO, BEISCSI_LOG_CONFIG | BEISCSI_LOG_IO, "BM_%d : rearm pbe_eq->q.id 
=%d\n", @@ -5293,7 +5291,7 @@ static void beiscsi_quiesce(struct beiscsi_hba *phba, for (i = 0; i < phba->num_cpus; i++) { pbe_eq = &phwi_context->be_eq[i]; - blk_iopoll_disable(&pbe_eq->iopoll); + irq_poll_disable(&pbe_eq->iopoll); } if (unload_state == BEISCSI_CLEAN_UNLOAD) { @@ -5579,9 +5577,8 @@ static void beiscsi_eeh_resume(struct pci_dev *pdev) for (i = 0; i < phba->num_cpus; i++) { pbe_eq = &phwi_context->be_eq[i]; - blk_iopoll_init(&pbe_eq->iopoll, be_iopoll_budget, + irq_poll_init(&pbe_eq->iopoll, be_iopoll_budget, be_iopoll); - blk_iopoll_enable(&pbe_eq->iopoll); } i = (phba->msix_enabled) ? i : 0; @@ -5752,9 +5749,8 @@ static int beiscsi_dev_probe(struct pci_dev *pcidev, for (i = 0; i < phba->num_cpus; i++) { pbe_eq = &phwi_context->be_eq[i]; - blk_iopoll_init(&pbe_eq->iopoll, be_iopoll_budget, + irq_poll_init(&pbe_eq->iopoll, be_iopoll_budget, be_iopoll); - blk_iopoll_enable(&pbe_eq->iopoll); } i = (phba->msix_enabled) ? i : 0; @@ -5795,7 +5791,7 @@ free_blkenbld: destroy_workqueue(phba->wq); for (i = 0; i < phba->num_cpus; i++) { pbe_eq = &phwi_context->be_eq[i]; - blk_iopoll_disable(&pbe_eq->iopoll); + irq_poll_disable(&pbe_eq->iopoll); } free_twq: beiscsi_clean_port(phba); diff --git a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c index 0e2bee937fe8..e22a268fd311 100644 --- a/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c +++ b/drivers/scsi/cxgbi/cxgb3i/cxgb3i.c @@ -57,7 +57,7 @@ MODULE_PARM_DESC(cxgb3i_snd_win, "TCP send window in bytes (default=128KB)"); static int cxgb3i_rx_credit_thres = 10 * 1024; module_param(cxgb3i_rx_credit_thres, int, 0644); -MODULE_PARM_DESC(rx_credit_thres, +MODULE_PARM_DESC(cxgb3i_rx_credit_thres, "RX credits return threshold in bytes (default=10KB)"); static unsigned int cxgb3i_max_connect = 8 * 1024; diff --git a/drivers/scsi/dmx3191d.c b/drivers/scsi/dmx3191d.c index 3e088125a8be..6c14e68b9e1a 100644 --- a/drivers/scsi/dmx3191d.c +++ b/drivers/scsi/dmx3191d.c @@ -36,17 +36,10 @@ #define DONT_USE_INTR -#define NCR5380_read(reg) inb(port + reg) -#define NCR5380_write(reg, value) outb(value, port + reg) +#define NCR5380_read(reg) inb(instance->io_port + reg) +#define NCR5380_write(reg, value) outb(value, instance->io_port + reg) #define NCR5380_implementation_fields /* none */ -#define NCR5380_local_declare() unsigned int port -#define NCR5380_setup(instance) port = instance->io_port - -/* - * Includes needed for NCR5380.[ch] (XXX: Move them to NCR5380.h) - */ -#include <linux/delay.h> #include "NCR5380.h" #include "NCR5380.c" @@ -56,6 +49,7 @@ static struct scsi_host_template dmx3191d_driver_template = { + .module = THIS_MODULE, .proc_name = DMX3191D_DRIVER_NAME, .name = "Domex DMX3191D", .info = NCR5380_info, @@ -67,6 +61,8 @@ static struct scsi_host_template dmx3191d_driver_template = { .sg_tablesize = SG_ALL, .cmd_per_lun = 2, .use_clustering = DISABLE_CLUSTERING, + .cmd_size = NCR5380_CMD_SIZE, + .max_sectors = 128, }; static int dmx3191d_probe_one(struct pci_dev *pdev, @@ -97,17 +93,25 @@ static int dmx3191d_probe_one(struct pci_dev *pdev, */ shost->irq = NO_IRQ; - NCR5380_init(shost, FLAG_NO_PSEUDO_DMA | FLAG_DTC3181E); + error = NCR5380_init(shost, FLAG_NO_PSEUDO_DMA); + if (error) + goto out_host_put; + + NCR5380_maybe_reset_bus(shost); pci_set_drvdata(pdev, shost); error = scsi_add_host(shost, &pdev->dev); if (error) - goto out_release_region; + goto out_exit; scsi_scan_host(shost); return 0; +out_exit: + NCR5380_exit(shost); +out_host_put: + scsi_host_put(shost); out_release_region: release_region(io, 
DMX3191D_REGION_LEN); out_disable_device: @@ -119,15 +123,14 @@ static int dmx3191d_probe_one(struct pci_dev *pdev, static void dmx3191d_remove_one(struct pci_dev *pdev) { struct Scsi_Host *shost = pci_get_drvdata(pdev); + unsigned long io = shost->io_port; scsi_remove_host(shost); NCR5380_exit(shost); - - release_region(shost->io_port, DMX3191D_REGION_LEN); - pci_disable_device(pdev); - scsi_host_put(shost); + release_region(io, DMX3191D_REGION_LEN); + pci_disable_device(pdev); } static struct pci_device_id dmx3191d_pci_tbl[] = { diff --git a/drivers/scsi/dtc.c b/drivers/scsi/dtc.c index 4c74c7ba2dff..6c736b071cf4 100644 --- a/drivers/scsi/dtc.c +++ b/drivers/scsi/dtc.c @@ -1,9 +1,5 @@ - #define PSEUDO_DMA #define DONT_USE_INTR -#define UNSAFE /* Leave interrupts enabled during pseudo-dma I/O */ -#define DMA_WORKS_RIGHT - /* * DTC 3180/3280 driver, by @@ -50,15 +46,13 @@ #include <linux/module.h> -#include <linux/signal.h> #include <linux/blkdev.h> -#include <linux/delay.h> -#include <linux/stat.h> #include <linux/string.h> #include <linux/init.h> #include <linux/interrupt.h> #include <linux/io.h> #include <scsi/scsi_host.h> + #include "dtc.h" #define AUTOPROBE_IRQ #include "NCR5380.h" @@ -150,7 +144,7 @@ static const struct signature { static int __init dtc_setup(char *str) { - static int commandline_current = 0; + static int commandline_current; int i; int ints[10]; @@ -188,7 +182,7 @@ __setup("dtc=", dtc_setup); static int __init dtc_detect(struct scsi_host_template * tpnt) { - static int current_override = 0, current_base = 0; + static int current_override, current_base; struct Scsi_Host *instance; unsigned int addr; void __iomem *base; @@ -205,9 +199,8 @@ static int __init dtc_detect(struct scsi_host_template * tpnt) addr = 0; } else for (; !addr && (current_base < NO_BASES); ++current_base) { -#if (DTCDEBUG & DTCDEBUG_INIT) - printk(KERN_DEBUG "scsi-dtc : probing address %08x\n", bases[current_base].address); -#endif + dprintk(NDEBUG_INIT, "dtc: probing address 0x%08x\n", + (unsigned int)bases[current_base].address); if (bases[current_base].noauto) continue; base = ioremap(bases[current_base].address, 0x2000); @@ -216,18 +209,14 @@ static int __init dtc_detect(struct scsi_host_template * tpnt) for (sig = 0; sig < NO_SIGNATURES; ++sig) { if (check_signature(base + signatures[sig].offset, signatures[sig].string, strlen(signatures[sig].string))) { addr = bases[current_base].address; -#if (DTCDEBUG & DTCDEBUG_INIT) - printk(KERN_DEBUG "scsi-dtc : detected board.\n"); -#endif + dprintk(NDEBUG_INIT, "dtc: detected board\n"); goto found; } } iounmap(base); } -#if defined(DTCDEBUG) && (DTCDEBUG & DTCDEBUG_INIT) - printk(KERN_DEBUG "scsi-dtc : base = %08x\n", addr); -#endif + dprintk(NDEBUG_INIT, "dtc: addr = 0x%08x\n", addr); if (!addr) break; @@ -235,12 +224,15 @@ static int __init dtc_detect(struct scsi_host_template * tpnt) found: instance = scsi_register(tpnt, sizeof(struct NCR5380_hostdata)); if (instance == NULL) - break; + goto out_unmap; instance->base = addr; ((struct NCR5380_hostdata *)(instance)->hostdata)->base = base; - NCR5380_init(instance, 0); + if (NCR5380_init(instance, FLAG_NO_DMA_FIXUP)) + goto out_unregister; + + NCR5380_maybe_reset_bus(instance); NCR5380_write(DTC_CONTROL_REG, CSR_5380_INTR); /* Enable int's */ if (overrides[current_override].irq != IRQ_AUTO) @@ -271,14 +263,19 @@ found: printk(KERN_WARNING "scsi%d : interrupts not used. 
Might as well not jumper it.\n", instance->host_no); instance->irq = NO_IRQ; #endif -#if defined(DTCDEBUG) && (DTCDEBUG & DTCDEBUG_INIT) - printk("scsi%d : irq = %d\n", instance->host_no, instance->irq); -#endif + dprintk(NDEBUG_INIT, "scsi%d : irq = %d\n", + instance->host_no, instance->irq); ++current_override; ++count; } return count; + +out_unregister: + scsi_unregister(instance); +out_unmap: + iounmap(base); + return count; } /* @@ -331,12 +328,8 @@ static inline int NCR5380_pread(struct Scsi_Host *instance, unsigned char *dst, unsigned char *d = dst; int i; /* For counting time spent in the poll-loop */ struct NCR5380_hostdata *hostdata = shost_priv(instance); - NCR5380_local_declare(); - NCR5380_setup(instance); i = 0; - NCR5380_read(RESET_PARITY_INTERRUPT_REG); - NCR5380_write(MODE_REG, MR_ENABLE_EOP_INTR | MR_DMA_MODE); if (instance->irq == NO_IRQ) NCR5380_write(DTC_CONTROL_REG, CSR_DIR_READ); else @@ -348,7 +341,7 @@ static inline int NCR5380_pread(struct Scsi_Host *instance, unsigned char *dst, while (NCR5380_read(DTC_CONTROL_REG) & CSR_HOST_BUF_NOT_RDY) ++i; rtrc(3); - memcpy_fromio(d, base + DTC_DATA_BUF, 128); + memcpy_fromio(d, hostdata->base + DTC_DATA_BUF, 128); d += 128; len -= 128; rtrc(7); @@ -358,9 +351,7 @@ static inline int NCR5380_pread(struct Scsi_Host *instance, unsigned char *dst, rtrc(4); while (!(NCR5380_read(DTC_CONTROL_REG) & D_CR_ACCESS)) ++i; - NCR5380_write(MODE_REG, 0); /* Clear the operating mode */ rtrc(0); - NCR5380_read(RESET_PARITY_INTERRUPT_REG); if (i > hostdata->spin_max_r) hostdata->spin_max_r = i; return (0); @@ -383,12 +374,7 @@ static inline int NCR5380_pwrite(struct Scsi_Host *instance, unsigned char *src, { int i; struct NCR5380_hostdata *hostdata = shost_priv(instance); - NCR5380_local_declare(); - NCR5380_setup(instance); - NCR5380_read(RESET_PARITY_INTERRUPT_REG); - NCR5380_write(MODE_REG, MR_ENABLE_EOP_INTR | MR_DMA_MODE); - /* set direction (write) */ if (instance->irq == NO_IRQ) NCR5380_write(DTC_CONTROL_REG, 0); else @@ -400,7 +386,7 @@ static inline int NCR5380_pwrite(struct Scsi_Host *instance, unsigned char *src, while (NCR5380_read(DTC_CONTROL_REG) & CSR_HOST_BUF_NOT_RDY) ++i; rtrc(3); - memcpy_toio(base + DTC_DATA_BUF, src, 128); + memcpy_toio(hostdata->base + DTC_DATA_BUF, src, 128); src += 128; len -= 128; } @@ -413,47 +399,60 @@ static inline int NCR5380_pwrite(struct Scsi_Host *instance, unsigned char *src, ++i; rtrc(7); /* Check for parity error here. fixme. */ - NCR5380_write(MODE_REG, 0); /* Clear the operating mode */ rtrc(0); if (i > hostdata->spin_max_w) hostdata->spin_max_w = i; return (0); } +static int dtc_dma_xfer_len(struct scsi_cmnd *cmd) +{ + int transfersize = cmd->transfersize; + + /* Limit transfers to 32K, for xx400 & xx406 + * pseudoDMA that transfers in 128 bytes blocks. 
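+ * (the 32K cap is applied only when this_residual is a non-zero multiple of the transfer size; see the test below)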
+ */ + if (transfersize > 32 * 1024 && cmd->SCp.this_residual && + !(cmd->SCp.this_residual % transfersize)) + transfersize = 32 * 1024; + + return transfersize; +} + MODULE_LICENSE("GPL"); #include "NCR5380.c" static int dtc_release(struct Scsi_Host *shost) { - NCR5380_local_declare(); - NCR5380_setup(shost); + struct NCR5380_hostdata *hostdata = shost_priv(shost); + if (shost->irq != NO_IRQ) free_irq(shost->irq, shost); NCR5380_exit(shost); - if (shost->io_port && shost->n_io_port) - release_region(shost->io_port, shost->n_io_port); scsi_unregister(shost); - iounmap(base); + iounmap(hostdata->base); return 0; } static struct scsi_host_template driver_template = { - .name = "DTC 3180/3280 ", - .detect = dtc_detect, - .release = dtc_release, - .proc_name = "dtc3x80", - .show_info = dtc_show_info, - .write_info = dtc_write_info, - .info = dtc_info, - .queuecommand = dtc_queue_command, - .eh_abort_handler = dtc_abort, - .eh_bus_reset_handler = dtc_bus_reset, - .bios_param = dtc_biosparam, - .can_queue = CAN_QUEUE, - .this_id = 7, - .sg_tablesize = SG_ALL, - .cmd_per_lun = CMD_PER_LUN, - .use_clustering = DISABLE_CLUSTERING, + .name = "DTC 3180/3280", + .detect = dtc_detect, + .release = dtc_release, + .proc_name = "dtc3x80", + .show_info = dtc_show_info, + .write_info = dtc_write_info, + .info = dtc_info, + .queuecommand = dtc_queue_command, + .eh_abort_handler = dtc_abort, + .eh_bus_reset_handler = dtc_bus_reset, + .bios_param = dtc_biosparam, + .can_queue = 32, + .this_id = 7, + .sg_tablesize = SG_ALL, + .cmd_per_lun = 2, + .use_clustering = DISABLE_CLUSTERING, + .cmd_size = NCR5380_CMD_SIZE, + .max_sectors = 128, }; #include "scsi_module.c" diff --git a/drivers/scsi/dtc.h b/drivers/scsi/dtc.h index 78a2332e9064..56732cba8aba 100644 --- a/drivers/scsi/dtc.h +++ b/drivers/scsi/dtc.h @@ -10,54 +10,17 @@ #ifndef DTC3280_H #define DTC3280_H -#define DTCDEBUG 0 -#define DTCDEBUG_INIT 0x1 -#define DTCDEBUG_TRANSFER 0x2 - -#ifndef CMD_PER_LUN -#define CMD_PER_LUN 2 -#endif - -#ifndef CAN_QUEUE -#define CAN_QUEUE 32 -#endif - #define NCR5380_implementation_fields \ void __iomem *base -#define NCR5380_local_declare() \ - void __iomem *base - -#define NCR5380_setup(instance) \ - base = ((struct NCR5380_hostdata *)(instance)->hostdata)->base +#define DTC_address(reg) \ + (((struct NCR5380_hostdata *)shost_priv(instance))->base + DTC_5380_OFFSET + reg) -#define DTC_address(reg) (base + DTC_5380_OFFSET + reg) - -#define dbNCR5380_read(reg) \ - (rval=readb(DTC_address(reg)), \ - (((unsigned char) printk("DTC : read register %d at addr %p is: %02x\n"\ - , (reg), DTC_address(reg), rval)), rval ) ) - -#define dbNCR5380_write(reg, value) do { \ - printk("DTC : write %02x to register %d at address %p\n", \ - (value), (reg), DTC_address(reg)); \ - writeb(value, DTC_address(reg));} while(0) - - -#if !(DTCDEBUG & DTCDEBUG_TRANSFER) #define NCR5380_read(reg) (readb(DTC_address(reg))) #define NCR5380_write(reg, value) (writeb(value, DTC_address(reg))) -#else -#define NCR5380_read(reg) (readb(DTC_address(reg))) -#define xNCR5380_read(reg) \ - (((unsigned char) printk("DTC : read register %d at address %p\n"\ - , (reg), DTC_address(reg))), readb(DTC_address(reg))) -#define NCR5380_write(reg, value) do { \ - printk("DTC : write %02x to register %d at address %p\n", \ - (value), (reg), DTC_address(reg)); \ - writeb(value, DTC_address(reg));} while(0) -#endif +#define NCR5380_dma_xfer_len(instance, cmd, phase) \ + dtc_dma_xfer_len(cmd) #define NCR5380_intr dtc_intr #define NCR5380_queue_command dtc_queue_command diff 
--git a/drivers/scsi/g_NCR5380.c b/drivers/scsi/g_NCR5380.c index f8d2478b11cc..90091e693020 100644 --- a/drivers/scsi/g_NCR5380.c +++ b/drivers/scsi/g_NCR5380.c @@ -56,40 +56,31 @@ * */ -/* settings for DTC3181E card with only Mustek scanner attached */ -#define USLEEP_POLL msecs_to_jiffies(10) -#define USLEEP_SLEEP msecs_to_jiffies(200) -#define USLEEP_WAITLONG msecs_to_jiffies(5000) - #define AUTOPROBE_IRQ #ifdef CONFIG_SCSI_GENERIC_NCR53C400 -#define NCR53C400_PSEUDO_DMA 1 #define PSEUDO_DMA -#define NCR53C400 #endif #include <asm/io.h> -#include <linux/signal.h> #include <linux/blkdev.h> +#include <linux/module.h> #include <scsi/scsi_host.h> #include "g_NCR5380.h" #include "NCR5380.h" -#include <linux/stat.h> #include <linux/init.h> #include <linux/ioport.h> #include <linux/isapnp.h> -#include <linux/delay.h> #include <linux/interrupt.h> -#define NCR_NOT_SET 0 -static int ncr_irq = NCR_NOT_SET; -static int ncr_dma = NCR_NOT_SET; -static int ncr_addr = NCR_NOT_SET; -static int ncr_5380 = NCR_NOT_SET; -static int ncr_53c400 = NCR_NOT_SET; -static int ncr_53c400a = NCR_NOT_SET; -static int dtc_3181e = NCR_NOT_SET; +static int ncr_irq; +static int ncr_dma; +static int ncr_addr; +static int ncr_5380; +static int ncr_53c400; +static int ncr_53c400a; +static int dtc_3181e; +static int hp_c2502; static struct override { NCR5380_map_type NCR5380_map_name; @@ -121,7 +112,7 @@ static struct override { static void __init internal_setup(int board, char *str, int *ints) { - static int commandline_current = 0; + static int commandline_current; switch (board) { case BOARD_NCR5380: if (ints[0] != 2 && ints[0] != 3) { @@ -235,6 +226,30 @@ static int __init do_DTC3181E_setup(char *str) #endif +#ifndef SCSI_G_NCR5380_MEM +/* + * Configure I/O address of 53C400A or DTC436 by writing magic numbers + * to ports 0x779 and 0x379. + */ +static void magic_configure(int idx, u8 irq, u8 magic[]) +{ + u8 cfg = 0; + + outb(magic[0], 0x779); + outb(magic[1], 0x379); + outb(magic[2], 0x379); + outb(magic[3], 0x379); + outb(magic[4], 0x379); + + /* allowed IRQs for HP C2502 */ + if (irq != 2 && irq != 3 && irq != 4 && irq != 5 && irq != 7) + irq = 0; + if (idx >= 0 && idx <= 7) + cfg = 0x80 | idx | (irq << 4); + outb(cfg, 0x379); +} +#endif + /** * generic_NCR5380_detect - look for NCR5380 controllers * @tpnt: the scsi template @@ -243,19 +258,18 @@ static int __init do_DTC3181E_setup(char *str) * and DTC436(ISAPnP) controllers. If overrides have been set we use * them. * - * The caller supplied NCR5380_init function is invoked from here, before - * the interrupt line is taken. 
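The last byte magic_configure() writes to port 0x379 carries the whole configuration: bit 7 enables the card, bits 0-2 hold the index into the probe-port table, and bits 4-6 hold the IRQ. A minimal user-space sketch of that packing (the bit layout and the IRQ whitelist come from the hunk above; the example index and IRQ are invented):

#include <stdio.h>

typedef unsigned char u8;

static u8 pack_cfg(int idx, u8 irq)
{
	/* allowed IRQs for HP C2502 */
	if (irq != 2 && irq != 3 && irq != 4 && irq != 5 && irq != 7)
		irq = 0;
	if (idx < 0 || idx > 7)
		return 0;		/* out of range: leave card disabled */
	return 0x80 | idx | (irq << 4);	/* enable | port index | IRQ */
}

int main(void)
{
	printf("cfg = 0x%02x\n", pack_cfg(2, 5));	/* prints 0xd2 */
	return 0;
}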
- * * Locks: none */ static int __init generic_NCR5380_detect(struct scsi_host_template *tpnt) { - static int current_override = 0; + static int current_override; int count; unsigned int *ports; + u8 *magic = NULL; #ifndef SCSI_G_NCR5380_MEM int i; + int port_idx = -1; unsigned long region_size = 16; #endif static unsigned int __initdata ncr_53c400a_ports[] = { @@ -264,27 +278,36 @@ static int __init generic_NCR5380_detect(struct scsi_host_template *tpnt) static unsigned int __initdata dtc_3181e_ports[] = { 0x220, 0x240, 0x280, 0x2a0, 0x2c0, 0x300, 0x320, 0x340, 0 }; - int flags = 0; + static u8 ncr_53c400a_magic[] __initdata = { /* 53C400A & DTC436 */ + 0x59, 0xb9, 0xc5, 0xae, 0xa6 + }; + static u8 hp_c2502_magic[] __initdata = { /* HP C2502 */ + 0x0f, 0x22, 0xf0, 0x20, 0x80 + }; + int flags; struct Scsi_Host *instance; + struct NCR5380_hostdata *hostdata; #ifdef SCSI_G_NCR5380_MEM unsigned long base; void __iomem *iomem; #endif - if (ncr_irq != NCR_NOT_SET) + if (ncr_irq) overrides[0].irq = ncr_irq; - if (ncr_dma != NCR_NOT_SET) + if (ncr_dma) overrides[0].dma = ncr_dma; - if (ncr_addr != NCR_NOT_SET) + if (ncr_addr) overrides[0].NCR5380_map_name = (NCR5380_map_type) ncr_addr; - if (ncr_5380 != NCR_NOT_SET) + if (ncr_5380) overrides[0].board = BOARD_NCR5380; - else if (ncr_53c400 != NCR_NOT_SET) + else if (ncr_53c400) overrides[0].board = BOARD_NCR53C400; - else if (ncr_53c400a != NCR_NOT_SET) + else if (ncr_53c400a) overrides[0].board = BOARD_NCR53C400A; - else if (dtc_3181e != NCR_NOT_SET) + else if (dtc_3181e) overrides[0].board = BOARD_DTC3181E; + else if (hp_c2502) + overrides[0].board = BOARD_HP_C2502; #ifndef SCSI_G_NCR5380_MEM if (!current_override && isapnp_present()) { struct pnp_dev *dev = NULL; @@ -318,41 +341,45 @@ static int __init generic_NCR5380_detect(struct scsi_host_template *tpnt) } } #endif - tpnt->proc_name = "g_NCR5380"; for (count = 0; current_override < NO_OVERRIDES; ++current_override) { if (!(overrides[current_override].NCR5380_map_name)) continue; ports = NULL; + flags = 0; switch (overrides[current_override].board) { case BOARD_NCR5380: flags = FLAG_NO_PSEUDO_DMA; break; case BOARD_NCR53C400: - flags = FLAG_NCR53C400; +#ifdef PSEUDO_DMA + flags = FLAG_NO_DMA_FIXUP; +#endif break; case BOARD_NCR53C400A: - flags = FLAG_NO_PSEUDO_DMA; + flags = FLAG_NO_DMA_FIXUP; + ports = ncr_53c400a_ports; + magic = ncr_53c400a_magic; + break; + case BOARD_HP_C2502: + flags = FLAG_NO_DMA_FIXUP; ports = ncr_53c400a_ports; + magic = hp_c2502_magic; break; case BOARD_DTC3181E: - flags = FLAG_NO_PSEUDO_DMA | FLAG_DTC3181E; + flags = FLAG_NO_DMA_FIXUP; ports = dtc_3181e_ports; + magic = ncr_53c400a_magic; break; } #ifndef SCSI_G_NCR5380_MEM - if (ports) { + if (ports && magic) { /* wakeup sequence for the NCR53C400A and DTC3181E */ /* Disable the adapter and look for a free io port */ - outb(0x59, 0x779); - outb(0xb9, 0x379); - outb(0xc5, 0x379); - outb(0xae, 0x379); - outb(0xa6, 0x379); - outb(0x00, 0x379); + magic_configure(-1, 0, magic); if (overrides[current_override].NCR5380_map_name != PORT_AUTO) for (i = 0; ports[i]; i++) { @@ -371,17 +398,12 @@ static int __init generic_NCR5380_detect(struct scsi_host_template *tpnt) } if (ports[i]) { /* At this point we have our region reserved */ - outb(0x59, 0x779); - outb(0xb9, 0x379); - outb(0xc5, 0x379); - outb(0xae, 0x379); - outb(0xa6, 0x379); - outb(0x80 | i, 0x379); /* set io port to be used */ + magic_configure(i, 0, magic); /* no IRQ yet */ outb(0xc0, ports[i] + 9); if (inb(ports[i] + 9) != 0x80) continue; - else - 
overrides[current_override].NCR5380_map_name = ports[i]; + overrides[current_override].NCR5380_map_name = ports[i]; + port_idx = i; } else continue; } @@ -403,24 +425,65 @@ static int __init generic_NCR5380_detect(struct scsi_host_template *tpnt) } #endif instance = scsi_register(tpnt, sizeof(struct NCR5380_hostdata)); - if (instance == NULL) { -#ifndef SCSI_G_NCR5380_MEM - release_region(overrides[current_override].NCR5380_map_name, region_size); -#else - iounmap(iomem); - release_mem_region(base, NCR5380_region_size); -#endif - continue; - } + if (instance == NULL) + goto out_release; + hostdata = shost_priv(instance); - instance->NCR5380_instance_name = overrides[current_override].NCR5380_map_name; #ifndef SCSI_G_NCR5380_MEM + instance->io_port = overrides[current_override].NCR5380_map_name; instance->n_io_port = region_size; + hostdata->io_width = 1; /* 8-bit PDMA by default */ + + /* + * On NCR53C400 boards, NCR5380 registers are mapped 8 past + * the base address. + */ + switch (overrides[current_override].board) { + case BOARD_NCR53C400: + instance->io_port += 8; + hostdata->c400_ctl_status = 0; + hostdata->c400_blk_cnt = 1; + hostdata->c400_host_buf = 4; + break; + case BOARD_DTC3181E: + hostdata->io_width = 2; /* 16-bit PDMA */ + /* fall through */ + case BOARD_NCR53C400A: + case BOARD_HP_C2502: + hostdata->c400_ctl_status = 9; + hostdata->c400_blk_cnt = 10; + hostdata->c400_host_buf = 8; + break; + } #else - ((struct NCR5380_hostdata *)instance->hostdata)->iomem = iomem; + instance->base = overrides[current_override].NCR5380_map_name; + hostdata->iomem = iomem; + switch (overrides[current_override].board) { + case BOARD_NCR53C400: + hostdata->c400_ctl_status = 0x100; + hostdata->c400_blk_cnt = 0x101; + hostdata->c400_host_buf = 0x104; + break; + case BOARD_DTC3181E: + case BOARD_NCR53C400A: + case BOARD_HP_C2502: + pr_err(DRV_MODULE_NAME ": unknown register offsets\n"); + goto out_unregister; + } #endif - NCR5380_init(instance, flags); + if (NCR5380_init(instance, flags)) + goto out_unregister; + + switch (overrides[current_override].board) { + case BOARD_NCR53C400: + case BOARD_DTC3181E: + case BOARD_NCR53C400A: + case BOARD_HP_C2502: + NCR5380_write(hostdata->c400_ctl_status, CSR_BASE); + } + + NCR5380_maybe_reset_bus(instance); if (overrides[current_override].irq != IRQ_AUTO) instance->irq = overrides[current_override].irq; @@ -431,12 +494,18 @@ static int __init generic_NCR5380_detect(struct scsi_host_template *tpnt) if (instance->irq == 255) instance->irq = NO_IRQ; - if (instance->irq != NO_IRQ) + if (instance->irq != NO_IRQ) { +#ifndef SCSI_G_NCR5380_MEM + /* set IRQ for HP C2502 */ + if (overrides[current_override].board == BOARD_HP_C2502) + magic_configure(port_idx, instance->irq, magic); +#endif if (request_irq(instance->irq, generic_NCR5380_intr, 0, "NCR5380", instance)) { printk(KERN_WARNING "scsi%d : IRQ%d not free, interrupts disabled\n", instance->host_no, instance->irq); instance->irq = NO_IRQ; } + } if (instance->irq == NO_IRQ) { printk(KERN_INFO "scsi%d : interrupts not enabled. 
for better interactive performance,\n", instance->host_no); @@ -447,6 +516,17 @@ static int __init generic_NCR5380_detect(struct scsi_host_template *tpnt) ++count; } return count; + +out_unregister: + scsi_unregister(instance); +out_release: +#ifndef SCSI_G_NCR5380_MEM + release_region(overrides[current_override].NCR5380_map_name, region_size); +#else + iounmap(iomem); + release_mem_region(base, NCR5380_region_size); +#endif + return count; } /** @@ -460,21 +540,15 @@ static int __init generic_NCR5380_detect(struct scsi_host_template *tpnt) static int generic_NCR5380_release_resources(struct Scsi_Host *instance) { - NCR5380_local_declare(); - NCR5380_setup(instance); - if (instance->irq != NO_IRQ) free_irq(instance->irq, instance); NCR5380_exit(instance); - #ifndef SCSI_G_NCR5380_MEM - release_region(instance->NCR5380_instance_name, instance->n_io_port); + release_region(instance->io_port, instance->n_io_port); #else iounmap(((struct NCR5380_hostdata *)instance->hostdata)->iomem); - release_mem_region(instance->NCR5380_instance_name, NCR5380_region_size); + release_mem_region(instance->base, NCR5380_region_size); #endif - - return 0; } @@ -507,7 +581,7 @@ generic_NCR5380_biosparam(struct scsi_device *sdev, struct block_device *bdev, } #endif -#ifdef NCR53C400_PSEUDO_DMA +#ifdef PSEUDO_DMA /** * NCR5380_pread - pseudo DMA read @@ -521,75 +595,68 @@ generic_NCR5380_biosparam(struct scsi_device *sdev, struct block_device *bdev, static inline int NCR5380_pread(struct Scsi_Host *instance, unsigned char *dst, int len) { + struct NCR5380_hostdata *hostdata = shost_priv(instance); int blocks = len / 128; int start = 0; - int bl; - - NCR5380_local_declare(); - NCR5380_setup(instance); - NCR5380_write(C400_CONTROL_STATUS_REG, CSR_BASE | CSR_TRANS_DIR); - NCR5380_write(C400_BLOCK_COUNTER_REG, blocks); + NCR5380_write(hostdata->c400_ctl_status, CSR_BASE | CSR_TRANS_DIR); + NCR5380_write(hostdata->c400_blk_cnt, blocks); while (1) { - if ((bl = NCR5380_read(C400_BLOCK_COUNTER_REG)) == 0) { + if (NCR5380_read(hostdata->c400_blk_cnt) == 0) break; - } - if (NCR5380_read(C400_CONTROL_STATUS_REG) & CSR_GATED_53C80_IRQ) { + if (NCR5380_read(hostdata->c400_ctl_status) & CSR_GATED_53C80_IRQ) { printk(KERN_ERR "53C400r: Got 53C80_IRQ start=%d, blocks=%d\n", start, blocks); return -1; } - while (NCR5380_read(C400_CONTROL_STATUS_REG) & CSR_HOST_BUF_NOT_RDY); + while (NCR5380_read(hostdata->c400_ctl_status) & CSR_HOST_BUF_NOT_RDY) + ; /* FIXME - no timeout */ #ifndef SCSI_G_NCR5380_MEM - { - int i; - for (i = 0; i < 128; i++) - dst[start + i] = NCR5380_read(C400_HOST_BUFFER); - } + if (hostdata->io_width == 2) + insw(instance->io_port + hostdata->c400_host_buf, + dst + start, 64); + else + insb(instance->io_port + hostdata->c400_host_buf, + dst + start, 128); #else /* implies SCSI_G_NCR5380_MEM */ - memcpy_fromio(dst + start, iomem + NCR53C400_host_buffer, 128); + memcpy_fromio(dst + start, + hostdata->iomem + NCR53C400_host_buffer, 128); #endif start += 128; blocks--; } if (blocks) { - while (NCR5380_read(C400_CONTROL_STATUS_REG) & CSR_HOST_BUF_NOT_RDY) - { - // FIXME - no timeout - } + while (NCR5380_read(hostdata->c400_ctl_status) & CSR_HOST_BUF_NOT_RDY) + ; /* FIXME - no timeout */ #ifndef SCSI_G_NCR5380_MEM - { - int i; - for (i = 0; i < 128; i++) - dst[start + i] = NCR5380_read(C400_HOST_BUFFER); - } + if (hostdata->io_width == 2) + insw(instance->io_port + hostdata->c400_host_buf, + dst + start, 64); + else + insb(instance->io_port + hostdata->c400_host_buf, + dst + start, 128); #else /* implies 
SCSI_G_NCR5380_MEM */ - memcpy_fromio(dst + start, iomem + NCR53C400_host_buffer, 128); + memcpy_fromio(dst + start, + hostdata->iomem + NCR53C400_host_buffer, 128); #endif start += 128; blocks--; } - if (!(NCR5380_read(C400_CONTROL_STATUS_REG) & CSR_GATED_53C80_IRQ)) + if (!(NCR5380_read(hostdata->c400_ctl_status) & CSR_GATED_53C80_IRQ)) printk("53C400r: no 53C80 gated irq after transfer"); -#if 0 - /* - * DON'T DO THIS - THEY NEVER ARRIVE! - */ - printk("53C400r: Waiting for 53C80 registers\n"); - while (NCR5380_read(C400_CONTROL_STATUS_REG) & CSR_53C80_REG) + /* wait for 53C80 registers to be available */ + while (!(NCR5380_read(hostdata->c400_ctl_status) & CSR_53C80_REG)) ; -#endif + if (!(NCR5380_read(BUS_AND_STATUS_REG) & BASR_END_DMA_TRANSFER)) printk(KERN_ERR "53C400r: no end dma signal\n"); - NCR5380_write(MODE_REG, MR_BASE); - NCR5380_read(RESET_PARITY_INTERRUPT_REG); return 0; } @@ -605,89 +672,91 @@ static inline int NCR5380_pread(struct Scsi_Host *instance, unsigned char *dst, static inline int NCR5380_pwrite(struct Scsi_Host *instance, unsigned char *src, int len) { + struct NCR5380_hostdata *hostdata = shost_priv(instance); int blocks = len / 128; int start = 0; - int bl; - int i; - NCR5380_local_declare(); - NCR5380_setup(instance); - - NCR5380_write(C400_CONTROL_STATUS_REG, CSR_BASE); - NCR5380_write(C400_BLOCK_COUNTER_REG, blocks); + NCR5380_write(hostdata->c400_ctl_status, CSR_BASE); + NCR5380_write(hostdata->c400_blk_cnt, blocks); while (1) { - if (NCR5380_read(C400_CONTROL_STATUS_REG) & CSR_GATED_53C80_IRQ) { + if (NCR5380_read(hostdata->c400_ctl_status) & CSR_GATED_53C80_IRQ) { printk(KERN_ERR "53C400w: Got 53C80_IRQ start=%d, blocks=%d\n", start, blocks); return -1; } - if ((bl = NCR5380_read(C400_BLOCK_COUNTER_REG)) == 0) { + if (NCR5380_read(hostdata->c400_blk_cnt) == 0) break; - } - while (NCR5380_read(C400_CONTROL_STATUS_REG) & CSR_HOST_BUF_NOT_RDY) + while (NCR5380_read(hostdata->c400_ctl_status) & CSR_HOST_BUF_NOT_RDY) ; // FIXME - timeout #ifndef SCSI_G_NCR5380_MEM - { - for (i = 0; i < 128; i++) - NCR5380_write(C400_HOST_BUFFER, src[start + i]); - } + if (hostdata->io_width == 2) + outsw(instance->io_port + hostdata->c400_host_buf, + src + start, 64); + else + outsb(instance->io_port + hostdata->c400_host_buf, + src + start, 128); #else /* implies SCSI_G_NCR5380_MEM */ - memcpy_toio(iomem + NCR53C400_host_buffer, src + start, 128); + memcpy_toio(hostdata->iomem + NCR53C400_host_buffer, + src + start, 128); #endif start += 128; blocks--; } if (blocks) { - while (NCR5380_read(C400_CONTROL_STATUS_REG) & CSR_HOST_BUF_NOT_RDY) + while (NCR5380_read(hostdata->c400_ctl_status) & CSR_HOST_BUF_NOT_RDY) ; // FIXME - no timeout #ifndef SCSI_G_NCR5380_MEM - { - for (i = 0; i < 128; i++) - NCR5380_write(C400_HOST_BUFFER, src[start + i]); - } + if (hostdata->io_width == 2) + outsw(instance->io_port + hostdata->c400_host_buf, + src + start, 64); + else + outsb(instance->io_port + hostdata->c400_host_buf, + src + start, 128); #else /* implies SCSI_G_NCR5380_MEM */ - memcpy_toio(iomem + NCR53C400_host_buffer, src + start, 128); + memcpy_toio(hostdata->iomem + NCR53C400_host_buffer, + src + start, 128); #endif start += 128; blocks--; } -#if 0 - printk("53C400w: waiting for registers to be available\n"); - THEY NEVER DO ! while (NCR5380_read(C400_CONTROL_STATUS_REG) & CSR_53C80_REG); - printk("53C400w: Got em\n"); -#endif - - /* Let's wait for this instead - could be ugly */ - /* All documentation says to check for this. Maybe my hardware is too - * fast. 
Waiting for it seems to work fine! KLL - */ - while (!(i = NCR5380_read(C400_CONTROL_STATUS_REG) & CSR_GATED_53C80_IRQ)) - ; // FIXME - no timeout - - /* - * I know. i is certainly != 0 here but the loop is new. See previous - * comment. - */ - if (i) { - if (!((i = NCR5380_read(BUS_AND_STATUS_REG)) & BASR_END_DMA_TRANSFER)) - printk(KERN_ERR "53C400w: No END OF DMA bit - WHOOPS! BASR=%0x\n", i); - } else - printk(KERN_ERR "53C400w: no 53C80 gated irq after transfer (last block)\n"); + /* wait for 53C80 registers to be available */ + while (!(NCR5380_read(hostdata->c400_ctl_status) & CSR_53C80_REG)) { + udelay(4); /* DTC436 chip hangs without this */ + /* FIXME - no timeout */ + } -#if 0 if (!(NCR5380_read(BUS_AND_STATUS_REG) & BASR_END_DMA_TRANSFER)) { printk(KERN_ERR "53C400w: no end dma signal\n"); } -#endif + while (!(NCR5380_read(TARGET_COMMAND_REG) & TCR_LAST_BYTE_SENT)) ; // TIMEOUT return 0; } -#endif /* PSEUDO_DMA */ + +static int generic_NCR5380_dma_xfer_len(struct scsi_cmnd *cmd) +{ + int transfersize = cmd->transfersize; + + /* Limit transfers to 32K, for xx400 & xx406 + * pseudoDMA that transfers in 128 bytes blocks. + */ + if (transfersize > 32 * 1024 && cmd->SCp.this_residual && + !(cmd->SCp.this_residual % transfersize)) + transfersize = 32 * 1024; + + /* 53C400 datasheet: non-modulo-128-byte transfers should use PIO */ + if (transfersize % 128) + transfersize = 0; + + return transfersize; +} + +#endif /* PSEUDO_DMA */ /* * Include the NCR5380 core code that we build our driver around @@ -696,22 +765,24 @@ static inline int NCR5380_pwrite(struct Scsi_Host *instance, unsigned char *src, #include "NCR5380.c" static struct scsi_host_template driver_template = { - .show_info = generic_NCR5380_show_info, - .name = "Generic NCR5380/NCR53C400 SCSI", - .detect = generic_NCR5380_detect, - .release = generic_NCR5380_release_resources, - .info = generic_NCR5380_info, - .queuecommand = generic_NCR5380_queue_command, + .proc_name = DRV_MODULE_NAME, + .name = "Generic NCR5380/NCR53C400 SCSI", + .detect = generic_NCR5380_detect, + .release = generic_NCR5380_release_resources, + .info = generic_NCR5380_info, + .queuecommand = generic_NCR5380_queue_command, .eh_abort_handler = generic_NCR5380_abort, .eh_bus_reset_handler = generic_NCR5380_bus_reset, - .bios_param = NCR5380_BIOSPARAM, - .can_queue = CAN_QUEUE, - .this_id = 7, - .sg_tablesize = SG_ALL, - .cmd_per_lun = CMD_PER_LUN, - .use_clustering = DISABLE_CLUSTERING, + .bios_param = NCR5380_BIOSPARAM, + .can_queue = 16, + .this_id = 7, + .sg_tablesize = SG_ALL, + .cmd_per_lun = 2, + .use_clustering = DISABLE_CLUSTERING, + .cmd_size = NCR5380_CMD_SIZE, + .max_sectors = 128, }; -#include <linux/module.h> + #include "scsi_module.c" module_param(ncr_irq, int, 0); @@ -721,6 +792,7 @@ module_param(ncr_5380, int, 0); module_param(ncr_53c400, int, 0); module_param(ncr_53c400a, int, 0); module_param(dtc_3181e, int, 0); +module_param(hp_c2502, int, 0); MODULE_LICENSE("GPL"); #if !defined(SCSI_G_NCR5380_MEM) && defined(MODULE) diff --git a/drivers/scsi/g_NCR5380.h b/drivers/scsi/g_NCR5380.h index bea1a3b9b862..6f3d2ac4f185 100644 --- a/drivers/scsi/g_NCR5380.h +++ b/drivers/scsi/g_NCR5380.h @@ -14,81 +14,67 @@ #ifndef GENERIC_NCR5380_H #define GENERIC_NCR5380_H -#ifdef NCR53C400 +#ifdef CONFIG_SCSI_GENERIC_NCR53C400 #define BIOSPARAM #define NCR5380_BIOSPARAM generic_NCR5380_biosparam #else #define NCR5380_BIOSPARAM NULL #endif -#ifndef ASM - -#ifndef CMD_PER_LUN -#define CMD_PER_LUN 2 -#endif - -#ifndef CAN_QUEUE -#define CAN_QUEUE 16 -#endif - 
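The transfer-length policy in generic_NCR5380_dma_xfer_len() above is easiest to check with concrete numbers. A small user-space sketch (the two tests mirror the function; the three sample requests are invented):

#include <stdio.h>

static int dma_xfer_len(int transfersize, int this_residual)
{
	/* clamp big requests to 32K, matching the 128-byte block engine */
	if (transfersize > 32 * 1024 && this_residual &&
	    !(this_residual % transfersize))
		transfersize = 32 * 1024;

	/* non-modulo-128 lengths fall back to PIO (0 means no PDMA) */
	if (transfersize % 128)
		transfersize = 0;

	return transfersize;
}

int main(void)
{
	printf("%d\n", dma_xfer_len(512, 4096));	/* 512: PDMA as-is */
	printf("%d\n", dma_xfer_len(36864, 73728));	/* 32768: clamped */
	printf("%d\n", dma_xfer_len(510, 1020));	/* 0: PIO fallback */
	return 0;
}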
#define __STRVAL(x) #x #define STRVAL(x) __STRVAL(x) #ifndef SCSI_G_NCR5380_MEM +#define DRV_MODULE_NAME "g_NCR5380" -#define NCR5380_map_config port #define NCR5380_map_type int #define NCR5380_map_name port -#define NCR5380_instance_name io_port -#define NCR53C400_register_offset 0 -#define NCR53C400_address_adjust 8 -#ifdef NCR53C400 +#ifdef CONFIG_SCSI_GENERIC_NCR53C400 #define NCR5380_region_size 16 #else #define NCR5380_region_size 8 #endif -#define NCR5380_read(reg) (inb(NCR5380_map_name + (reg))) -#define NCR5380_write(reg, value) (outb((value), (NCR5380_map_name + (reg)))) +#define NCR5380_read(reg) \ + inb(instance->io_port + (reg)) +#define NCR5380_write(reg, value) \ + outb(value, instance->io_port + (reg)) #define NCR5380_implementation_fields \ - NCR5380_map_type NCR5380_map_name - -#define NCR5380_local_declare() \ - register NCR5380_implementation_fields - -#define NCR5380_setup(instance) \ - NCR5380_map_name = (NCR5380_map_type)((instance)->NCR5380_instance_name) + int c400_ctl_status; \ + int c400_blk_cnt; \ + int c400_host_buf; \ + int io_width; #else /* therefore SCSI_G_NCR5380_MEM */ +#define DRV_MODULE_NAME "g_NCR5380_mmio" -#define NCR5380_map_config memory #define NCR5380_map_type unsigned long #define NCR5380_map_name base -#define NCR5380_instance_name base -#define NCR53C400_register_offset 0x108 -#define NCR53C400_address_adjust 0 #define NCR53C400_mem_base 0x3880 #define NCR53C400_host_buffer 0x3900 #define NCR5380_region_size 0x3a00 -#define NCR5380_read(reg) readb(iomem + NCR53C400_mem_base + (reg)) -#define NCR5380_write(reg, value) writeb(value, iomem + NCR53C400_mem_base + (reg)) +#define NCR5380_read(reg) \ + readb(((struct NCR5380_hostdata *)shost_priv(instance))->iomem + \ + NCR53C400_mem_base + (reg)) +#define NCR5380_write(reg, value) \ + writeb(value, ((struct NCR5380_hostdata *)shost_priv(instance))->iomem + \ + NCR53C400_mem_base + (reg)) #define NCR5380_implementation_fields \ - NCR5380_map_type NCR5380_map_name; \ - void __iomem *iomem; - -#define NCR5380_local_declare() \ - register void __iomem *iomem - -#define NCR5380_setup(instance) \ - iomem = (((struct NCR5380_hostdata *)(instance)->hostdata)->iomem) + void __iomem *iomem; \ + int c400_ctl_status; \ + int c400_blk_cnt; \ + int c400_host_buf; #endif +#define NCR5380_dma_xfer_len(instance, cmd, phase) \ + generic_NCR5380_dma_xfer_len(cmd) + #define NCR5380_intr generic_NCR5380_intr #define NCR5380_queue_command generic_NCR5380_queue_command #define NCR5380_abort generic_NCR5380_abort @@ -102,7 +88,7 @@ #define BOARD_NCR53C400 1 #define BOARD_NCR53C400A 2 #define BOARD_DTC3181E 3 +#define BOARD_HP_C2502 4 -#endif /* ndef ASM */ #endif /* GENERIC_NCR5380_H */ diff --git a/drivers/scsi/hisi_sas/hisi_sas_v1_hw.c b/drivers/scsi/hisi_sas/hisi_sas_v1_hw.c index d54381149c0d..057fdeb720ac 100644 --- a/drivers/scsi/hisi_sas/hisi_sas_v1_hw.c +++ b/drivers/scsi/hisi_sas/hisi_sas_v1_hw.c @@ -247,41 +247,36 @@ /* ITCT header */ /* qw0 */ #define ITCT_HDR_DEV_TYPE_OFF 0 -#define ITCT_HDR_DEV_TYPE_MSK (0x3 << ITCT_HDR_DEV_TYPE_OFF) +#define ITCT_HDR_DEV_TYPE_MSK (0x3ULL << ITCT_HDR_DEV_TYPE_OFF) #define ITCT_HDR_VALID_OFF 2 -#define ITCT_HDR_VALID_MSK (0x1 << ITCT_HDR_VALID_OFF) -#define ITCT_HDR_BREAK_REPLY_ENA_OFF 3 -#define ITCT_HDR_BREAK_REPLY_ENA_MSK (0x1 << ITCT_HDR_BREAK_REPLY_ENA_OFF) +#define ITCT_HDR_VALID_MSK (0x1ULL << ITCT_HDR_VALID_OFF) #define ITCT_HDR_AWT_CONTROL_OFF 4 -#define ITCT_HDR_AWT_CONTROL_MSK (0x1 << ITCT_HDR_AWT_CONTROL_OFF) +#define ITCT_HDR_AWT_CONTROL_MSK (0x1ULL << 
ITCT_HDR_AWT_CONTROL_OFF) #define ITCT_HDR_MAX_CONN_RATE_OFF 5 -#define ITCT_HDR_MAX_CONN_RATE_MSK (0xf << ITCT_HDR_MAX_CONN_RATE_OFF) +#define ITCT_HDR_MAX_CONN_RATE_MSK (0xfULL << ITCT_HDR_MAX_CONN_RATE_OFF) #define ITCT_HDR_VALID_LINK_NUM_OFF 9 -#define ITCT_HDR_VALID_LINK_NUM_MSK (0xf << ITCT_HDR_VALID_LINK_NUM_OFF) +#define ITCT_HDR_VALID_LINK_NUM_MSK (0xfULL << ITCT_HDR_VALID_LINK_NUM_OFF) #define ITCT_HDR_PORT_ID_OFF 13 -#define ITCT_HDR_PORT_ID_MSK (0x7 << ITCT_HDR_PORT_ID_OFF) +#define ITCT_HDR_PORT_ID_MSK (0x7ULL << ITCT_HDR_PORT_ID_OFF) #define ITCT_HDR_SMP_TIMEOUT_OFF 16 -#define ITCT_HDR_SMP_TIMEOUT_MSK (0xffff << ITCT_HDR_SMP_TIMEOUT_OFF) -#define ITCT_HDR_MAX_BURST_BYTES_OFF 16 -#define ITCT_HDR_MAX_BURST_BYTES_MSK (0xffffffff << \ - ITCT_MAX_BURST_BYTES_OFF) +#define ITCT_HDR_SMP_TIMEOUT_MSK (0xffffULL << ITCT_HDR_SMP_TIMEOUT_OFF) /* qw1 */ #define ITCT_HDR_MAX_SAS_ADDR_OFF 0 #define ITCT_HDR_MAX_SAS_ADDR_MSK (0xffffffffffffffff << \ ITCT_HDR_MAX_SAS_ADDR_OFF) /* qw2 */ #define ITCT_HDR_IT_NEXUS_LOSS_TL_OFF 0 -#define ITCT_HDR_IT_NEXUS_LOSS_TL_MSK (0xffff << \ +#define ITCT_HDR_IT_NEXUS_LOSS_TL_MSK (0xffffULL << \ ITCT_HDR_IT_NEXUS_LOSS_TL_OFF) #define ITCT_HDR_BUS_INACTIVE_TL_OFF 16 -#define ITCT_HDR_BUS_INACTIVE_TL_MSK (0xffff << \ +#define ITCT_HDR_BUS_INACTIVE_TL_MSK (0xffffULL << \ ITCT_HDR_BUS_INACTIVE_TL_OFF) #define ITCT_HDR_MAX_CONN_TL_OFF 32 -#define ITCT_HDR_MAX_CONN_TL_MSK (0xffff << \ +#define ITCT_HDR_MAX_CONN_TL_MSK (0xffffULL << \ ITCT_HDR_MAX_CONN_TL_OFF) #define ITCT_HDR_REJ_OPEN_TL_OFF 48 -#define ITCT_HDR_REJ_OPEN_TL_MSK (0xffff << \ - ITCT_REJ_OPEN_TL_OFF) +#define ITCT_HDR_REJ_OPEN_TL_MSK (0xffffULL << \ + ITCT_HDR_REJ_OPEN_TL_OFF) /* Err record header */ #define ERR_HDR_DMA_TX_ERR_TYPE_OFF 0 @@ -533,10 +528,10 @@ static void setup_itct_v1_hw(struct hisi_hba *hisi_hba, itct->sas_addr = __swab64(itct->sas_addr); /* qw2 */ - itct->qw2 = cpu_to_le64((500 < ITCT_HDR_IT_NEXUS_LOSS_TL_OFF) | - (0xff00 < ITCT_HDR_BUS_INACTIVE_TL_OFF) | - (0xff00 < ITCT_HDR_MAX_CONN_TL_OFF) | - (0xff00 < ITCT_HDR_REJ_OPEN_TL_OFF)); + itct->qw2 = cpu_to_le64((500ULL << ITCT_HDR_IT_NEXUS_LOSS_TL_OFF) | + (0xff00ULL << ITCT_HDR_BUS_INACTIVE_TL_OFF) | + (0xff00ULL << ITCT_HDR_MAX_CONN_TL_OFF) | + (0xff00ULL << ITCT_HDR_REJ_OPEN_TL_OFF)); } static void free_device_v1_hw(struct hisi_hba *hisi_hba, @@ -544,7 +539,8 @@ static void free_device_v1_hw(struct hisi_hba *hisi_hba, { u64 dev_id = sas_dev->device_id; struct hisi_sas_itct *itct = &hisi_hba->itct[dev_id]; - u32 qw0, reg_val = hisi_sas_read32(hisi_hba, CFG_AGING_TIME); + u64 qw0; + u32 reg_val = hisi_sas_read32(hisi_hba, CFG_AGING_TIME); reg_val |= CFG_AGING_TIME_ITCT_REL_MSK; hisi_sas_write32(hisi_hba, CFG_AGING_TIME, reg_val); diff --git a/drivers/scsi/imm.c b/drivers/scsi/imm.c index 4e1a632ccf16..f8b88fa78e62 100644 --- a/drivers/scsi/imm.c +++ b/drivers/scsi/imm.c @@ -43,6 +43,7 @@ typedef struct { unsigned dp:1; /* Data phase present */ unsigned rd:1; /* Read data in data phase */ unsigned wanted:1; /* Parport sharing busy flag */ + unsigned int dev_no; /* Device number */ wait_queue_head_t *waiting; struct Scsi_Host *host; struct list_head list; @@ -1120,15 +1121,40 @@ static struct scsi_host_template imm_template = { static LIST_HEAD(imm_hosts); +/* + * Finds the first available device number that can be allotted to the + * new imm device and returns the address of the previous node so that + * we can add to the tail and have a list in ascending order.
+ */ + +static inline imm_struct *find_parent(void) +{ + imm_struct *dev, *par = NULL; + unsigned int cnt = 0; + + if (list_empty(&imm_hosts)) + return NULL; + + list_for_each_entry(dev, &imm_hosts, list) { + if (dev->dev_no != cnt) + return par; + cnt++; + par = dev; + } + + return par; +} + static int __imm_attach(struct parport *pb) { struct Scsi_Host *host; - imm_struct *dev; + imm_struct *dev, *temp; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(waiting); DEFINE_WAIT(wait); int ports; int modes, ppb; int err = -ENOMEM; + struct pardev_cb imm_cb; init_waitqueue_head(&waiting); @@ -1141,9 +1167,15 @@ static int __imm_attach(struct parport *pb) dev->mode = IMM_AUTODETECT; INIT_LIST_HEAD(&dev->list); - dev->dev = parport_register_device(pb, "imm", NULL, imm_wakeup, - NULL, 0, dev); + temp = find_parent(); + if (temp) + dev->dev_no = temp->dev_no + 1; + + memset(&imm_cb, 0, sizeof(imm_cb)); + imm_cb.private = dev; + imm_cb.wakeup = imm_wakeup; + dev->dev = parport_register_dev_model(pb, "imm", &imm_cb, dev->dev_no); if (!dev->dev) goto out; @@ -1207,7 +1239,10 @@ static int __imm_attach(struct parport *pb) host->unique_id = pb->number; *(imm_struct **)&host->hostdata = dev; dev->host = host; - list_add_tail(&dev->list, &imm_hosts); + if (!temp) + list_add_tail(&dev->list, &imm_hosts); + else + list_add_tail(&dev->list, &temp->list); err = scsi_add_host(host, NULL); if (err) goto out2; @@ -1245,9 +1280,10 @@ static void imm_detach(struct parport *pb) } static struct parport_driver imm_driver = { - .name = "imm", - .attach = imm_attach, - .detach = imm_detach, + .name = "imm", + .match_port = imm_attach, + .detach = imm_detach, + .devmodel = true, }; static int __init imm_driver_init(void) diff --git a/drivers/scsi/ipr.c b/drivers/scsi/ipr.c index 536cd5a80422..3b3e0998fa6e 100644 --- a/drivers/scsi/ipr.c +++ b/drivers/scsi/ipr.c @@ -3638,7 +3638,7 @@ static struct device_attribute ipr_ioa_reset_attr = { .store = ipr_store_reset_adapter }; -static int ipr_iopoll(struct blk_iopoll *iop, int budget); +static int ipr_iopoll(struct irq_poll *iop, int budget); /** * ipr_show_iopoll_weight - Show ipr polling mode * @dev: class device struct @@ -3681,34 +3681,33 @@ static ssize_t ipr_store_iopoll_weight(struct device *dev, int i; if (!ioa_cfg->sis64) { - dev_info(&ioa_cfg->pdev->dev, "blk-iopoll not supported on this adapter\n"); + dev_info(&ioa_cfg->pdev->dev, "irq_poll not supported on this adapter\n"); return -EINVAL; } if (kstrtoul(buf, 10, &user_iopoll_weight)) return -EINVAL; if (user_iopoll_weight > 256) { - dev_info(&ioa_cfg->pdev->dev, "Invalid blk-iopoll weight. It must be less than 256\n"); + dev_info(&ioa_cfg->pdev->dev, "Invalid irq_poll weight. 
It must be less than 256\n"); return -EINVAL; } if (user_iopoll_weight == ioa_cfg->iopoll_weight) { - dev_info(&ioa_cfg->pdev->dev, "Current blk-iopoll weight has the same weight\n"); + dev_info(&ioa_cfg->pdev->dev, "Current irq_poll weight has the same weight\n"); return strlen(buf); } if (ioa_cfg->iopoll_weight && ioa_cfg->sis64 && ioa_cfg->nvectors > 1) { for (i = 1; i < ioa_cfg->hrrq_num; i++) - blk_iopoll_disable(&ioa_cfg->hrrq[i].iopoll); + irq_poll_disable(&ioa_cfg->hrrq[i].iopoll); } spin_lock_irqsave(shost->host_lock, lock_flags); ioa_cfg->iopoll_weight = user_iopoll_weight; if (ioa_cfg->iopoll_weight && ioa_cfg->sis64 && ioa_cfg->nvectors > 1) { for (i = 1; i < ioa_cfg->hrrq_num; i++) { - blk_iopoll_init(&ioa_cfg->hrrq[i].iopoll, + irq_poll_init(&ioa_cfg->hrrq[i].iopoll, ioa_cfg->iopoll_weight, ipr_iopoll); - blk_iopoll_enable(&ioa_cfg->hrrq[i].iopoll); } } spin_unlock_irqrestore(shost->host_lock, lock_flags); @@ -4003,13 +4002,12 @@ static ssize_t ipr_store_update_fw(struct device *dev, struct ipr_sglist *sglist; char fname[100]; char *src; - int len, result, dnld_size; + int result, dnld_size; if (!capable(CAP_SYS_ADMIN)) return -EACCES; - len = snprintf(fname, 99, "%s", buf); - fname[len-1] = '\0'; + snprintf(fname, sizeof(fname), "%s", buf); if (request_firmware(&fw_entry, fname, &ioa_cfg->pdev->dev)) { dev_err(&ioa_cfg->pdev->dev, "Firmware file %s not found\n", fname); @@ -5569,7 +5567,7 @@ static int ipr_process_hrrq(struct ipr_hrr_queue *hrr_queue, int budget, return num_hrrq; } -static int ipr_iopoll(struct blk_iopoll *iop, int budget) +static int ipr_iopoll(struct irq_poll *iop, int budget) { struct ipr_ioa_cfg *ioa_cfg; struct ipr_hrr_queue *hrrq; @@ -5585,7 +5583,7 @@ static int ipr_iopoll(struct blk_iopoll *iop, int budget) completed_ops = ipr_process_hrrq(hrrq, budget, &doneq); if (completed_ops < budget) - blk_iopoll_complete(iop); + irq_poll_complete(iop); spin_unlock_irqrestore(hrrq->lock, hrrq_flags); list_for_each_entry_safe(ipr_cmd, temp, &doneq, queue) { @@ -5693,8 +5691,7 @@ static irqreturn_t ipr_isr_mhrrq(int irq, void *devp) if (ioa_cfg->iopoll_weight && ioa_cfg->sis64 && ioa_cfg->nvectors > 1) { if ((be32_to_cpu(*hrrq->hrrq_curr) & IPR_HRRQ_TOGGLE_BIT) == hrrq->toggle_bit) { - if (!blk_iopoll_sched_prep(&hrrq->iopoll)) - blk_iopoll_sched(&hrrq->iopoll); + irq_poll_sched(&hrrq->iopoll); spin_unlock_irqrestore(hrrq->lock, hrrq_flags); return IRQ_HANDLED; } @@ -10405,9 +10402,8 @@ static int ipr_probe(struct pci_dev *pdev, const struct pci_device_id *dev_id) if (ioa_cfg->iopoll_weight && ioa_cfg->sis64 && ioa_cfg->nvectors > 1) { for (i = 1; i < ioa_cfg->hrrq_num; i++) { - blk_iopoll_init(&ioa_cfg->hrrq[i].iopoll, + irq_poll_init(&ioa_cfg->hrrq[i].iopoll, ioa_cfg->iopoll_weight, ipr_iopoll); - blk_iopoll_enable(&ioa_cfg->hrrq[i].iopoll); } } @@ -10436,7 +10432,7 @@ static void ipr_shutdown(struct pci_dev *pdev) if (ioa_cfg->iopoll_weight && ioa_cfg->sis64 && ioa_cfg->nvectors > 1) { ioa_cfg->iopoll_weight = 0; for (i = 1; i < ioa_cfg->hrrq_num; i++) - blk_iopoll_disable(&ioa_cfg->hrrq[i].iopoll); + irq_poll_disable(&ioa_cfg->hrrq[i].iopoll); } while (ioa_cfg->in_reset_reload) { diff --git a/drivers/scsi/ipr.h b/drivers/scsi/ipr.h index a34c7a5a995e..56c57068300a 100644 --- a/drivers/scsi/ipr.h +++ b/drivers/scsi/ipr.h @@ -32,7 +32,7 @@ #include <linux/libata.h> #include <linux/list.h> #include <linux/kref.h> -#include <linux/blk-iopoll.h> +#include <linux/irq_poll.h> #include <scsi/scsi.h> #include <scsi/scsi_cmnd.h> @@ -517,7 +517,7 @@ struct ipr_hrr_queue 
{ u8 allow_cmds:1; u8 removing_ioa:1; - struct blk_iopoll iopoll; + struct irq_poll iopoll; }; /* Command packet structure */ diff --git a/drivers/scsi/mac_scsi.c b/drivers/scsi/mac_scsi.c index d64a769b8155..bb2381314a2b 100644 --- a/drivers/scsi/mac_scsi.c +++ b/drivers/scsi/mac_scsi.c @@ -12,7 +12,6 @@ */ #include <linux/types.h> -#include <linux/delay.h> #include <linux/module.h> #include <linux/ioport.h> #include <linux/init.h> @@ -32,14 +31,13 @@ #define PSEUDO_DMA #define NCR5380_implementation_fields unsigned char *pdma_base -#define NCR5380_local_declare() struct Scsi_Host *_instance -#define NCR5380_setup(instance) _instance = instance -#define NCR5380_read(reg) macscsi_read(_instance, reg) -#define NCR5380_write(reg, value) macscsi_write(_instance, reg, value) +#define NCR5380_read(reg) macscsi_read(instance, reg) +#define NCR5380_write(reg, value) macscsi_write(instance, reg, value) #define NCR5380_pread macscsi_pread #define NCR5380_pwrite macscsi_pwrite +#define NCR5380_dma_xfer_len(instance, cmd, phase) (cmd->transfersize) #define NCR5380_intr macscsi_intr #define NCR5380_queue_command macscsi_queue_command @@ -51,8 +49,6 @@ #include "NCR5380.h" -#define RESET_BOOT - static int setup_can_queue = -1; module_param(setup_can_queue, int, 0); static int setup_cmd_per_lun = -1; @@ -65,17 +61,8 @@ static int setup_use_tagged_queuing = -1; module_param(setup_use_tagged_queuing, int, 0); static int setup_hostid = -1; module_param(setup_hostid, int, 0); - -/* Time (in jiffies) to wait after a reset; the SCSI standard calls for 250ms, - * we usually do 0.5s to be on the safe side. But Toshiba CD-ROMs once more - * need ten times the standard value... */ -#define TOSHIBA_DELAY - -#ifdef TOSHIBA_DELAY -#define AFTER_RESET_DELAY (5*HZ/2) -#else -#define AFTER_RESET_DELAY (HZ/2) -#endif +static int setup_toshiba_delay = -1; +module_param(setup_toshiba_delay, int, 0); /* * NCR 5380 register access functions @@ -94,12 +81,12 @@ static inline void macscsi_write(struct Scsi_Host *instance, int reg, int value) #ifndef MODULE static int __init mac_scsi_setup(char *str) { - int ints[7]; + int ints[8]; (void)get_options(str, ARRAY_SIZE(ints), ints); - if (ints[0] < 1 || ints[0] > 6) { - pr_err("Usage: mac5380=<can_queue>[,<cmd_per_lun>[,<sg_tablesize>[,<hostid>[,<use_tags>[,<use_pdma>]]]]]\n"); + if (ints[0] < 1) { + pr_err("Usage: mac5380=<can_queue>[,<cmd_per_lun>[,<sg_tablesize>[,<hostid>[,<use_tags>[,<use_pdma>[,<toshiba_delay>]]]]]]\n"); return 0; } if (ints[0] >= 1) @@ -114,50 +101,14 @@ static int __init mac_scsi_setup(char *str) setup_use_tagged_queuing = ints[5]; if (ints[0] >= 6) setup_use_pdma = ints[6]; + if (ints[0] >= 7) + setup_toshiba_delay = ints[7]; return 1; } __setup("mac5380=", mac_scsi_setup); #endif /* !MODULE */ -#ifdef RESET_BOOT -/* - * Our 'bus reset on boot' function - */ - -static void mac_scsi_reset_boot(struct Scsi_Host *instance) -{ - unsigned long end; - - NCR5380_local_declare(); - NCR5380_setup(instance); - - /* - * Do a SCSI reset to clean up the bus during initialization. No messing - * with the queues, interrupts, or locks necessary here. - */ - - printk(KERN_INFO "Macintosh SCSI: resetting the SCSI bus..." ); - - /* get in phase */ - NCR5380_write( TARGET_COMMAND_REG, - PHASE_SR_TO_TCR( NCR5380_read(STATUS_REG) )); - - /* assert RST */ - NCR5380_write( INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_RST ); - /* The min. 
reset hold time is 25us, so 40us should be enough */ - udelay( 50 ); - /* reset RST and interrupt */ - NCR5380_write( INITIATOR_COMMAND_REG, ICR_BASE ); - NCR5380_read( RESET_PARITY_INTERRUPT_REG ); - - for( end = jiffies + AFTER_RESET_DELAY; time_before(jiffies, end); ) - barrier(); - - printk(KERN_INFO " done\n" ); -} -#endif - #ifdef PSEUDO_DMA /* Pseudo-DMA: (Ove Edlund) @@ -235,9 +186,6 @@ static int macscsi_pread(struct Scsi_Host *instance, unsigned char *d; unsigned char *s; - NCR5380_local_declare(); - NCR5380_setup(instance); - s = hostdata->pdma_base + (INPUT_DATA_REG << 4); d = dst; @@ -329,9 +277,6 @@ static int macscsi_pwrite(struct Scsi_Host *instance, unsigned char *s; unsigned char *d; - NCR5380_local_declare(); - NCR5380_setup(instance); - s = src; d = hostdata->pdma_base + (OUTPUT_DATA_REG << 4); @@ -364,20 +309,22 @@ static int macscsi_pwrite(struct Scsi_Host *instance, #define PFX DRV_MODULE_NAME ": " static struct scsi_host_template mac_scsi_template = { - .module = THIS_MODULE, - .proc_name = DRV_MODULE_NAME, - .show_info = macscsi_show_info, - .write_info = macscsi_write_info, - .name = "Macintosh NCR5380 SCSI", - .info = macscsi_info, - .queuecommand = macscsi_queue_command, - .eh_abort_handler = macscsi_abort, - .eh_bus_reset_handler = macscsi_bus_reset, - .can_queue = 16, - .this_id = 7, - .sg_tablesize = SG_ALL, - .cmd_per_lun = 2, - .use_clustering = DISABLE_CLUSTERING + .module = THIS_MODULE, + .proc_name = DRV_MODULE_NAME, + .show_info = macscsi_show_info, + .write_info = macscsi_write_info, + .name = "Macintosh NCR5380 SCSI", + .info = macscsi_info, + .queuecommand = macscsi_queue_command, + .eh_abort_handler = macscsi_abort, + .eh_bus_reset_handler = macscsi_bus_reset, + .can_queue = 16, + .this_id = 7, + .sg_tablesize = SG_ALL, + .cmd_per_lun = 2, + .use_clustering = DISABLE_CLUSTERING, + .cmd_size = NCR5380_CMD_SIZE, + .max_sectors = 128, }; static int __init mac_scsi_probe(struct platform_device *pdev) @@ -432,15 +379,14 @@ static int __init mac_scsi_probe(struct platform_device *pdev) } else host_flags |= FLAG_NO_PSEUDO_DMA; -#ifdef RESET_BOOT - mac_scsi_reset_boot(instance); -#endif - #ifdef SUPPORT_TAGS host_flags |= setup_use_tagged_queuing > 0 ? FLAG_TAGGED_QUEUING : 0; #endif + host_flags |= setup_toshiba_delay > 0 ? FLAG_TOSHIBA_DELAY : 0; - NCR5380_init(instance, host_flags); + error = NCR5380_init(instance, host_flags); + if (error) + goto fail_init; if (instance->irq != NO_IRQ) { error = request_irq(instance->irq, macscsi_intr, IRQF_SHARED, @@ -449,6 +395,8 @@ static int __init mac_scsi_probe(struct platform_device *pdev) goto fail_irq; } + NCR5380_maybe_reset_bus(instance); + error = scsi_add_host(instance, NULL); if (error) goto fail_host; @@ -463,6 +411,7 @@ fail_host: free_irq(instance->irq, instance); fail_irq: NCR5380_exit(instance); +fail_init: scsi_host_put(instance); return error; } diff --git a/drivers/scsi/megaraid/megaraid_mm.c b/drivers/scsi/megaraid/megaraid_mm.c index a70692779a16..4cf9ed96414f 100644 --- a/drivers/scsi/megaraid/megaraid_mm.c +++ b/drivers/scsi/megaraid/megaraid_mm.c @@ -179,8 +179,12 @@ mraid_mm_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) /* * The following call will block till a kioc is available + * or return NULL if the list head is empty for the pointer + * of type mraid_mmapt passed to mraid_mm_alloc_kioc */ kioc = mraid_mm_alloc_kioc(adp); + if (!kioc) + return -ENXIO; /* * User sent the old mimd_t ioctl packet. Convert it to uioc_t. 
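The megaraid_mm hunk just above exists because mraid_mm_alloc_kioc() normally blocks until a kioc frees up but can still return NULL when the adapter's free list head is empty. A reduced sketch of the fixed pattern (the names and the pool stand-in are invented):

#include <errno.h>
#include <stddef.h>
#include <stdio.h>

struct kioc { int opcode; };

/* stand-in for mraid_mm_alloc_kioc(): may return NULL */
static struct kioc *alloc_kioc(int list_empty)
{
	static struct kioc pool;

	return list_empty ? NULL : &pool;
}

static long example_ioctl(int list_empty)
{
	struct kioc *kioc = alloc_kioc(list_empty);

	if (!kioc)		/* the fix: report -ENXIO, don't dereference */
		return -ENXIO;
	kioc->opcode = 0;
	return 0;
}

int main(void)
{
	printf("%ld %ld\n", example_ioctl(0), example_ioctl(1));	/* 0 -6 */
	return 0;
}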
diff --git a/drivers/scsi/pas16.c b/drivers/scsi/pas16.c index e81eadd08afc..512037e27783 100644 --- a/drivers/scsi/pas16.c +++ b/drivers/scsi/pas16.c @@ -1,6 +1,4 @@ #define PSEUDO_DMA -#define UNSAFE /* Not unsafe for PAS16 -- use it */ -#define PDEBUG 0 /* * This driver adapted from Drew Eckhardt's Trantor T128 driver @@ -71,14 +69,10 @@ #include <linux/module.h> -#include <linux/signal.h> -#include <linux/proc_fs.h> #include <asm/io.h> #include <asm/dma.h> #include <linux/blkdev.h> -#include <linux/delay.h> #include <linux/interrupt.h> -#include <linux/stat.h> #include <linux/init.h> #include <scsi/scsi_host.h> @@ -87,8 +81,8 @@ #include "NCR5380.h" -static unsigned short pas16_addr = 0; -static int pas16_irq = 0; +static unsigned short pas16_addr; +static int pas16_irq; static const int scsi_irq_translate[] = @@ -146,22 +140,6 @@ static const unsigned short pas16_offset[ 8 ] = * START_DMA_INITIATOR_RECEIVE_REG wo */ }; -/*----------------------------------------------------------------*/ -/* the following will set the monitor border color (useful to find - where something crashed or gets stuck at */ -/* 1 = blue - 2 = green - 3 = cyan - 4 = red - 5 = magenta - 6 = yellow - 7 = white -*/ -#if 1 -#define rtrc(i) {inb(0x3da); outb(0x31, 0x3c0); outb((i), 0x3c0);} -#else -#define rtrc(i) {} -#endif /* @@ -205,7 +183,7 @@ static void __init outb( 0x01, io_port + P_TIMEOUT_STATUS_REG_OFFSET ); /* Reset TC */ outb( 0x01, io_port + WAIT_STATE ); /* 1 Wait state */ - NCR5380_read( RESET_PARITY_INTERRUPT_REG ); + inb(io_port + pas16_offset[RESET_PARITY_INTERRUPT_REG]); /* Set the SCSI interrupt pointer without mucking up the sound * interrupt pointer in the same byte. @@ -280,13 +258,13 @@ static int __init * put in an additional test to try to weed them out. */ - outb( 0x01, io_port + WAIT_STATE ); /* 1 Wait state */ - NCR5380_write( MODE_REG, 0x20 ); /* Is it really SCSI? */ - if( NCR5380_read( MODE_REG ) != 0x20 ) /* Write to a reg. */ - return 0; /* and try to read */ - NCR5380_write( MODE_REG, 0x00 ); /* it back. */ - if( NCR5380_read( MODE_REG ) != 0x00 ) - return 0; + outb(0x01, io_port + WAIT_STATE); /* 1 Wait state */ + outb(0x20, io_port + pas16_offset[MODE_REG]); /* Is it really SCSI? */ + if (inb(io_port + pas16_offset[MODE_REG]) != 0x20) /* Write to a reg. */ + return 0; /* and try to read */ + outb(0x00, io_port + pas16_offset[MODE_REG]); /* it back. 
*/ + if (inb(io_port + pas16_offset[MODE_REG]) != 0x00) + return 0; return 1; } @@ -305,7 +283,7 @@ static int __init static int __init pas16_setup(char *str) { - static int commandline_current = 0; + static int commandline_current; int i; int ints[10]; @@ -344,8 +322,8 @@ __setup("pas16=", pas16_setup); static int __init pas16_detect(struct scsi_host_template *tpnt) { - static int current_override = 0; - static unsigned short current_base = 0; + static int current_override; + static unsigned short current_base; struct Scsi_Host *instance; unsigned short io_port; int count; @@ -377,34 +355,32 @@ static int __init pas16_detect(struct scsi_host_template *tpnt) } else for (; !io_port && (current_base < NO_BASES); ++current_base) { -#if (PDEBUG & PDEBUG_INIT) - printk("scsi-pas16 : probing io_port %04x\n", (unsigned int) bases[current_base].io_port); -#endif + dprintk(NDEBUG_INIT, "pas16: probing io_port 0x%04x\n", + (unsigned int)bases[current_base].io_port); if ( !bases[current_base].noauto && pas16_hw_detect( current_base ) ){ io_port = bases[current_base].io_port; init_board( io_port, default_irqs[ current_base ], 0 ); -#if (PDEBUG & PDEBUG_INIT) - printk("scsi-pas16 : detected board.\n"); -#endif + dprintk(NDEBUG_INIT, "pas16: detected board\n"); } } - -#if defined(PDEBUG) && (PDEBUG & PDEBUG_INIT) - printk("scsi-pas16 : io_port = %04x\n", (unsigned int) io_port); -#endif + dprintk(NDEBUG_INIT, "pas16: io_port = 0x%04x\n", + (unsigned int)io_port); if (!io_port) break; instance = scsi_register (tpnt, sizeof(struct NCR5380_hostdata)); if(instance == NULL) - break; + goto out; instance->io_port = io_port; - NCR5380_init(instance, 0); + if (NCR5380_init(instance, 0)) + goto out_unregister; + + NCR5380_maybe_reset_bus(instance); if (overrides[current_override].irq != IRQ_AUTO) instance->irq = overrides[current_override].irq; @@ -431,14 +407,18 @@ static int __init pas16_detect(struct scsi_host_template *tpnt) outb( (inb(io_port + IO_CONFIG_3) & 0x0f), io_port + IO_CONFIG_3 ); } -#if defined(PDEBUG) && (PDEBUG & PDEBUG_INIT) - printk("scsi%d : irq = %d\n", instance->host_no, instance->irq); -#endif + dprintk(NDEBUG_INIT, "scsi%d : irq = %d\n", + instance->host_no, instance->irq); ++current_override; ++count; } return count; + +out_unregister: + scsi_unregister(instance); +out: + return count; } /* @@ -561,29 +541,29 @@ static int pas16_release(struct Scsi_Host *shost) if (shost->irq != NO_IRQ) free_irq(shost->irq, shost); NCR5380_exit(shost); - if (shost->io_port && shost->n_io_port) - release_region(shost->io_port, shost->n_io_port); scsi_unregister(shost); return 0; } static struct scsi_host_template driver_template = { - .name = "Pro Audio Spectrum-16 SCSI", - .detect = pas16_detect, - .release = pas16_release, - .proc_name = "pas16", - .show_info = pas16_show_info, - .write_info = pas16_write_info, - .info = pas16_info, - .queuecommand = pas16_queue_command, - .eh_abort_handler = pas16_abort, - .eh_bus_reset_handler = pas16_bus_reset, - .bios_param = pas16_biosparam, - .can_queue = CAN_QUEUE, - .this_id = 7, - .sg_tablesize = SG_ALL, - .cmd_per_lun = CMD_PER_LUN, - .use_clustering = DISABLE_CLUSTERING, + .name = "Pro Audio Spectrum-16 SCSI", + .detect = pas16_detect, + .release = pas16_release, + .proc_name = "pas16", + .show_info = pas16_show_info, + .write_info = pas16_write_info, + .info = pas16_info, + .queuecommand = pas16_queue_command, + .eh_abort_handler = pas16_abort, + .eh_bus_reset_handler = pas16_bus_reset, + .bios_param = pas16_biosparam, + .can_queue = 32, + .this_id = 7, + 
.sg_tablesize = SG_ALL, + .cmd_per_lun = 2, + .use_clustering = DISABLE_CLUSTERING, + .cmd_size = NCR5380_CMD_SIZE, + .max_sectors = 128, }; #include "scsi_module.c" diff --git a/drivers/scsi/pas16.h b/drivers/scsi/pas16.h index c6109c80050b..d37527717225 100644 --- a/drivers/scsi/pas16.h +++ b/drivers/scsi/pas16.h @@ -24,9 +24,6 @@ #ifndef PAS16_H #define PAS16_H -#define PDEBUG_INIT 0x1 -#define PDEBUG_TRANSFER 0x2 - #define PAS16_DEFAULT_BASE_1 0x388 #define PAS16_DEFAULT_BASE_2 0x384 #define PAS16_DEFAULT_BASE_3 0x38c @@ -98,46 +95,16 @@ #define OPERATION_MODE_1 0xec03 #define IO_CONFIG_3 0xf002 +#define NCR5380_implementation_fields /* none */ -#ifndef ASM - -#ifndef CMD_PER_LUN -#define CMD_PER_LUN 2 -#endif - -#ifndef CAN_QUEUE -#define CAN_QUEUE 32 -#endif - -#define NCR5380_implementation_fields \ - volatile unsigned short io_port - -#define NCR5380_local_declare() \ - volatile unsigned short io_port +#define PAS16_io_port(reg) (instance->io_port + pas16_offset[(reg)]) -#define NCR5380_setup(instance) \ - io_port = (instance)->io_port - -#define PAS16_io_port(reg) ( io_port + pas16_offset[(reg)] ) - -#if !(PDEBUG & PDEBUG_TRANSFER) #define NCR5380_read(reg) ( inb(PAS16_io_port(reg)) ) #define NCR5380_write(reg, value) ( outb((value),PAS16_io_port(reg)) ) -#else -#define NCR5380_read(reg) \ - (((unsigned char) printk("scsi%d : read register %d at io_port %04x\n"\ - , instance->hostno, (reg), PAS16_io_port(reg))), inb( PAS16_io_port(reg)) ) - -#define NCR5380_write(reg, value) \ - (printk("scsi%d : write %02x to register %d at io_port %04x\n", \ - instance->hostno, (value), (reg), PAS16_io_port(reg)), \ - outb( (value),PAS16_io_port(reg) ) ) - -#endif +#define NCR5380_dma_xfer_len(instance, cmd, phase) (cmd->transfersize) #define NCR5380_intr pas16_intr -#define do_NCR5380_intr do_pas16_intr #define NCR5380_queue_command pas16_queue_command #define NCR5380_abort pas16_abort #define NCR5380_bus_reset pas16_bus_reset @@ -150,5 +117,4 @@ #define PAS16_IRQS 0xd4a8 -#endif /* ndef ASM */ #endif /* PAS16_H */ diff --git a/drivers/scsi/scsi_devinfo.c b/drivers/scsi/scsi_devinfo.c index 2c1160c7ec92..47b9d13f97b8 100644 --- a/drivers/scsi/scsi_devinfo.c +++ b/drivers/scsi/scsi_devinfo.c @@ -227,6 +227,7 @@ static struct { {"Promise", "VTrak E610f", NULL, BLIST_SPARSELUN | BLIST_NO_RSOC}, {"Promise", "", NULL, BLIST_SPARSELUN}, {"QNAP", "iSCSI Storage", NULL, BLIST_MAX_1024}, + {"SYNOLOGY", "iSCSI Storage", NULL, BLIST_MAX_1024}, {"QUANTUM", "XP34301", "1071", BLIST_NOTQ}, {"REGAL", "CDC-4X", NULL, BLIST_MAX5LUN | BLIST_SINGLELUN}, {"SanDisk", "ImageMate CF-SD1", NULL, BLIST_FORCELUN}, diff --git a/drivers/scsi/storvsc_drv.c b/drivers/scsi/storvsc_drv.c index 41c115c230d9..55627d097873 100644 --- a/drivers/scsi/storvsc_drv.c +++ b/drivers/scsi/storvsc_drv.c @@ -390,7 +390,7 @@ module_param(storvsc_ringbuffer_size, int, S_IRUGO); MODULE_PARM_DESC(storvsc_ringbuffer_size, "Ring buffer size (bytes)"); module_param(storvsc_vcpus_per_sub_channel, int, S_IRUGO); -MODULE_PARM_DESC(vcpus_per_sub_channel, "Ratio of VCPUs to subchannels"); +MODULE_PARM_DESC(storvsc_vcpus_per_sub_channel, "Ratio of VCPUs to subchannels"); /* * Timeout in seconds for all devices managed by this driver. 
*/ diff --git a/drivers/scsi/sun3_scsi.c b/drivers/scsi/sun3_scsi.c index 22a42836d193..b9de487bbd31 100644 --- a/drivers/scsi/sun3_scsi.c +++ b/drivers/scsi/sun3_scsi.c @@ -53,13 +53,12 @@ #define NCR5380_queue_command sun3scsi_queue_command #define NCR5380_bus_reset sun3scsi_bus_reset #define NCR5380_abort sun3scsi_abort -#define NCR5380_show_info sun3scsi_show_info #define NCR5380_info sun3scsi_info #define NCR5380_dma_read_setup(instance, data, count) \ - sun3scsi_dma_setup(data, count, 0) + sun3scsi_dma_setup(instance, data, count, 0) #define NCR5380_dma_write_setup(instance, data, count) \ - sun3scsi_dma_setup(data, count, 1) + sun3scsi_dma_setup(instance, data, count, 1) #define NCR5380_dma_residual(instance) \ sun3scsi_dma_residual(instance) #define NCR5380_dma_xfer_len(instance, cmd, phase) \ @@ -86,10 +85,6 @@ module_param(setup_use_tagged_queuing, int, 0); static int setup_hostid = -1; module_param(setup_hostid, int, 0); -/* #define RESET_BOOT */ - -#define AFTER_RESET_DELAY (HZ/2) - /* ms to wait after hitting dma regs */ #define SUN3_DMA_DELAY 10 @@ -100,11 +95,10 @@ static struct scsi_cmnd *sun3_dma_setup_done; static unsigned char *sun3_scsi_regp; static volatile struct sun3_dma_regs *dregs; static struct sun3_udc_regs *udc_regs; -static unsigned char *sun3_dma_orig_addr = NULL; -static unsigned long sun3_dma_orig_count = 0; -static int sun3_dma_active = 0; -static unsigned long last_residual = 0; -static struct Scsi_Host *default_instance; +static unsigned char *sun3_dma_orig_addr; +static unsigned long sun3_dma_orig_count; +static int sun3_dma_active; +static unsigned long last_residual; /* * NCR 5380 register access functions @@ -144,50 +138,12 @@ static inline void sun3_udc_write(unsigned short val, unsigned char reg) } #endif -#ifdef RESET_BOOT -static void sun3_scsi_reset_boot(struct Scsi_Host *instance) -{ - unsigned long end; - - /* - * Do a SCSI reset to clean up the bus during initialization. No - * messing with the queues, interrupts, or locks necessary here. - */ - - printk( "Sun3 SCSI: resetting the SCSI bus..." ); - - /* switch off SCSI IRQ - catch an interrupt without IRQ bit set else */ -// sun3_disable_irq( IRQ_SUN3_SCSI ); - - /* get in phase */ - NCR5380_write( TARGET_COMMAND_REG, - PHASE_SR_TO_TCR( NCR5380_read(STATUS_REG) )); - - /* assert RST */ - NCR5380_write( INITIATOR_COMMAND_REG, ICR_BASE | ICR_ASSERT_RST ); - - /* The min. 
reset hold time is 25us, so 40us should be enough */ - udelay( 50 ); - - /* reset RST and interrupt */ - NCR5380_write( INITIATOR_COMMAND_REG, ICR_BASE ); - NCR5380_read( RESET_PARITY_INTERRUPT_REG ); - - for( end = jiffies + AFTER_RESET_DELAY; time_before(jiffies, end); ) - barrier(); - - /* switch on SCSI IRQ again */ -// sun3_enable_irq( IRQ_SUN3_SCSI ); - - printk( " done\n" ); -} -#endif - // safe bits for the CSR #define CSR_GOOD 0x060f -static irqreturn_t scsi_sun3_intr(int irq, void *dummy) +static irqreturn_t scsi_sun3_intr(int irq, void *dev) { + struct Scsi_Host *instance = dev; unsigned short csr = dregs->csr; int handled = 0; @@ -196,46 +152,24 @@ static irqreturn_t scsi_sun3_intr(int irq, void *dummy) #endif if(csr & ~CSR_GOOD) { - if(csr & CSR_DMA_BUSERR) { - printk("scsi%d: bus error in dma\n", default_instance->host_no); - } - - if(csr & CSR_DMA_CONFLICT) { - printk("scsi%d: dma conflict\n", default_instance->host_no); - } + if (csr & CSR_DMA_BUSERR) + shost_printk(KERN_ERR, instance, "bus error in DMA\n"); + if (csr & CSR_DMA_CONFLICT) + shost_printk(KERN_ERR, instance, "DMA conflict\n"); handled = 1; } if(csr & (CSR_SDB_INT | CSR_DMA_INT)) { - NCR5380_intr(irq, dummy); + NCR5380_intr(irq, dev); handled = 1; } return IRQ_RETVAL(handled); } -/* - * Debug stuff - to be called on NMI, or sysrq key. Use at your own risk; - * reentering NCR5380_print_status seems to have ugly side effects - */ - -/* this doesn't seem to get used at all -- sam */ -#if 0 -void sun3_sun3_debug (void) -{ - unsigned long flags; - - if (default_instance) { - local_irq_save(flags); - NCR5380_print_status(default_instance); - local_irq_restore(flags); - } -} -#endif - - /* sun3scsi_dma_setup() -- initialize the dma controller for a read/write */ -static unsigned long sun3scsi_dma_setup(void *data, unsigned long count, int write_flag) +static unsigned long sun3scsi_dma_setup(struct Scsi_Host *instance, + void *data, unsigned long count, int write_flag) { void *addr; @@ -287,10 +221,9 @@ static unsigned long sun3scsi_dma_setup(void *data, unsigned long count, int wri dregs->csr |= CSR_FIFO; if(dregs->fifo_count != count) { - printk("scsi%d: fifo_mismatch %04x not %04x\n", - default_instance->host_no, dregs->fifo_count, - (unsigned int) count); - NCR5380_dprint(NDEBUG_DMA, default_instance); + shost_printk(KERN_ERR, instance, "FIFO mismatch %04x not %04x\n", + dregs->fifo_count, (unsigned int) count); + NCR5380_dprint(NDEBUG_DMA, instance); } /* setup udc */ @@ -325,21 +258,6 @@ static unsigned long sun3scsi_dma_setup(void *data, unsigned long count, int wri } -#ifndef SUN3_SCSI_VME -static inline unsigned long sun3scsi_dma_count(struct Scsi_Host *instance) -{ - unsigned short resid; - - dregs->udc_addr = 0x32; - udelay(SUN3_DMA_DELAY); - resid = dregs->udc_data; - udelay(SUN3_DMA_DELAY); - resid *= 2; - - return (unsigned long) resid; -} -#endif - static inline unsigned long sun3scsi_dma_residual(struct Scsi_Host *instance) { return last_residual; @@ -437,7 +355,10 @@ static int sun3scsi_dma_finish(int write_flag) } } - count = sun3scsi_dma_count(default_instance); + dregs->udc_addr = 0x32; + udelay(SUN3_DMA_DELAY); + count = 2 * dregs->udc_data; + udelay(SUN3_DMA_DELAY); fifo = dregs->fifo_count; last_residual = fifo; @@ -502,17 +423,17 @@ static int sun3scsi_dma_finish(int write_flag) static struct scsi_host_template sun3_scsi_template = { .module = THIS_MODULE, .proc_name = DRV_MODULE_NAME, - .show_info = sun3scsi_show_info, .name = SUN3_SCSI_NAME, .info = sun3scsi_info, .queuecommand = 
sun3scsi_queue_command, - .eh_abort_handler = sun3scsi_abort, - .eh_bus_reset_handler = sun3scsi_bus_reset, + .eh_abort_handler = sun3scsi_abort, + .eh_bus_reset_handler = sun3scsi_bus_reset, .can_queue = 16, .this_id = 7, .sg_tablesize = SG_NONE, .cmd_per_lun = 2, - .use_clustering = DISABLE_CLUSTERING + .use_clustering = DISABLE_CLUSTERING, + .cmd_size = NCR5380_CMD_SIZE, }; static int __init sun3_scsi_probe(struct platform_device *pdev) @@ -591,7 +512,6 @@ static int __init sun3_scsi_probe(struct platform_device *pdev) error = -ENOMEM; goto fail_alloc; } - default_instance = instance; instance->io_port = (unsigned long)ioaddr; instance->irq = irq->start; @@ -600,7 +520,9 @@ static int __init sun3_scsi_probe(struct platform_device *pdev) host_flags |= setup_use_tagged_queuing > 0 ? FLAG_TAGGED_QUEUING : 0; #endif - NCR5380_init(instance, host_flags); + error = NCR5380_init(instance, host_flags); + if (error) + goto fail_init; error = request_irq(instance->irq, scsi_sun3_intr, 0, "NCR5380", instance); @@ -631,9 +553,7 @@ static int __init sun3_scsi_probe(struct platform_device *pdev) dregs->ivect = VME_DATA24 | (instance->irq & 0xff); #endif -#ifdef RESET_BOOT - sun3_scsi_reset_boot(instance); -#endif + NCR5380_maybe_reset_bus(instance); error = scsi_add_host(instance, NULL); if (error) @@ -649,6 +569,7 @@ fail_host: free_irq(instance->irq, instance); fail_irq: NCR5380_exit(instance); +fail_init: scsi_host_put(instance); fail_alloc: if (udc_regs) diff --git a/drivers/scsi/t128.c b/drivers/scsi/t128.c index 87828acbf7c6..4615fda60dbd 100644 --- a/drivers/scsi/t128.c +++ b/drivers/scsi/t128.c @@ -68,14 +68,11 @@ * 15 9-11 */ -#include <linux/signal.h> #include <linux/io.h> #include <linux/blkdev.h> #include <linux/interrupt.h> -#include <linux/stat.h> #include <linux/init.h> #include <linux/module.h> -#include <linux/delay.h> #include <scsi/scsi_host.h> #include "t128.h" @@ -126,7 +123,7 @@ static struct signature { static int __init t128_setup(char *str) { - static int commandline_current = 0; + static int commandline_current; int i; int ints[10]; @@ -165,7 +162,7 @@ __setup("t128=", t128_setup); static int __init t128_detect(struct scsi_host_template *tpnt) { - static int current_override = 0, current_base = 0; + static int current_override, current_base; struct Scsi_Host *instance; unsigned long base; void __iomem *p; @@ -182,9 +179,8 @@ static int __init t128_detect(struct scsi_host_template *tpnt) base = 0; } else for (; !base && (current_base < NO_BASES); ++current_base) { -#if (TDEBUG & TDEBUG_INIT) - printk("scsi-t128 : probing address %08x\n", bases[current_base].address); -#endif + dprintk(NDEBUG_INIT, "t128: probing address 0x%08x\n", + bases[current_base].address); if (bases[current_base].noauto) continue; p = ioremap(bases[current_base].address, 0x2000); @@ -195,17 +191,13 @@ static int __init t128_detect(struct scsi_host_template *tpnt) signatures[sig].string, strlen(signatures[sig].string))) { base = bases[current_base].address; -#if (TDEBUG & TDEBUG_INIT) - printk("scsi-t128 : detected board.\n"); -#endif + dprintk(NDEBUG_INIT, "t128: detected board\n"); goto found; } iounmap(p); } -#if defined(TDEBUG) && (TDEBUG & TDEBUG_INIT) - printk("scsi-t128 : base = %08x\n", (unsigned int) base); -#endif + dprintk(NDEBUG_INIT, "t128: base = 0x%08x\n", (unsigned int)base); if (!base) break; @@ -213,12 +205,15 @@ static int __init t128_detect(struct scsi_host_template *tpnt) found: instance = scsi_register (tpnt, sizeof(struct NCR5380_hostdata)); if(instance == NULL) - break; - + goto 
out_unmap; + instance->base = base; ((struct NCR5380_hostdata *)instance->hostdata)->base = p; - NCR5380_init(instance, 0); + if (NCR5380_init(instance, 0)) + goto out_unregister; + + NCR5380_maybe_reset_bus(instance); if (overrides[current_override].irq != IRQ_AUTO) instance->irq = overrides[current_override].irq; @@ -242,27 +237,30 @@ found: printk("scsi%d : please jumper the board for a free IRQ.\n", instance->host_no); } -#if defined(TDEBUG) && (TDEBUG & TDEBUG_INIT) - printk("scsi%d : irq = %d\n", instance->host_no, instance->irq); -#endif + dprintk(NDEBUG_INIT, "scsi%d: irq = %d\n", + instance->host_no, instance->irq); ++current_override; ++count; } return count; + +out_unregister: + scsi_unregister(instance); +out_unmap: + iounmap(p); + return count; } static int t128_release(struct Scsi_Host *shost) { - NCR5380_local_declare(); - NCR5380_setup(shost); + struct NCR5380_hostdata *hostdata = shost_priv(shost); + if (shost->irq != NO_IRQ) free_irq(shost->irq, shost); NCR5380_exit(shost); - if (shost->io_port && shost->n_io_port) - release_region(shost->io_port, shost->n_io_port); scsi_unregister(shost); - iounmap(base); + iounmap(hostdata->base); return 0; } @@ -308,14 +306,14 @@ static int t128_biosparam(struct scsi_device *sdev, struct block_device *bdev, * timeout. */ -static inline int NCR5380_pread (struct Scsi_Host *instance, unsigned char *dst, - int len) { - NCR5380_local_declare(); - void __iomem *reg; +static inline int +NCR5380_pread(struct Scsi_Host *instance, unsigned char *dst, int len) +{ + struct NCR5380_hostdata *hostdata = shost_priv(instance); + void __iomem *reg, *base = hostdata->base; unsigned char *d = dst; register int i = len; - NCR5380_setup(instance); reg = base + T_DATA_REG_OFFSET; #if 0 @@ -354,14 +352,14 @@ static inline int NCR5380_pread (struct Scsi_Host *instance, unsigned char *dst, * timeout. 
*/ -static inline int NCR5380_pwrite (struct Scsi_Host *instance, unsigned char *src, - int len) { - NCR5380_local_declare(); - void __iomem *reg; +static inline int +NCR5380_pwrite(struct Scsi_Host *instance, unsigned char *src, int len) +{ + struct NCR5380_hostdata *hostdata = shost_priv(instance); + void __iomem *reg, *base = hostdata->base; unsigned char *s = src; register int i = len; - NCR5380_setup(instance); reg = base + T_DATA_REG_OFFSET; #if 0 @@ -392,21 +390,23 @@ MODULE_LICENSE("GPL"); #include "NCR5380.c" static struct scsi_host_template driver_template = { - .name = "Trantor T128/T128F/T228", - .detect = t128_detect, - .release = t128_release, - .proc_name = "t128", - .show_info = t128_show_info, - .write_info = t128_write_info, - .info = t128_info, - .queuecommand = t128_queue_command, - .eh_abort_handler = t128_abort, - .eh_bus_reset_handler = t128_bus_reset, - .bios_param = t128_biosparam, - .can_queue = CAN_QUEUE, - .this_id = 7, - .sg_tablesize = SG_ALL, - .cmd_per_lun = CMD_PER_LUN, - .use_clustering = DISABLE_CLUSTERING, + .name = "Trantor T128/T128F/T228", + .detect = t128_detect, + .release = t128_release, + .proc_name = "t128", + .show_info = t128_show_info, + .write_info = t128_write_info, + .info = t128_info, + .queuecommand = t128_queue_command, + .eh_abort_handler = t128_abort, + .eh_bus_reset_handler = t128_bus_reset, + .bios_param = t128_biosparam, + .can_queue = 32, + .this_id = 7, + .sg_tablesize = SG_ALL, + .cmd_per_lun = 2, + .use_clustering = DISABLE_CLUSTERING, + .cmd_size = NCR5380_CMD_SIZE, + .max_sectors = 128, }; #include "scsi_module.c" diff --git a/drivers/scsi/t128.h b/drivers/scsi/t128.h index 2c7371454dfd..dd16d85497e1 100644 --- a/drivers/scsi/t128.h +++ b/drivers/scsi/t128.h @@ -23,10 +23,6 @@ #ifndef T128_H #define T128_H -#define TDEBUG 0 -#define TDEBUG_INIT 0x1 -#define TDEBUG_TRANSFER 0x2 - /* * The trantor boards are memory mapped. They use an NCR5380 or * equivalent (my sample board had part second sourced from ZILOG). 
@@ -71,44 +67,18 @@ #define T_DATA_REG_OFFSET 0x1e00 /* rw 512 bytes long */ -#ifndef ASM - -#ifndef CMD_PER_LUN -#define CMD_PER_LUN 2 -#endif - -#ifndef CAN_QUEUE -#define CAN_QUEUE 32 -#endif - #define NCR5380_implementation_fields \ void __iomem *base -#define NCR5380_local_declare() \ - void __iomem *base - -#define NCR5380_setup(instance) \ - base = ((struct NCR5380_hostdata *)(instance->hostdata))->base +#define T128_address(reg) \ + (((struct NCR5380_hostdata *)shost_priv(instance))->base + T_5380_OFFSET + ((reg) * 0x20)) -#define T128_address(reg) (base + T_5380_OFFSET + ((reg) * 0x20)) - -#if !(TDEBUG & TDEBUG_TRANSFER) #define NCR5380_read(reg) readb(T128_address(reg)) #define NCR5380_write(reg, value) writeb((value),(T128_address(reg))) -#else -#define NCR5380_read(reg) \ - (((unsigned char) printk("scsi%d : read register %d at address %08x\n"\ - , instance->hostno, (reg), T128_address(reg))), readb(T128_address(reg))) - -#define NCR5380_write(reg, value) { \ - printk("scsi%d : write %02x to register %d at address %08x\n", \ - instance->hostno, (value), (reg), T128_address(reg)); \ - writeb((value), (T128_address(reg))); \ -} -#endif + +#define NCR5380_dma_xfer_len(instance, cmd, phase) (cmd->transfersize) #define NCR5380_intr t128_intr -#define do_NCR5380_intr do_t128_intr #define NCR5380_queue_command t128_queue_command #define NCR5380_abort t128_abort #define NCR5380_bus_reset t128_bus_reset @@ -121,5 +91,4 @@ #define T128_IRQS 0xc4a8 -#endif /* ndef ASM */ #endif /* T128_H */ diff --git a/drivers/soc/Kconfig b/drivers/soc/Kconfig index fb2b3935b658..88260205a261 100644 --- a/drivers/soc/Kconfig +++ b/drivers/soc/Kconfig @@ -7,6 +7,7 @@ source "drivers/soc/mediatek/Kconfig" source "drivers/soc/qcom/Kconfig" source "drivers/soc/rockchip/Kconfig" source "drivers/soc/sunxi/Kconfig" +source "drivers/soc/tegra/Kconfig" source "drivers/soc/ti/Kconfig" source "drivers/soc/versatile/Kconfig" diff --git a/drivers/soc/qcom/spm.c b/drivers/soc/qcom/spm.c index 0ad66fa9bb1a..5548a31e1a39 100644 --- a/drivers/soc/qcom/spm.c +++ b/drivers/soc/qcom/spm.c @@ -288,7 +288,7 @@ static struct spm_driver_data *spm_get_drv(struct platform_device *pdev, struct spm_driver_data *drv = NULL; struct device_node *cpu_node, *saw_node; int cpu; - bool found; + bool found = 0; for_each_possible_cpu(cpu) { cpu_node = of_cpu_device_node_get(cpu); diff --git a/drivers/soc/tegra/Kconfig b/drivers/soc/tegra/Kconfig new file mode 100644 index 000000000000..d0c3c3e085e3 --- /dev/null +++ b/drivers/soc/tegra/Kconfig @@ -0,0 +1,83 @@ +if ARCH_TEGRA + +# 32-bit ARM SoCs +if ARM + +config ARCH_TEGRA_2x_SOC + bool "Enable support for Tegra20 family" + select ARCH_NEEDS_CPU_IDLE_COUPLED if SMP + select ARM_ERRATA_720789 + select ARM_ERRATA_754327 if SMP + select ARM_ERRATA_764369 if SMP + select PINCTRL_TEGRA20 + select PL310_ERRATA_727915 if CACHE_L2X0 + select PL310_ERRATA_769419 if CACHE_L2X0 + select TEGRA_TIMER + help + Support for NVIDIA Tegra AP20 and T20 processors, based on the + ARM CortexA9MP CPU and the ARM PL310 L2 cache controller + +config ARCH_TEGRA_3x_SOC + bool "Enable support for Tegra30 family" + select ARM_ERRATA_754322 + select ARM_ERRATA_764369 if SMP + select PINCTRL_TEGRA30 + select PL310_ERRATA_769419 if CACHE_L2X0 + select TEGRA_TIMER + help + Support for NVIDIA Tegra T30 processor family, based on the + ARM CortexA9MP CPU and the ARM PL310 L2 cache controller + +config ARCH_TEGRA_114_SOC + bool "Enable support for Tegra114 family" + select ARM_ERRATA_798181 if SMP + select 
ARM_L1_CACHE_SHIFT_6 + select HAVE_ARM_ARCH_TIMER + select PINCTRL_TEGRA114 + select TEGRA_TIMER + help + Support for NVIDIA Tegra T114 processor family, based on the + ARM CortexA15MP CPU + +config ARCH_TEGRA_124_SOC + bool "Enable support for Tegra124 family" + select ARM_L1_CACHE_SHIFT_6 + select HAVE_ARM_ARCH_TIMER + select PINCTRL_TEGRA124 + select TEGRA_TIMER + help + Support for NVIDIA Tegra T124 processor family, based on the + ARM CortexA15MP CPU + +endif + +# 64-bit ARM SoCs +if ARM64 + +config ARCH_TEGRA_132_SOC + bool "NVIDIA Tegra132 SoC" + select PINCTRL_TEGRA124 + help + Enable support for NVIDIA Tegra132 SoC, based on the Denver + ARMv8 CPU. The Tegra132 SoC is similar to the Tegra124 SoC, + but contains an NVIDIA Denver CPU complex in place of + Tegra124's "4+1" Cortex-A15 CPU complex. + +config ARCH_TEGRA_210_SOC + bool "NVIDIA Tegra210 SoC" + select PINCTRL_TEGRA210 + help + Enable support for the NVIDIA Tegra210 SoC. Also known as Tegra X1, + the Tegra210 has four Cortex-A57 cores paired with four Cortex-A53 + cores in a switched configuration. It features a GPU of the Maxwell + architecture with support for DX11, SM4, OpenGL 4.5, OpenGL ES 3.1 + and providing 256 CUDA cores. It supports hardware-accelerated en- + and decoding of various video standards including H.265, H.264 and + VP8 at 4K resolution and up to 60 fps. + + Besides the multimedia features it also comes with a variety of I/O + controllers, such as GPIO, I2C, SPI, SDHCI, PCIe, SATA and XHCI, to + name only a few. + +endif +endif diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h index d6273e143324..a80d993b882e 100644 --- a/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h +++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h @@ -151,16 +151,12 @@ do { \ #define LIBCFS_FREE(ptr, size) \ do { \ - int s = (size); \ if (unlikely((ptr) == NULL)) { \ CERROR("LIBCFS: free NULL '" #ptr "' (%d bytes) at " \ - "%s:%d\n", s, __FILE__, __LINE__); \ + "%s:%d\n", (int)(size), __FILE__, __LINE__); \ break; \ } \ - if (unlikely(s > LIBCFS_VMALLOC_SIZE)) \ - vfree(ptr); \ - else \ - kfree(ptr); \ + kvfree(ptr); \ } while (0) /******************************************************************************/ diff --git a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c index 72af486b65df..cb74ae731b95 100644 --- a/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c +++ b/drivers/staging/lustre/lnet/klnds/o2iblnd/o2iblnd.c @@ -2070,32 +2070,13 @@ static int kiblnd_net_init_pools(kib_net_t *net, __u32 *cpts, int ncpts) static int kiblnd_hdev_get_attr(kib_hca_dev_t *hdev) { - struct ib_device_attr *attr; - int rc; - /* It's safe to assume a HCA can handle a page size * matching that of the native system */ hdev->ibh_page_shift = PAGE_SHIFT; hdev->ibh_page_size = 1 << PAGE_SHIFT; hdev->ibh_page_mask = ~((__u64)hdev->ibh_page_size - 1); - LIBCFS_ALLOC(attr, sizeof(*attr)); - if (attr == NULL) { - CERROR("Out of memory\n"); - return -ENOMEM; - } - - rc = ib_query_device(hdev->ibh_ibdev, attr); - if (rc == 0) - hdev->ibh_mr_size = attr->max_mr_size; - - LIBCFS_FREE(attr, sizeof(*attr)); - - if (rc != 0) { - CERROR("Failed to query IB device: %d\n", rc); - return rc; - } - + hdev->ibh_mr_size = hdev->ibh_ibdev->attrs.max_mr_size; if (hdev->ibh_mr_size == ~0ULL) { hdev->ibh_mr_shift = 64; return 0; diff --git a/drivers/staging/lustre/lustre/llite/dir.c 
b/drivers/staging/lustre/lustre/llite/dir.c index 7b355319079c..8982f7d1b374 100644 --- a/drivers/staging/lustre/lustre/llite/dir.c +++ b/drivers/staging/lustre/lustre/llite/dir.c @@ -1858,7 +1858,7 @@ static loff_t ll_dir_seek(struct file *file, loff_t offset, int origin) int api32 = ll_need_32bit_api(sbi); loff_t ret = -EINVAL; - mutex_lock(&inode->i_mutex); + inode_lock(inode); switch (origin) { case SEEK_SET: break; @@ -1896,7 +1896,7 @@ static loff_t ll_dir_seek(struct file *file, loff_t offset, int origin) goto out; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/drivers/staging/lustre/lustre/llite/file.c b/drivers/staging/lustre/lustre/llite/file.c index c92d58b770ec..39e2ffd5f97f 100644 --- a/drivers/staging/lustre/lustre/llite/file.c +++ b/drivers/staging/lustre/lustre/llite/file.c @@ -2082,17 +2082,17 @@ putgl: /* update time if requested */ rc = 0; if (llss->ia2.ia_valid != 0) { - mutex_lock(&llss->inode1->i_mutex); + inode_lock(llss->inode1); rc = ll_setattr(file1->f_path.dentry, &llss->ia2); - mutex_unlock(&llss->inode1->i_mutex); + inode_unlock(llss->inode1); } if (llss->ia1.ia_valid != 0) { int rc1; - mutex_lock(&llss->inode2->i_mutex); + inode_lock(llss->inode2); rc1 = ll_setattr(file2->f_path.dentry, &llss->ia1); - mutex_unlock(&llss->inode2->i_mutex); + inode_unlock(llss->inode2); if (rc == 0) rc = rc1; } @@ -2179,13 +2179,13 @@ static int ll_hsm_import(struct inode *inode, struct file *file, ATTR_MTIME | ATTR_MTIME_SET | ATTR_ATIME | ATTR_ATIME_SET; - mutex_lock(&inode->i_mutex); + inode_lock(inode); rc = ll_setattr_raw(file->f_path.dentry, attr, true); if (rc == -ENODATA) rc = 0; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); kfree(attr); free_hss: @@ -2609,7 +2609,7 @@ int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync) ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_FSYNC, 1); rc = filemap_write_and_wait_range(inode->i_mapping, start, end); - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* catch async errors that were recorded back when async writeback * failed for pages in this mapping. 
*/ @@ -2641,7 +2641,7 @@ int ll_fsync(struct file *file, loff_t start, loff_t end, int datasync) fd->fd_write_failed = false; } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return rc; } diff --git a/drivers/staging/lustre/lustre/llite/llite_internal.h b/drivers/staging/lustre/lustre/llite/llite_internal.h index ee8a1d67d191..845e992ca5fc 100644 --- a/drivers/staging/lustre/lustre/llite/llite_internal.h +++ b/drivers/staging/lustre/lustre/llite/llite_internal.h @@ -631,8 +631,6 @@ struct ll_file_data { struct lov_stripe_md; -extern spinlock_t inode_lock; - extern struct dentry *llite_root; extern struct kset *llite_kset; diff --git a/drivers/staging/lustre/lustre/llite/llite_lib.c b/drivers/staging/lustre/lustre/llite/llite_lib.c index 1db93af62bad..b2fc5b3786ee 100644 --- a/drivers/staging/lustre/lustre/llite/llite_lib.c +++ b/drivers/staging/lustre/lustre/llite/llite_lib.c @@ -1277,7 +1277,7 @@ int ll_setattr_raw(struct dentry *dentry, struct iattr *attr, bool hsm_import) return -ENOMEM; if (!S_ISDIR(inode->i_mode)) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); memcpy(&op_data->op_attr, attr, sizeof(*attr)); @@ -1358,7 +1358,7 @@ out: ll_finish_md_op_data(op_data); if (!S_ISDIR(inode->i_mode)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); if ((attr->ia_valid & ATTR_SIZE) && !hsm_import) inode_dio_wait(inode); } diff --git a/drivers/staging/lustre/lustre/llite/llite_nfs.c b/drivers/staging/lustre/lustre/llite/llite_nfs.c index e578a1130ad1..18aab25f9cd9 100644 --- a/drivers/staging/lustre/lustre/llite/llite_nfs.c +++ b/drivers/staging/lustre/lustre/llite/llite_nfs.c @@ -245,9 +245,9 @@ static int ll_get_name(struct dentry *dentry, char *name, goto out; } - mutex_lock(&dir->i_mutex); + inode_lock(dir); rc = ll_dir_read(dir, &lgd.ctx); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); if (!rc && !lgd.lgd_found) rc = -ENOENT; out: diff --git a/drivers/staging/lustre/lustre/llite/lloop.c b/drivers/staging/lustre/lustre/llite/lloop.c index 420d39123877..871924b3f2e7 100644 --- a/drivers/staging/lustre/lustre/llite/lloop.c +++ b/drivers/staging/lustre/lustre/llite/lloop.c @@ -257,9 +257,9 @@ static int do_bio_lustrebacked(struct lloop_device *lo, struct bio *head) * be asked to write less pages once, this purely depends on * implementation. Anyway, we should be careful to avoid deadlocking. */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); bytes = ll_direct_rw_pages(env, io, rw, inode, pvec); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); cl_io_fini(env, io); return (bytes == pvec->ldp_size) ? 0 : (int)bytes; } diff --git a/drivers/staging/lustre/lustre/llite/rw.c b/drivers/staging/lustre/lustre/llite/rw.c index 95cdb0c58b04..f355474967d6 100644 --- a/drivers/staging/lustre/lustre/llite/rw.c +++ b/drivers/staging/lustre/lustre/llite/rw.c @@ -115,8 +115,8 @@ static struct ll_cl_context *ll_cl_init(struct file *file, struct inode *inode = vmpage->mapping->host; loff_t pos; - if (mutex_trylock(&inode->i_mutex)) { - mutex_unlock(&(inode)->i_mutex); + if (inode_trylock(inode)) { + inode_unlock((inode)); /* this is too bad. Someone is trying to write the * page w/o holding inode mutex. This means we can diff --git a/drivers/staging/lustre/lustre/llite/rw26.c b/drivers/staging/lustre/lustre/llite/rw26.c index 39fa13b74cbd..711fda93a58d 100644 --- a/drivers/staging/lustre/lustre/llite/rw26.c +++ b/drivers/staging/lustre/lustre/llite/rw26.c @@ -403,7 +403,7 @@ static ssize_t ll_direct_IO_26(struct kiocb *iocb, struct iov_iter *iter, * 1. 
Need inode mutex to operate transient pages. */ if (iov_iter_rw(iter) == READ) - mutex_lock(&inode->i_mutex); + inode_lock(inode); LASSERT(obj->cob_transient_pages == 0); while (iov_iter_count(iter)) { @@ -454,7 +454,7 @@ static ssize_t ll_direct_IO_26(struct kiocb *iocb, struct iov_iter *iter, out: LASSERT(obj->cob_transient_pages == 0); if (iov_iter_rw(iter) == READ) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (tot_bytes > 0) { if (iov_iter_rw(iter) == WRITE) { diff --git a/drivers/staging/lustre/lustre/llite/vvp_io.c b/drivers/staging/lustre/lustre/llite/vvp_io.c index f68e972886ca..0920ac6b3003 100644 --- a/drivers/staging/lustre/lustre/llite/vvp_io.c +++ b/drivers/staging/lustre/lustre/llite/vvp_io.c @@ -439,7 +439,7 @@ static int vvp_io_setattr_start(const struct lu_env *env, struct inode *inode = ccc_object_inode(io->ci_obj); int result = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (cl_io_is_trunc(io)) result = vvp_io_setattr_trunc(env, ios, inode, io->u.ci_setattr.sa_attr.lvb_size); @@ -459,7 +459,7 @@ static void vvp_io_setattr_end(const struct lu_env *env, * because osc has already notified to destroy osc_extents. */ vvp_do_vmtruncate(inode, io->u.ci_setattr.sa_attr.lvb_size); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } static void vvp_io_setattr_fini(const struct lu_env *env, diff --git a/drivers/staging/lustre/lustre/llite/vvp_page.c b/drivers/staging/lustre/lustre/llite/vvp_page.c index 99c0d7aee921..a133475a7c74 100644 --- a/drivers/staging/lustre/lustre/llite/vvp_page.c +++ b/drivers/staging/lustre/lustre/llite/vvp_page.c @@ -428,7 +428,7 @@ static void vvp_transient_page_verify(const struct cl_page *page) { struct inode *inode = ccc_object_inode(page->cp_obj); - LASSERT(!mutex_trylock(&inode->i_mutex)); + LASSERT(!inode_trylock(inode)); } static int vvp_transient_page_own(const struct lu_env *env, @@ -480,9 +480,9 @@ static int vvp_transient_page_is_vmlocked(const struct lu_env *env, struct inode *inode = ccc_object_inode(slice->cpl_obj); int locked; - locked = !mutex_trylock(&inode->i_mutex); + locked = !inode_trylock(inode); if (!locked) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return locked ? 
-EBUSY : -ENODATA; } @@ -502,7 +502,7 @@ static void vvp_transient_page_fini(const struct lu_env *env, struct ccc_object *clobj = cl2ccc(clp->cp_obj); vvp_page_fini_common(cp); - LASSERT(!mutex_trylock(&clobj->cob_inode->i_mutex)); + LASSERT(!inode_trylock(clobj->cob_inode)); clobj->cob_transient_pages--; } @@ -548,7 +548,7 @@ int vvp_page_init(const struct lu_env *env, struct cl_object *obj, } else { struct ccc_object *clobj = cl2ccc(obj); - LASSERT(!mutex_trylock(&clobj->cob_inode->i_mutex)); + LASSERT(!inode_trylock(clobj->cob_inode)); cl_page_slice_add(page, &cpg->cpg_cl, obj, &vvp_transient_page_ops); clobj->cob_transient_pages++; diff --git a/drivers/staging/rdma/amso1100/c2_cq.c b/drivers/staging/rdma/amso1100/c2_cq.c index 3ef881f2da0f..7ad0c082485a 100644 --- a/drivers/staging/rdma/amso1100/c2_cq.c +++ b/drivers/staging/rdma/amso1100/c2_cq.c @@ -173,9 +173,6 @@ static inline int c2_poll_one(struct c2_dev *c2dev, case C2_WR_TYPE_RDMA_READ: entry->opcode = IB_WC_RDMA_READ; break; - case C2_WR_TYPE_BIND_MW: - entry->opcode = IB_WC_BIND_MW; - break; case C2_WR_TYPE_RECV: entry->byte_len = be32_to_cpu(ce->bytes_rcvd); entry->opcode = IB_WC_RECV; diff --git a/drivers/staging/rdma/amso1100/c2_provider.c b/drivers/staging/rdma/amso1100/c2_provider.c index a092ac743c72..de8d10e1bde3 100644 --- a/drivers/staging/rdma/amso1100/c2_provider.c +++ b/drivers/staging/rdma/amso1100/c2_provider.c @@ -337,43 +337,21 @@ static inline u32 c2_convert_access(int acc) C2_ACF_LOCAL_READ | C2_ACF_WINDOW_BIND; } -static struct ib_mr *c2_reg_phys_mr(struct ib_pd *ib_pd, - struct ib_phys_buf *buffer_list, - int num_phys_buf, int acc, u64 * iova_start) +static struct ib_mr *c2_get_dma_mr(struct ib_pd *pd, int acc) { struct c2_mr *mr; u64 *page_list; - u32 total_len; - int err, i, j, k, page_shift, pbl_depth; + const u32 total_len = 0xffffffff; /* AMSO1100 limit */ + int err, page_shift, pbl_depth, i; + u64 kva = 0; - pbl_depth = 0; - total_len = 0; + pr_debug("%s:%u\n", __func__, __LINE__); - page_shift = PAGE_SHIFT; /* - * If there is only 1 buffer we assume this could - * be a map of all phy mem...use a 32k page_shift. + * This is a map of all phy mem...use a 32k page_shift. 
*/ - if (num_phys_buf == 1) - page_shift += 3; - - for (i = 0; i < num_phys_buf; i++) { - - if (offset_in_page(buffer_list[i].addr)) { - pr_debug("Unaligned Memory Buffer: 0x%x\n", - (unsigned int) buffer_list[i].addr); - return ERR_PTR(-EINVAL); - } - - if (!buffer_list[i].size) { - pr_debug("Invalid Buffer Size\n"); - return ERR_PTR(-EINVAL); - } - - total_len += buffer_list[i].size; - pbl_depth += ALIGN(buffer_list[i].size, - BIT(page_shift)) >> page_shift; - } + page_shift = PAGE_SHIFT + 3; + pbl_depth = ALIGN(total_len, BIT(page_shift)) >> page_shift; page_list = vmalloc(sizeof(u64) * pbl_depth); if (!page_list) { @@ -382,16 +360,8 @@ static struct ib_mr *c2_reg_phys_mr(struct ib_pd *ib_pd, return ERR_PTR(-ENOMEM); } - for (i = 0, j = 0; i < num_phys_buf; i++) { - - int naddrs; - - naddrs = ALIGN(buffer_list[i].size, - BIT(page_shift)) >> page_shift; - for (k = 0; k < naddrs; k++) - page_list[j++] = (buffer_list[i].addr + - (k << page_shift)); - } + for (i = 0; i < pbl_depth; i++) + page_list[i] = (i << page_shift); mr = kmalloc(sizeof(*mr), GFP_KERNEL); if (!mr) { @@ -399,17 +369,17 @@ static struct ib_mr *c2_reg_phys_mr(struct ib_pd *ib_pd, return ERR_PTR(-ENOMEM); } - mr->pd = to_c2pd(ib_pd); + mr->pd = to_c2pd(pd); mr->umem = NULL; pr_debug("%s - page shift %d, pbl_depth %d, total_len %u, " "*iova_start %llx, first pa %llx, last pa %llx\n", __func__, page_shift, pbl_depth, total_len, - (unsigned long long) *iova_start, + (unsigned long long) kva, (unsigned long long) page_list[0], (unsigned long long) page_list[pbl_depth-1]); - err = c2_nsmr_register_phys_kern(to_c2dev(ib_pd->device), page_list, + err = c2_nsmr_register_phys_kern(to_c2dev(pd->device), page_list, BIT(page_shift), pbl_depth, - total_len, 0, iova_start, + total_len, 0, &kva, c2_convert_access(acc), mr); vfree(page_list); if (err) { @@ -420,19 +390,6 @@ static struct ib_mr *c2_reg_phys_mr(struct ib_pd *ib_pd, return &mr->ibmr; } -static struct ib_mr *c2_get_dma_mr(struct ib_pd *pd, int acc) -{ - struct ib_phys_buf bl; - u64 kva = 0; - - pr_debug("%s:%u\n", __func__, __LINE__); - - /* AMSO1100 limit */ - bl.size = 0xffffffff; - bl.addr = 0; - return c2_reg_phys_mr(pd, &bl, 1, acc, &kva); -} - static struct ib_mr *c2_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt, int acc, struct ib_udata *udata) { @@ -840,7 +797,6 @@ int c2_register_device(struct c2_dev *dev) dev->ibdev.destroy_cq = c2_destroy_cq; dev->ibdev.poll_cq = c2_poll_cq; dev->ibdev.get_dma_mr = c2_get_dma_mr; - dev->ibdev.reg_phys_mr = c2_reg_phys_mr; dev->ibdev.reg_user_mr = c2_reg_user_mr; dev->ibdev.dereg_mr = c2_dereg_mr; dev->ibdev.get_port_immutable = c2_port_immutable; diff --git a/drivers/staging/rdma/ehca/ehca_classes.h b/drivers/staging/rdma/ehca/ehca_classes.h index bd45e0f3923f..e8c3387d7aaa 100644 --- a/drivers/staging/rdma/ehca/ehca_classes.h +++ b/drivers/staging/rdma/ehca/ehca_classes.h @@ -316,9 +316,8 @@ struct ehca_mr_pginfo { union { struct { /* type EHCA_MR_PGI_PHYS section */ - int num_phys_buf; - struct ib_phys_buf *phys_buf_array; - u64 next_buf; + u64 addr; + u16 size; } phy; struct { /* type EHCA_MR_PGI_USER section */ struct ib_umem *region; diff --git a/drivers/staging/rdma/ehca/ehca_iverbs.h b/drivers/staging/rdma/ehca/ehca_iverbs.h index 80e6a3d5df3e..cca5933fcda6 100644 --- a/drivers/staging/rdma/ehca/ehca_iverbs.h +++ b/drivers/staging/rdma/ehca/ehca_iverbs.h @@ -80,30 +80,14 @@ int ehca_destroy_ah(struct ib_ah *ah); struct ib_mr *ehca_get_dma_mr(struct ib_pd *pd, int mr_access_flags); -struct ib_mr 
*ehca_reg_phys_mr(struct ib_pd *pd, - struct ib_phys_buf *phys_buf_array, - int num_phys_buf, - int mr_access_flags, u64 *iova_start); - struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt, int mr_access_flags, struct ib_udata *udata); -int ehca_rereg_phys_mr(struct ib_mr *mr, - int mr_rereg_mask, - struct ib_pd *pd, - struct ib_phys_buf *phys_buf_array, - int num_phys_buf, int mr_access_flags, u64 *iova_start); - -int ehca_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr); - int ehca_dereg_mr(struct ib_mr *mr); struct ib_mw *ehca_alloc_mw(struct ib_pd *pd, enum ib_mw_type type); -int ehca_bind_mw(struct ib_qp *qp, struct ib_mw *mw, - struct ib_mw_bind *mw_bind); - int ehca_dealloc_mw(struct ib_mw *mw); struct ib_fmr *ehca_alloc_fmr(struct ib_pd *pd, diff --git a/drivers/staging/rdma/ehca/ehca_main.c b/drivers/staging/rdma/ehca/ehca_main.c index 860b974e9faa..832f22f40862 100644 --- a/drivers/staging/rdma/ehca/ehca_main.c +++ b/drivers/staging/rdma/ehca/ehca_main.c @@ -511,13 +511,9 @@ static int ehca_init_device(struct ehca_shca *shca) shca->ib_device.req_notify_cq = ehca_req_notify_cq; /* shca->ib_device.req_ncomp_notif = ehca_req_ncomp_notif; */ shca->ib_device.get_dma_mr = ehca_get_dma_mr; - shca->ib_device.reg_phys_mr = ehca_reg_phys_mr; shca->ib_device.reg_user_mr = ehca_reg_user_mr; - shca->ib_device.query_mr = ehca_query_mr; shca->ib_device.dereg_mr = ehca_dereg_mr; - shca->ib_device.rereg_phys_mr = ehca_rereg_phys_mr; shca->ib_device.alloc_mw = ehca_alloc_mw; - shca->ib_device.bind_mw = ehca_bind_mw; shca->ib_device.dealloc_mw = ehca_dealloc_mw; shca->ib_device.alloc_fmr = ehca_alloc_fmr; shca->ib_device.map_phys_fmr = ehca_map_phys_fmr; diff --git a/drivers/staging/rdma/ehca/ehca_mrmw.c b/drivers/staging/rdma/ehca/ehca_mrmw.c index 553e883a5718..3367205e3160 100644 --- a/drivers/staging/rdma/ehca/ehca_mrmw.c +++ b/drivers/staging/rdma/ehca/ehca_mrmw.c @@ -196,120 +196,6 @@ get_dma_mr_exit0: /*----------------------------------------------------------------------*/ -struct ib_mr *ehca_reg_phys_mr(struct ib_pd *pd, - struct ib_phys_buf *phys_buf_array, - int num_phys_buf, - int mr_access_flags, - u64 *iova_start) -{ - struct ib_mr *ib_mr; - int ret; - struct ehca_mr *e_mr; - struct ehca_shca *shca = - container_of(pd->device, struct ehca_shca, ib_device); - struct ehca_pd *e_pd = container_of(pd, struct ehca_pd, ib_pd); - - u64 size; - - if ((num_phys_buf <= 0) || !phys_buf_array) { - ehca_err(pd->device, "bad input values: num_phys_buf=%x " - "phys_buf_array=%p", num_phys_buf, phys_buf_array); - ib_mr = ERR_PTR(-EINVAL); - goto reg_phys_mr_exit0; - } - if (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) && - !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) || - ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) && - !(mr_access_flags & IB_ACCESS_LOCAL_WRITE))) { - /* - * Remote Write Access requires Local Write Access - * Remote Atomic Access requires Local Write Access - */ - ehca_err(pd->device, "bad input values: mr_access_flags=%x", - mr_access_flags); - ib_mr = ERR_PTR(-EINVAL); - goto reg_phys_mr_exit0; - } - - /* check physical buffer list and calculate size */ - ret = ehca_mr_chk_buf_and_calc_size(phys_buf_array, num_phys_buf, - iova_start, &size); - if (ret) { - ib_mr = ERR_PTR(ret); - goto reg_phys_mr_exit0; - } - if ((size == 0) || - (((u64)iova_start + size) < (u64)iova_start)) { - ehca_err(pd->device, "bad input values: size=%llx iova_start=%p", - size, iova_start); - ib_mr = ERR_PTR(-EINVAL); - goto reg_phys_mr_exit0; - } - - e_mr = ehca_mr_new(); 
- if (!e_mr) { - ehca_err(pd->device, "out of memory"); - ib_mr = ERR_PTR(-ENOMEM); - goto reg_phys_mr_exit0; - } - - /* register MR on HCA */ - if (ehca_mr_is_maxmr(size, iova_start)) { - e_mr->flags |= EHCA_MR_FLAG_MAXMR; - ret = ehca_reg_maxmr(shca, e_mr, iova_start, mr_access_flags, - e_pd, &e_mr->ib.ib_mr.lkey, - &e_mr->ib.ib_mr.rkey); - if (ret) { - ib_mr = ERR_PTR(ret); - goto reg_phys_mr_exit1; - } - } else { - struct ehca_mr_pginfo pginfo; - u32 num_kpages; - u32 num_hwpages; - u64 hw_pgsize; - - num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size, - PAGE_SIZE); - /* for kernel space we try most possible pgsize */ - hw_pgsize = ehca_get_max_hwpage_size(shca); - num_hwpages = NUM_CHUNKS(((u64)iova_start % hw_pgsize) + size, - hw_pgsize); - memset(&pginfo, 0, sizeof(pginfo)); - pginfo.type = EHCA_MR_PGI_PHYS; - pginfo.num_kpages = num_kpages; - pginfo.hwpage_size = hw_pgsize; - pginfo.num_hwpages = num_hwpages; - pginfo.u.phy.num_phys_buf = num_phys_buf; - pginfo.u.phy.phys_buf_array = phys_buf_array; - pginfo.next_hwpage = - ((u64)iova_start & ~PAGE_MASK) / hw_pgsize; - - ret = ehca_reg_mr(shca, e_mr, iova_start, size, mr_access_flags, - e_pd, &pginfo, &e_mr->ib.ib_mr.lkey, - &e_mr->ib.ib_mr.rkey, EHCA_REG_MR); - if (ret) { - ib_mr = ERR_PTR(ret); - goto reg_phys_mr_exit1; - } - } - - /* successful registration of all pages */ - return &e_mr->ib.ib_mr; - -reg_phys_mr_exit1: - ehca_mr_delete(e_mr); -reg_phys_mr_exit0: - if (IS_ERR(ib_mr)) - ehca_err(pd->device, "h_ret=%li pd=%p phys_buf_array=%p " - "num_phys_buf=%x mr_access_flags=%x iova_start=%p", - PTR_ERR(ib_mr), pd, phys_buf_array, - num_phys_buf, mr_access_flags, iova_start); - return ib_mr; -} /* end ehca_reg_phys_mr() */ - -/*----------------------------------------------------------------------*/ - struct ib_mr *ehca_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt, int mr_access_flags, struct ib_udata *udata) @@ -437,207 +323,6 @@ reg_user_mr_exit0: /*----------------------------------------------------------------------*/ -int ehca_rereg_phys_mr(struct ib_mr *mr, - int mr_rereg_mask, - struct ib_pd *pd, - struct ib_phys_buf *phys_buf_array, - int num_phys_buf, - int mr_access_flags, - u64 *iova_start) -{ - int ret; - - struct ehca_shca *shca = - container_of(mr->device, struct ehca_shca, ib_device); - struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr); - u64 new_size; - u64 *new_start; - u32 new_acl; - struct ehca_pd *new_pd; - u32 tmp_lkey, tmp_rkey; - unsigned long sl_flags; - u32 num_kpages = 0; - u32 num_hwpages = 0; - struct ehca_mr_pginfo pginfo; - - if (!(mr_rereg_mask & IB_MR_REREG_TRANS)) { - /* TODO not supported, because PHYP rereg hCall needs pages */ - ehca_err(mr->device, "rereg without IB_MR_REREG_TRANS not " - "supported yet, mr_rereg_mask=%x", mr_rereg_mask); - ret = -EINVAL; - goto rereg_phys_mr_exit0; - } - - if (mr_rereg_mask & IB_MR_REREG_PD) { - if (!pd) { - ehca_err(mr->device, "rereg with bad pd, pd=%p " - "mr_rereg_mask=%x", pd, mr_rereg_mask); - ret = -EINVAL; - goto rereg_phys_mr_exit0; - } - } - - if ((mr_rereg_mask & - ~(IB_MR_REREG_TRANS | IB_MR_REREG_PD | IB_MR_REREG_ACCESS)) || - (mr_rereg_mask == 0)) { - ret = -EINVAL; - goto rereg_phys_mr_exit0; - } - - /* check other parameters */ - if (e_mr == shca->maxmr) { - /* should be impossible, however reject to be sure */ - ehca_err(mr->device, "rereg internal max-MR impossible, mr=%p " - "shca->maxmr=%p mr->lkey=%x", - mr, shca->maxmr, mr->lkey); - ret = -EINVAL; - goto rereg_phys_mr_exit0; - } - if 
(mr_rereg_mask & IB_MR_REREG_TRANS) { /* transl., i.e. addr/size */ - if (e_mr->flags & EHCA_MR_FLAG_FMR) { - ehca_err(mr->device, "not supported for FMR, mr=%p " - "flags=%x", mr, e_mr->flags); - ret = -EINVAL; - goto rereg_phys_mr_exit0; - } - if (!phys_buf_array || num_phys_buf <= 0) { - ehca_err(mr->device, "bad input values mr_rereg_mask=%x" - " phys_buf_array=%p num_phys_buf=%x", - mr_rereg_mask, phys_buf_array, num_phys_buf); - ret = -EINVAL; - goto rereg_phys_mr_exit0; - } - } - if ((mr_rereg_mask & IB_MR_REREG_ACCESS) && /* change ACL */ - (((mr_access_flags & IB_ACCESS_REMOTE_WRITE) && - !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)) || - ((mr_access_flags & IB_ACCESS_REMOTE_ATOMIC) && - !(mr_access_flags & IB_ACCESS_LOCAL_WRITE)))) { - /* - * Remote Write Access requires Local Write Access - * Remote Atomic Access requires Local Write Access - */ - ehca_err(mr->device, "bad input values: mr_rereg_mask=%x " - "mr_access_flags=%x", mr_rereg_mask, mr_access_flags); - ret = -EINVAL; - goto rereg_phys_mr_exit0; - } - - /* set requested values dependent on rereg request */ - spin_lock_irqsave(&e_mr->mrlock, sl_flags); - new_start = e_mr->start; - new_size = e_mr->size; - new_acl = e_mr->acl; - new_pd = container_of(mr->pd, struct ehca_pd, ib_pd); - - if (mr_rereg_mask & IB_MR_REREG_TRANS) { - u64 hw_pgsize = ehca_get_max_hwpage_size(shca); - - new_start = iova_start; /* change address */ - /* check physical buffer list and calculate size */ - ret = ehca_mr_chk_buf_and_calc_size(phys_buf_array, - num_phys_buf, iova_start, - &new_size); - if (ret) - goto rereg_phys_mr_exit1; - if ((new_size == 0) || - (((u64)iova_start + new_size) < (u64)iova_start)) { - ehca_err(mr->device, "bad input values: new_size=%llx " - "iova_start=%p", new_size, iova_start); - ret = -EINVAL; - goto rereg_phys_mr_exit1; - } - num_kpages = NUM_CHUNKS(((u64)new_start % PAGE_SIZE) + - new_size, PAGE_SIZE); - num_hwpages = NUM_CHUNKS(((u64)new_start % hw_pgsize) + - new_size, hw_pgsize); - memset(&pginfo, 0, sizeof(pginfo)); - pginfo.type = EHCA_MR_PGI_PHYS; - pginfo.num_kpages = num_kpages; - pginfo.hwpage_size = hw_pgsize; - pginfo.num_hwpages = num_hwpages; - pginfo.u.phy.num_phys_buf = num_phys_buf; - pginfo.u.phy.phys_buf_array = phys_buf_array; - pginfo.next_hwpage = - ((u64)iova_start & ~PAGE_MASK) / hw_pgsize; - } - if (mr_rereg_mask & IB_MR_REREG_ACCESS) - new_acl = mr_access_flags; - if (mr_rereg_mask & IB_MR_REREG_PD) - new_pd = container_of(pd, struct ehca_pd, ib_pd); - - ret = ehca_rereg_mr(shca, e_mr, new_start, new_size, new_acl, - new_pd, &pginfo, &tmp_lkey, &tmp_rkey); - if (ret) - goto rereg_phys_mr_exit1; - - /* successful reregistration */ - if (mr_rereg_mask & IB_MR_REREG_PD) - mr->pd = pd; - mr->lkey = tmp_lkey; - mr->rkey = tmp_rkey; - -rereg_phys_mr_exit1: - spin_unlock_irqrestore(&e_mr->mrlock, sl_flags); -rereg_phys_mr_exit0: - if (ret) - ehca_err(mr->device, "ret=%i mr=%p mr_rereg_mask=%x pd=%p " - "phys_buf_array=%p num_phys_buf=%x mr_access_flags=%x " - "iova_start=%p", - ret, mr, mr_rereg_mask, pd, phys_buf_array, - num_phys_buf, mr_access_flags, iova_start); - return ret; -} /* end ehca_rereg_phys_mr() */ - -/*----------------------------------------------------------------------*/ - -int ehca_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr) -{ - int ret = 0; - u64 h_ret; - struct ehca_shca *shca = - container_of(mr->device, struct ehca_shca, ib_device); - struct ehca_mr *e_mr = container_of(mr, struct ehca_mr, ib.ib_mr); - unsigned long sl_flags; - struct ehca_mr_hipzout_parms 
hipzout; - - if ((e_mr->flags & EHCA_MR_FLAG_FMR)) { - ehca_err(mr->device, "not supported for FMR, mr=%p e_mr=%p " - "e_mr->flags=%x", mr, e_mr, e_mr->flags); - ret = -EINVAL; - goto query_mr_exit0; - } - - memset(mr_attr, 0, sizeof(struct ib_mr_attr)); - spin_lock_irqsave(&e_mr->mrlock, sl_flags); - - h_ret = hipz_h_query_mr(shca->ipz_hca_handle, e_mr, &hipzout); - if (h_ret != H_SUCCESS) { - ehca_err(mr->device, "hipz_mr_query failed, h_ret=%lli mr=%p " - "hca_hndl=%llx mr_hndl=%llx lkey=%x", - h_ret, mr, shca->ipz_hca_handle.handle, - e_mr->ipz_mr_handle.handle, mr->lkey); - ret = ehca2ib_return_code(h_ret); - goto query_mr_exit1; - } - mr_attr->pd = mr->pd; - mr_attr->device_virt_addr = hipzout.vaddr; - mr_attr->size = hipzout.len; - mr_attr->lkey = hipzout.lkey; - mr_attr->rkey = hipzout.rkey; - ehca_mrmw_reverse_map_acl(&hipzout.acl, &mr_attr->mr_access_flags); - -query_mr_exit1: - spin_unlock_irqrestore(&e_mr->mrlock, sl_flags); -query_mr_exit0: - if (ret) - ehca_err(mr->device, "ret=%i mr=%p mr_attr=%p", - ret, mr, mr_attr); - return ret; -} /* end ehca_query_mr() */ - -/*----------------------------------------------------------------------*/ - int ehca_dereg_mr(struct ib_mr *mr) { int ret = 0; @@ -728,18 +413,6 @@ alloc_mw_exit0: /*----------------------------------------------------------------------*/ -int ehca_bind_mw(struct ib_qp *qp, - struct ib_mw *mw, - struct ib_mw_bind *mw_bind) -{ - /* TODO: not supported up to now */ - ehca_gen_err("bind MW currently not supported by HCAD"); - - return -EPERM; -} /* end ehca_bind_mw() */ - -/*----------------------------------------------------------------------*/ - int ehca_dealloc_mw(struct ib_mw *mw) { u64 h_ret; @@ -1616,7 +1289,6 @@ int ehca_reg_internal_maxmr( u64 *iova_start; u64 size_maxmr; struct ehca_mr_pginfo pginfo; - struct ib_phys_buf ib_pbuf; u32 num_kpages; u32 num_hwpages; u64 hw_pgsize; @@ -1637,8 +1309,6 @@ int ehca_reg_internal_maxmr( /* register internal max-MR on HCA */ size_maxmr = ehca_mr_len; iova_start = (u64 *)ehca_map_vaddr((void *)(KERNELBASE + PHYSICAL_START)); - ib_pbuf.addr = 0; - ib_pbuf.size = size_maxmr; num_kpages = NUM_CHUNKS(((u64)iova_start % PAGE_SIZE) + size_maxmr, PAGE_SIZE); hw_pgsize = ehca_get_max_hwpage_size(shca); @@ -1650,8 +1320,8 @@ int ehca_reg_internal_maxmr( pginfo.num_kpages = num_kpages; pginfo.num_hwpages = num_hwpages; pginfo.hwpage_size = hw_pgsize; - pginfo.u.phy.num_phys_buf = 1; - pginfo.u.phy.phys_buf_array = &ib_pbuf; + pginfo.u.phy.addr = 0; + pginfo.u.phy.size = size_maxmr; ret = ehca_reg_mr(shca, e_mr, iova_start, size_maxmr, 0, e_pd, &pginfo, &e_mr->ib.ib_mr.lkey, @@ -1669,7 +1339,6 @@ int ehca_reg_internal_maxmr( e_mr->ib.ib_mr.pd = &e_pd->ib_pd; e_mr->ib.ib_mr.uobject = NULL; atomic_inc(&(e_pd->ib_pd.usecnt)); - atomic_set(&(e_mr->ib.ib_mr.usecnt), 0); *e_maxmr = e_mr; return 0; @@ -1762,61 +1431,6 @@ ehca_dereg_internal_maxmr_exit0: /*----------------------------------------------------------------------*/ -/* - * check physical buffer array of MR verbs for validness and - * calculates MR size - */ -int ehca_mr_chk_buf_and_calc_size(struct ib_phys_buf *phys_buf_array, - int num_phys_buf, - u64 *iova_start, - u64 *size) -{ - struct ib_phys_buf *pbuf = phys_buf_array; - u64 size_count = 0; - u32 i; - - if (num_phys_buf == 0) { - ehca_gen_err("bad phys buf array len, num_phys_buf=0"); - return -EINVAL; - } - /* check first buffer */ - if (((u64)iova_start & ~PAGE_MASK) != (pbuf->addr & ~PAGE_MASK)) { - ehca_gen_err("iova_start/addr mismatch, iova_start=%p " - 
"pbuf->addr=%llx pbuf->size=%llx", - iova_start, pbuf->addr, pbuf->size); - return -EINVAL; - } - if (((pbuf->addr + pbuf->size) % PAGE_SIZE) && - (num_phys_buf > 1)) { - ehca_gen_err("addr/size mismatch in 1st buf, pbuf->addr=%llx " - "pbuf->size=%llx", pbuf->addr, pbuf->size); - return -EINVAL; - } - - for (i = 0; i < num_phys_buf; i++) { - if ((i > 0) && (pbuf->addr % PAGE_SIZE)) { - ehca_gen_err("bad address, i=%x pbuf->addr=%llx " - "pbuf->size=%llx", - i, pbuf->addr, pbuf->size); - return -EINVAL; - } - if (((i > 0) && /* not 1st */ - (i < (num_phys_buf - 1)) && /* not last */ - (pbuf->size % PAGE_SIZE)) || (pbuf->size == 0)) { - ehca_gen_err("bad size, i=%x pbuf->size=%llx", - i, pbuf->size); - return -EINVAL; - } - size_count += pbuf->size; - pbuf++; - } - - *size = size_count; - return 0; -} /* end ehca_mr_chk_buf_and_calc_size() */ - -/*----------------------------------------------------------------------*/ - /* check page list of map FMR verb for validness */ int ehca_fmr_check_page_list(struct ehca_mr *e_fmr, u64 *page_list, @@ -2002,57 +1616,54 @@ static int ehca_set_pagebuf_phys(struct ehca_mr_pginfo *pginfo, u32 number, u64 *kpage) { int ret = 0; - struct ib_phys_buf *pbuf; + u64 addr = pginfo->u.phy.addr; + u64 size = pginfo->u.phy.size; u64 num_hw, offs_hw; u32 i = 0; - /* loop over desired phys_buf_array entries */ - while (i < number) { - pbuf = pginfo->u.phy.phys_buf_array + pginfo->u.phy.next_buf; - num_hw = NUM_CHUNKS((pbuf->addr % pginfo->hwpage_size) + - pbuf->size, pginfo->hwpage_size); - offs_hw = (pbuf->addr & ~(pginfo->hwpage_size - 1)) / - pginfo->hwpage_size; - while (pginfo->next_hwpage < offs_hw + num_hw) { - /* sanity check */ - if ((pginfo->kpage_cnt >= pginfo->num_kpages) || - (pginfo->hwpage_cnt >= pginfo->num_hwpages)) { - ehca_gen_err("kpage_cnt >= num_kpages, " - "kpage_cnt=%llx num_kpages=%llx " - "hwpage_cnt=%llx " - "num_hwpages=%llx i=%x", - pginfo->kpage_cnt, - pginfo->num_kpages, - pginfo->hwpage_cnt, - pginfo->num_hwpages, i); - return -EFAULT; - } - *kpage = (pbuf->addr & ~(pginfo->hwpage_size - 1)) + - (pginfo->next_hwpage * pginfo->hwpage_size); - if ( !(*kpage) && pbuf->addr ) { - ehca_gen_err("pbuf->addr=%llx pbuf->size=%llx " - "next_hwpage=%llx", pbuf->addr, - pbuf->size, pginfo->next_hwpage); - return -EFAULT; - } - (pginfo->hwpage_cnt)++; - (pginfo->next_hwpage)++; - if (PAGE_SIZE >= pginfo->hwpage_size) { - if (pginfo->next_hwpage % - (PAGE_SIZE / pginfo->hwpage_size) == 0) - (pginfo->kpage_cnt)++; - } else - pginfo->kpage_cnt += pginfo->hwpage_size / - PAGE_SIZE; - kpage++; - i++; - if (i >= number) break; + num_hw = NUM_CHUNKS((addr % pginfo->hwpage_size) + size, + pginfo->hwpage_size); + offs_hw = (addr & ~(pginfo->hwpage_size - 1)) / pginfo->hwpage_size; + + while (pginfo->next_hwpage < offs_hw + num_hw) { + /* sanity check */ + if ((pginfo->kpage_cnt >= pginfo->num_kpages) || + (pginfo->hwpage_cnt >= pginfo->num_hwpages)) { + ehca_gen_err("kpage_cnt >= num_kpages, " + "kpage_cnt=%llx num_kpages=%llx " + "hwpage_cnt=%llx " + "num_hwpages=%llx i=%x", + pginfo->kpage_cnt, + pginfo->num_kpages, + pginfo->hwpage_cnt, + pginfo->num_hwpages, i); + return -EFAULT; } - if (pginfo->next_hwpage >= offs_hw + num_hw) { - (pginfo->u.phy.next_buf)++; - pginfo->next_hwpage = 0; + *kpage = (addr & ~(pginfo->hwpage_size - 1)) + + (pginfo->next_hwpage * pginfo->hwpage_size); + if ( !(*kpage) && addr ) { + ehca_gen_err("addr=%llx size=%llx " + "next_hwpage=%llx", addr, + size, pginfo->next_hwpage); + return -EFAULT; } + (pginfo->hwpage_cnt)++; + 
(pginfo->next_hwpage)++; + if (PAGE_SIZE >= pginfo->hwpage_size) { + if (pginfo->next_hwpage % + (PAGE_SIZE / pginfo->hwpage_size) == 0) + (pginfo->kpage_cnt)++; + } else + pginfo->kpage_cnt += pginfo->hwpage_size / + PAGE_SIZE; + kpage++; + i++; + if (i >= number) break; } + if (pginfo->next_hwpage >= offs_hw + num_hw) { + pginfo->next_hwpage = 0; + } + return ret; } diff --git a/drivers/staging/rdma/ehca/ehca_mrmw.h b/drivers/staging/rdma/ehca/ehca_mrmw.h index 50d8b51306dd..52bfa95697f7 100644 --- a/drivers/staging/rdma/ehca/ehca_mrmw.h +++ b/drivers/staging/rdma/ehca/ehca_mrmw.h @@ -98,11 +98,6 @@ int ehca_reg_maxmr(struct ehca_shca *shca, int ehca_dereg_internal_maxmr(struct ehca_shca *shca); -int ehca_mr_chk_buf_and_calc_size(struct ib_phys_buf *phys_buf_array, - int num_phys_buf, - u64 *iova_start, - u64 *size); - int ehca_fmr_check_page_list(struct ehca_mr *e_fmr, u64 *page_list, int list_len); diff --git a/drivers/staging/rdma/ehca/ehca_reqs.c b/drivers/staging/rdma/ehca/ehca_reqs.c index 10e2074384f5..11813b880e16 100644 --- a/drivers/staging/rdma/ehca/ehca_reqs.c +++ b/drivers/staging/rdma/ehca/ehca_reqs.c @@ -614,7 +614,6 @@ int ehca_post_srq_recv(struct ib_srq *srq, static const u8 ib_wc_opcode[255] = { [0x01] = IB_WC_RECV+1, [0x02] = IB_WC_RECV_RDMA_WITH_IMM+1, - [0x04] = IB_WC_BIND_MW+1, [0x08] = IB_WC_FETCH_ADD+1, [0x10] = IB_WC_COMP_SWAP+1, [0x20] = IB_WC_RDMA_WRITE+1, diff --git a/drivers/staging/rdma/hfi1/mr.c b/drivers/staging/rdma/hfi1/mr.c index 568f185a022d..a3f8b884fdd6 100644 --- a/drivers/staging/rdma/hfi1/mr.c +++ b/drivers/staging/rdma/hfi1/mr.c @@ -167,10 +167,7 @@ static struct hfi1_mr *alloc_mr(int count, struct ib_pd *pd) rval = init_mregion(&mr->mr, pd, count); if (rval) goto bail; - /* - * ib_reg_phys_mr() will initialize mr->ibmr except for - * lkey and rkey. - */ + rval = hfi1_alloc_lkey(&mr->mr, 0); if (rval) goto bail_mregion; @@ -188,52 +185,6 @@ bail: } /** - * hfi1_reg_phys_mr - register a physical memory region - * @pd: protection domain for this memory region - * @buffer_list: pointer to the list of physical buffers to register - * @num_phys_buf: the number of physical buffers to register - * @iova_start: the starting address passed over IB which maps to this MR - * - * Returns the memory region on success, otherwise returns an errno. 
- */ -struct ib_mr *hfi1_reg_phys_mr(struct ib_pd *pd, - struct ib_phys_buf *buffer_list, - int num_phys_buf, int acc, u64 *iova_start) -{ - struct hfi1_mr *mr; - int n, m, i; - struct ib_mr *ret; - - mr = alloc_mr(num_phys_buf, pd); - if (IS_ERR(mr)) { - ret = (struct ib_mr *)mr; - goto bail; - } - - mr->mr.user_base = *iova_start; - mr->mr.iova = *iova_start; - mr->mr.access_flags = acc; - - m = 0; - n = 0; - for (i = 0; i < num_phys_buf; i++) { - mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr; - mr->mr.map[m]->segs[n].length = buffer_list[i].size; - mr->mr.length += buffer_list[i].size; - n++; - if (n == HFI1_SEGSZ) { - m++; - n = 0; - } - } - - ret = &mr->ibmr; - -bail: - return ret; -} - -/** * hfi1_reg_user_mr - register a userspace memory region * @pd: protection domain for this memory region * @start: starting userspace address diff --git a/drivers/staging/rdma/hfi1/verbs.c b/drivers/staging/rdma/hfi1/verbs.c index ef0feaa684a4..09b8d412ee90 100644 --- a/drivers/staging/rdma/hfi1/verbs.c +++ b/drivers/staging/rdma/hfi1/verbs.c @@ -2052,7 +2052,6 @@ int hfi1_register_ib_device(struct hfi1_devdata *dd) ibdev->poll_cq = hfi1_poll_cq; ibdev->req_notify_cq = hfi1_req_notify_cq; ibdev->get_dma_mr = hfi1_get_dma_mr; - ibdev->reg_phys_mr = hfi1_reg_phys_mr; ibdev->reg_user_mr = hfi1_reg_user_mr; ibdev->dereg_mr = hfi1_dereg_mr; ibdev->alloc_mr = hfi1_alloc_mr; diff --git a/drivers/staging/rdma/hfi1/verbs.h b/drivers/staging/rdma/hfi1/verbs.h index 72106e5362b9..286e468b0479 100644 --- a/drivers/staging/rdma/hfi1/verbs.h +++ b/drivers/staging/rdma/hfi1/verbs.h @@ -1024,10 +1024,6 @@ int hfi1_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); struct ib_mr *hfi1_get_dma_mr(struct ib_pd *pd, int acc); -struct ib_mr *hfi1_reg_phys_mr(struct ib_pd *pd, - struct ib_phys_buf *buffer_list, - int num_phys_buf, int acc, u64 *iova_start); - struct ib_mr *hfi1_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int mr_access_flags, struct ib_udata *udata); diff --git a/drivers/staging/rdma/ipath/ipath_fs.c b/drivers/staging/rdma/ipath/ipath_fs.c index 796af6867007..476fcdf05acb 100644 --- a/drivers/staging/rdma/ipath/ipath_fs.c +++ b/drivers/staging/rdma/ipath/ipath_fs.c @@ -82,14 +82,14 @@ static int create_file(const char *name, umode_t mode, { int error; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); *dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(*dentry)) error = ipathfs_mknod(d_inode(parent), *dentry, mode, fops, data); else error = PTR_ERR(*dentry); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); return error; } @@ -295,7 +295,7 @@ static int remove_device_files(struct super_block *sb, int ret; root = dget(sb->s_root); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); snprintf(unit, sizeof unit, "%02d", dd->ipath_unit); dir = lookup_one_len(unit, root, strlen(unit)); @@ -311,7 +311,7 @@ static int remove_device_files(struct super_block *sb, ret = simple_rmdir(d_inode(root), dir); bail: - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); dput(root); return ret; } diff --git a/drivers/staging/rdma/ipath/ipath_mr.c b/drivers/staging/rdma/ipath/ipath_mr.c index c7278f6a8217..b76b0ce66709 100644 --- a/drivers/staging/rdma/ipath/ipath_mr.c +++ b/drivers/staging/rdma/ipath/ipath_mr.c @@ -98,10 +98,6 @@ static struct ipath_mr *alloc_mr(int count, } mr->mr.mapsz = m; - /* - * ib_reg_phys_mr() will initialize mr->ibmr except for - * lkey and rkey. 
- */ if (!ipath_alloc_lkey(lk_table, &mr->mr)) goto bail; mr->ibmr.rkey = mr->ibmr.lkey = mr->mr.lkey; @@ -121,57 +117,6 @@ done: } /** - * ipath_reg_phys_mr - register a physical memory region - * @pd: protection domain for this memory region - * @buffer_list: pointer to the list of physical buffers to register - * @num_phys_buf: the number of physical buffers to register - * @iova_start: the starting address passed over IB which maps to this MR - * - * Returns the memory region on success, otherwise returns an errno. - */ -struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd, - struct ib_phys_buf *buffer_list, - int num_phys_buf, int acc, u64 *iova_start) -{ - struct ipath_mr *mr; - int n, m, i; - struct ib_mr *ret; - - mr = alloc_mr(num_phys_buf, &to_idev(pd->device)->lk_table); - if (mr == NULL) { - ret = ERR_PTR(-ENOMEM); - goto bail; - } - - mr->mr.pd = pd; - mr->mr.user_base = *iova_start; - mr->mr.iova = *iova_start; - mr->mr.length = 0; - mr->mr.offset = 0; - mr->mr.access_flags = acc; - mr->mr.max_segs = num_phys_buf; - mr->umem = NULL; - - m = 0; - n = 0; - for (i = 0; i < num_phys_buf; i++) { - mr->mr.map[m]->segs[n].vaddr = (void *) buffer_list[i].addr; - mr->mr.map[m]->segs[n].length = buffer_list[i].size; - mr->mr.length += buffer_list[i].size; - n++; - if (n == IPATH_SEGSZ) { - m++; - n = 0; - } - } - - ret = &mr->ibmr; - -bail: - return ret; -} - -/** * ipath_reg_user_mr - register a userspace memory region * @pd: protection domain for this memory region * @start: starting userspace address diff --git a/drivers/staging/rdma/ipath/ipath_verbs.c b/drivers/staging/rdma/ipath/ipath_verbs.c index 1778dee13f99..53f9dcab180d 100644 --- a/drivers/staging/rdma/ipath/ipath_verbs.c +++ b/drivers/staging/rdma/ipath/ipath_verbs.c @@ -2201,7 +2201,6 @@ int ipath_register_ib_device(struct ipath_devdata *dd) dev->poll_cq = ipath_poll_cq; dev->req_notify_cq = ipath_req_notify_cq; dev->get_dma_mr = ipath_get_dma_mr; - dev->reg_phys_mr = ipath_reg_phys_mr; dev->reg_user_mr = ipath_reg_user_mr; dev->dereg_mr = ipath_dereg_mr; dev->alloc_fmr = ipath_alloc_fmr; diff --git a/drivers/staging/rdma/ipath/ipath_verbs.h b/drivers/staging/rdma/ipath/ipath_verbs.h index 0a90a56870ab..6c70a89667a9 100644 --- a/drivers/staging/rdma/ipath/ipath_verbs.h +++ b/drivers/staging/rdma/ipath/ipath_verbs.h @@ -828,10 +828,6 @@ int ipath_resize_cq(struct ib_cq *ibcq, int cqe, struct ib_udata *udata); struct ib_mr *ipath_get_dma_mr(struct ib_pd *pd, int acc); -struct ib_mr *ipath_reg_phys_mr(struct ib_pd *pd, - struct ib_phys_buf *buffer_list, - int num_phys_buf, int acc, u64 *iova_start); - struct ib_mr *ipath_reg_user_mr(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, int mr_access_flags, struct ib_udata *udata); diff --git a/drivers/target/target_core_iblock.c b/drivers/target/target_core_iblock.c index e77d15000caa..5a2899f9f50b 100644 --- a/drivers/target/target_core_iblock.c +++ b/drivers/target/target_core_iblock.c @@ -615,9 +615,9 @@ iblock_alloc_bip(struct se_cmd *cmd, struct bio *bio) } bip = bio_integrity_alloc(bio, GFP_NOIO, cmd->t_prot_nents); - if (!bip) { + if (IS_ERR(bip)) { pr_err("Unable to allocate bio_integrity_payload\n"); - return -ENOMEM; + return PTR_ERR(bip); } bip->bip_iter.bi_size = (cmd->data_length / dev->dev_attrib.block_size) * diff --git a/drivers/thermal/int340x_thermal/processor_thermal_device.c b/drivers/thermal/int340x_thermal/processor_thermal_device.c index ccc0ad02d066..36fa724a36c8 100644 --- a/drivers/thermal/int340x_thermal/processor_thermal_device.c +++ 
b/drivers/thermal/int340x_thermal/processor_thermal_device.c @@ -33,6 +33,12 @@ /* Braswell thermal reporting device */ #define PCI_DEVICE_ID_PROC_BSW_THERMAL 0x22DC +/* Broxton thermal reporting device */ +#define PCI_DEVICE_ID_PROC_BXT0_THERMAL 0x0A8C +#define PCI_DEVICE_ID_PROC_BXT1_THERMAL 0x1A8C +#define PCI_DEVICE_ID_PROC_BXTX_THERMAL 0x4A8C +#define PCI_DEVICE_ID_PROC_BXTP_THERMAL 0x5A8C + struct power_config { u32 index; u32 min_uw; @@ -404,6 +410,10 @@ static const struct pci_device_id proc_thermal_pci_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_HSB_THERMAL)}, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_SKL_THERMAL)}, { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_BSW_THERMAL)}, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_BXT0_THERMAL)}, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_BXT1_THERMAL)}, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_BXTX_THERMAL)}, + { PCI_DEVICE(PCI_VENDOR_ID_INTEL, PCI_DEVICE_ID_PROC_BXTP_THERMAL)}, { 0, }, }; diff --git a/drivers/thermal/intel_pch_thermal.c b/drivers/thermal/intel_pch_thermal.c index 50c7da79be83..00d81af648b8 100644 --- a/drivers/thermal/intel_pch_thermal.c +++ b/drivers/thermal/intel_pch_thermal.c @@ -136,7 +136,7 @@ struct pch_dev_ops { /* dev ops for Wildcat Point */ -static struct pch_dev_ops pch_dev_ops_wpt = { +static const struct pch_dev_ops pch_dev_ops_wpt = { .hw_init = pch_wpt_init, .get_temp = pch_wpt_get_temp, }; diff --git a/drivers/thermal/rcar_thermal.c b/drivers/thermal/rcar_thermal.c index 13d01edc7a04..44b9c485157d 100644 --- a/drivers/thermal/rcar_thermal.c +++ b/drivers/thermal/rcar_thermal.c @@ -75,11 +75,11 @@ struct rcar_thermal_priv { #define rcar_has_irq_support(priv) ((priv)->common->base) #define rcar_id_to_shift(priv) ((priv)->id * 8) -#ifdef DEBUG -# define rcar_force_update_temp(priv) 1 -#else -# define rcar_force_update_temp(priv) 0 -#endif +static const struct of_device_id rcar_thermal_dt_ids[] = { + { .compatible = "renesas,rcar-thermal", }, + {}, +}; +MODULE_DEVICE_TABLE(of, rcar_thermal_dt_ids); /* * basic functions @@ -203,14 +203,26 @@ err_out_unlock: static int rcar_thermal_get_temp(struct thermal_zone_device *zone, int *temp) { struct rcar_thermal_priv *priv = rcar_zone_to_priv(zone); + int tmp; + int ret; - if (!rcar_has_irq_support(priv) || rcar_force_update_temp(priv)) - rcar_thermal_update_temp(priv); + ret = rcar_thermal_update_temp(priv); + if (ret < 0) + return ret; mutex_lock(&priv->lock); - *temp = MCELSIUS((priv->ctemp * 5) - 65); + tmp = MCELSIUS((priv->ctemp * 5) - 65); mutex_unlock(&priv->lock); + if ((tmp < MCELSIUS(-45)) || (tmp > MCELSIUS(125))) { + struct device *dev = rcar_priv_to_dev(priv); + + dev_err(dev, "it couldn't measure temperature correctly\n"); + return -EIO; + } + + *temp = tmp; + return 0; } @@ -288,6 +300,9 @@ static void _rcar_thermal_irq_ctrl(struct rcar_thermal_priv *priv, int enable) unsigned long flags; u32 mask = 0x3 << rcar_id_to_shift(priv); /* enable Rising/Falling */ + if (!rcar_has_irq_support(priv)) + return; + spin_lock_irqsave(&common->lock, flags); rcar_thermal_common_bset(common, INTMSK, mask, enable ? 
0 : mask); @@ -299,11 +314,15 @@ static void rcar_thermal_work(struct work_struct *work) { struct rcar_thermal_priv *priv; int cctemp, nctemp; + int ret; priv = container_of(work, struct rcar_thermal_priv, work.work); rcar_thermal_get_temp(priv->zone, &cctemp); - rcar_thermal_update_temp(priv); + ret = rcar_thermal_update_temp(priv); + if (ret < 0) + return; + rcar_thermal_irq_enable(priv); rcar_thermal_get_temp(priv->zone, &nctemp); @@ -368,8 +387,7 @@ static int rcar_thermal_remove(struct platform_device *pdev) struct rcar_thermal_priv *priv; rcar_thermal_for_each_priv(priv, common) { - if (rcar_has_irq_support(priv)) - rcar_thermal_irq_disable(priv); + rcar_thermal_irq_disable(priv); thermal_zone_device_unregister(priv->zone); } @@ -441,7 +459,9 @@ static int rcar_thermal_probe(struct platform_device *pdev) mutex_init(&priv->lock); INIT_LIST_HEAD(&priv->list); INIT_DELAYED_WORK(&priv->work, rcar_thermal_work); - rcar_thermal_update_temp(priv); + ret = rcar_thermal_update_temp(priv); + if (ret < 0) + goto error_unregister; priv->zone = thermal_zone_device_register("rcar_thermal", 1, 0, priv, @@ -453,8 +473,7 @@ static int rcar_thermal_probe(struct platform_device *pdev) goto error_unregister; } - if (rcar_has_irq_support(priv)) - rcar_thermal_irq_enable(priv); + rcar_thermal_irq_enable(priv); list_move_tail(&priv->list, &common->head); @@ -484,12 +503,6 @@ error_unregister: return ret; } -static const struct of_device_id rcar_thermal_dt_ids[] = { - { .compatible = "renesas,rcar-thermal", }, - {}, -}; -MODULE_DEVICE_TABLE(of, rcar_thermal_dt_ids); - static struct platform_driver rcar_thermal_driver = { .driver = { .name = "rcar_thermal", diff --git a/drivers/thermal/rockchip_thermal.c b/drivers/thermal/rockchip_thermal.c index e845841ab036..b58e3fb9b311 100644 --- a/drivers/thermal/rockchip_thermal.c +++ b/drivers/thermal/rockchip_thermal.c @@ -38,7 +38,7 @@ enum tshut_mode { }; /** - * the system Temperature Sensors tshut(tshut) polarity + * The system Temperature Sensors tshut (tshut) polarity * the bit 8 is tshut polarity. * 0: low active, 1: high active */ @@ -57,10 +57,10 @@ enum sensor_id { }; /** -* The conversion table has the adc value and temperature. -* ADC_DECREMENT is the adc value decremnet.(e.g. v2_code_table) -* ADC_INCREMNET is the adc value incremnet.(e.g. v3_code_table) -*/ +/** + * The conversion table has the adc value and temperature. + * ADC_DECREMENT: the adc value decreases as the temperature rises (e.g. v2_code_table) + * ADC_INCREMENT: the adc value increases as the temperature rises (e.g.
@@ -72,16 +72,17 @@ enum adc_sort_mode { */ #define SOC_MAX_SENSORS 2 +/** + * struct chip_tsadc_table: holds information about chip-specific differences + * @id: conversion table + * @length: size of conversion table + * @data_mask: mask to apply on data inputs + * @mode: sort mode of this adc variant (incrementing or decrementing) + */ struct chip_tsadc_table { const struct tsadc_table *id; - - /* the array table size*/ unsigned int length; - - /* that analogic mask data */ u32 data_mask; - - /* the sort mode is adc value that increment or decrement in table */ enum adc_sort_mode mode; }; @@ -153,6 +154,7 @@ struct rockchip_thermal_data { #define TSADCV2_SHUT_2GPIO_SRC_EN(chn) BIT(4 + (chn)) #define TSADCV2_SHUT_2CRU_SRC_EN(chn) BIT(8 + (chn)) +#define TSADCV1_INT_PD_CLEAR_MASK ~BIT(16) #define TSADCV2_INT_PD_CLEAR_MASK ~BIT(8) #define TSADCV2_DATA_MASK 0xfff @@ -168,6 +170,51 @@ struct tsadc_table { int temp; }; +/** + * Note: + * Code to Temperature mapping of the temperature sensor is a piecewise linear + * curve. Any temperature/code falling between two given entries can be + * linearly interpolated. + * Code to Temperature mapping should be updated based on silicon results. + */ +static const struct tsadc_table v1_code_table[] = { + {TSADCV3_DATA_MASK, -40000}, + {436, -40000}, + {431, -35000}, + {426, -30000}, + {421, -25000}, + {416, -20000}, + {411, -15000}, + {406, -10000}, + {401, -5000}, + {395, 0}, + {390, 5000}, + {385, 10000}, + {380, 15000}, + {375, 20000}, + {370, 25000}, + {364, 30000}, + {359, 35000}, + {354, 40000}, + {349, 45000}, + {343, 50000}, + {338, 55000}, + {333, 60000}, + {328, 65000}, + {322, 70000}, + {317, 75000}, + {312, 80000}, + {307, 85000}, + {301, 90000}, + {296, 95000}, + {291, 100000}, + {286, 105000}, + {280, 110000}, + {275, 115000}, + {270, 120000}, + {264, 125000}, +}; + static const struct tsadc_table v2_code_table[] = { {TSADCV2_DATA_MASK, -40000}, {3800, -40000}, @@ -245,6 +292,44 @@ static const struct tsadc_table v3_code_table[] = { {TSADCV3_DATA_MASK, 125000}, }; +static const struct tsadc_table v4_code_table[] = { + {TSADCV3_DATA_MASK, -40000}, + {431, -40000}, + {426, -35000}, + {421, -30000}, + {415, -25000}, + {410, -20000}, + {405, -15000}, + {399, -10000}, + {394, -5000}, + {389, 0}, + {383, 5000}, + {378, 10000}, + {373, 15000}, + {367, 20000}, + {362, 25000}, + {357, 30000}, + {351, 35000}, + {346, 40000}, + {340, 45000}, + {335, 50000}, + {330, 55000}, + {324, 60000}, + {319, 65000}, + {313, 70000}, + {308, 75000}, + {302, 80000}, + {297, 85000}, + {291, 90000}, + {286, 95000}, + {281, 100000}, + {275, 105000}, + {270, 110000}, + {264, 115000}, + {259, 120000}, + {253, 125000}, +}; + static u32 rk_tsadcv2_temp_to_code(struct chip_tsadc_table table, int temp) { @@ -368,6 +453,14 @@ static void rk_tsadcv2_initialize(void __iomem *regs, regs + TSADCV2_HIGHT_TSHUT_DEBOUNCE); } +static void rk_tsadcv1_irq_ack(void __iomem *regs) +{ + u32 val; + + val = readl_relaxed(regs + TSADCV2_INT_PD); + writel_relaxed(val & TSADCV1_INT_PD_CLEAR_MASK, regs + TSADCV2_INT_PD); +} + static void rk_tsadcv2_irq_ack(void __iomem *regs) { u32 val;
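The piecewise-linear note above can be made concrete with two adjacent v1_code_table rows. A stand-alone worked example (not the driver's helper; v1 codes fall as temperature rises):

#include <stdio.h>

int main(void)
{
	/* adjacent v1_code_table rows: {395, 0} and {390, 5000} */
	unsigned int hi_code = 395, lo_code = 390;
	int hi_temp = 0, lo_temp = 5000;	/* in m°C */
	unsigned int code = 393;		/* sampled ADC code */

	int temp = hi_temp + (int)(hi_code - code) * (lo_temp - hi_temp) /
			     (int)(hi_code - lo_code);

	printf("%d\n", temp);			/* 2000, i.e. 2 degC */
	return 0;
}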
@@ -429,6 +522,29 @@ static void rk_tsadcv2_tshut_mode(int chn, void __iomem *regs, writel_relaxed(val, regs + TSADCV2_INT_EN); } +static const struct rockchip_tsadc_chip rk3228_tsadc_data = { + .chn_id[SENSOR_CPU] = 0, /* cpu sensor is channel 0 */ + .chn_num = 1, /* one channel for tsadc */ + + .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO to PMIC */ + .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */ + .tshut_temp = 95000, + + .initialize = rk_tsadcv2_initialize, + .irq_ack = rk_tsadcv1_irq_ack, + .control = rk_tsadcv2_control, + .get_temp = rk_tsadcv2_get_temp, + .set_tshut_temp = rk_tsadcv2_tshut_temp, + .set_tshut_mode = rk_tsadcv2_tshut_mode, + + .table = { + .id = v1_code_table, + .length = ARRAY_SIZE(v1_code_table), + .data_mask = TSADCV3_DATA_MASK, + .mode = ADC_DECREMENT, + }, +}; + static const struct rockchip_tsadc_chip rk3288_tsadc_data = { .chn_id[SENSOR_CPU] = 1, /* cpu sensor is channel 1 */ .chn_id[SENSOR_GPU] = 2, /* gpu sensor is channel 2 */ @@ -477,8 +593,36 @@ static const struct rockchip_tsadc_chip rk3368_tsadc_data = { }, }; +static const struct rockchip_tsadc_chip rk3399_tsadc_data = { + .chn_id[SENSOR_CPU] = 0, /* cpu sensor is channel 0 */ + .chn_id[SENSOR_GPU] = 1, /* gpu sensor is channel 1 */ + .chn_num = 2, /* two channels for tsadc */ + + .tshut_mode = TSHUT_MODE_GPIO, /* default TSHUT via GPIO to PMIC */ + .tshut_polarity = TSHUT_LOW_ACTIVE, /* default TSHUT LOW ACTIVE */ + .tshut_temp = 95000, + + .initialize = rk_tsadcv2_initialize, + .irq_ack = rk_tsadcv1_irq_ack, + .control = rk_tsadcv2_control, + .get_temp = rk_tsadcv2_get_temp, + .set_tshut_temp = rk_tsadcv2_tshut_temp, + .set_tshut_mode = rk_tsadcv2_tshut_mode, + + .table = { + .id = v4_code_table, + .length = ARRAY_SIZE(v4_code_table), + .data_mask = TSADCV3_DATA_MASK, + .mode = ADC_DECREMENT, + }, +}; + static const struct of_device_id of_rockchip_thermal_match[] = { { + .compatible = "rockchip,rk3228-tsadc", + .data = (void *)&rk3228_tsadc_data, + }, + { .compatible = "rockchip,rk3288-tsadc", .data = (void *)&rk3288_tsadc_data, }, @@ -486,6 +630,10 @@ static const struct of_device_id of_rockchip_thermal_match[] = { .compatible = "rockchip,rk3368-tsadc", .data = (void *)&rk3368_tsadc_data, }, + { + .compatible = "rockchip,rk3399-tsadc", + .data = (void *)&rk3399_tsadc_data, + }, { /* end */ }, }; MODULE_DEVICE_TABLE(of, of_rockchip_thermal_match); @@ -617,7 +765,7 @@ rockchip_thermal_register_sensor(struct platform_device *pdev, return 0; } -/* +/** * Reset TSADC Controller, reset all tsadc registers. */ static void rockchip_thermal_reset_controller(struct reset_control *reset) diff --git a/drivers/thermal/step_wise.c b/drivers/thermal/step_wise.c index 2f9f7086ac3d..ea9366ad3e6b 100644 --- a/drivers/thermal/step_wise.c +++ b/drivers/thermal/step_wise.c @@ -63,6 +63,19 @@ static unsigned long get_target_state(struct thermal_instance *instance, next_target = instance->target; dev_dbg(&cdev->device, "cur_state=%ld\n", cur_state); + if (!instance->initialized) { + if (throttle) { + next_target = (cur_state + 1) >= instance->upper ? + instance->upper : + ((cur_state + 1) < instance->lower ? + instance->lower : (cur_state + 1)); + } else { + next_target = THERMAL_NO_TARGET; + } + + return next_target; + } + switch (trend) { case THERMAL_TREND_RAISING: if (throttle) {
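The nested conditional in the new !instance->initialized branch is just cur_state + 1 clamped to the instance's [lower, upper] window. An equivalent stand-alone form (hypothetical function name; THERMAL_NO_TARGET represented here by -1):

static unsigned long first_target(unsigned long cur_state,
				  unsigned long lower, unsigned long upper,
				  int throttle)
{
	unsigned long next = cur_state + 1;

	if (!throttle)
		return (unsigned long)-1;	/* THERMAL_NO_TARGET */

	if (next > upper)	/* same result as the ">= upper" arm above */
		next = upper;
	if (next < lower)
		next = lower;
	return next;
}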
@@ -149,7 +162,7 @@ static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip) dev_dbg(&instance->cdev->device, "old_target=%d, target=%d\n", old_target, (int)instance->target); - if (old_target == instance->target) + if (instance->initialized && old_target == instance->target) continue; /* Activate a passive thermal instance */ @@ -161,7 +174,7 @@ static void thermal_zone_trip_update(struct thermal_zone_device *tz, int trip) instance->target == THERMAL_NO_TARGET) update_passive_instance(tz, trip_type, -1); - + instance->initialized = true; instance->cdev->updated = false; /* cdev needs update */ } diff --git a/drivers/thermal/thermal_core.c b/drivers/thermal/thermal_core.c index d9e525cc9c1c..a0a8fd1235e2 100644 --- a/drivers/thermal/thermal_core.c +++ b/drivers/thermal/thermal_core.c @@ -37,6 +37,7 @@ #include <linux/of.h> #include <net/netlink.h> #include <net/genetlink.h> +#include <linux/suspend.h> #define CREATE_TRACE_POINTS #include <trace/events/thermal.h> @@ -59,6 +60,8 @@ static LIST_HEAD(thermal_governor_list); static DEFINE_MUTEX(thermal_list_lock); static DEFINE_MUTEX(thermal_governor_lock); +static atomic_t in_suspend; + static struct thermal_governor *def_governor; static struct thermal_governor *__find_governor(const char *name) @@ -532,14 +535,31 @@ static void update_temperature(struct thermal_zone_device *tz) mutex_unlock(&tz->lock); trace_thermal_temperature(tz); - dev_dbg(&tz->device, "last_temperature=%d, current_temperature=%d\n", - tz->last_temperature, tz->temperature); + if (tz->last_temperature == THERMAL_TEMP_INVALID) + dev_dbg(&tz->device, "last_temperature N/A, current_temperature=%d\n", + tz->temperature); + else + dev_dbg(&tz->device, "last_temperature=%d, current_temperature=%d\n", + tz->last_temperature, tz->temperature); +} + +static void thermal_zone_device_reset(struct thermal_zone_device *tz) +{ + struct thermal_instance *pos; + + tz->temperature = THERMAL_TEMP_INVALID; + tz->passive = 0; + list_for_each_entry(pos, &tz->thermal_instances, tz_node) + pos->initialized = false; } void thermal_zone_device_update(struct thermal_zone_device *tz) { int count; + if (atomic_read(&in_suspend)) + return; + if (!tz->ops->get_temp) return; @@ -676,8 +696,12 @@ trip_point_temp_store(struct device *dev, struct device_attribute *attr, return -EINVAL; ret = tz->ops->set_trip_temp(tz, trip, temperature); + if (ret) + return ret; - return ret ? 
ret : count; + thermal_zone_device_update(tz); + + return count; } static ssize_t @@ -1321,6 +1345,7 @@ int thermal_zone_bind_cooling_device(struct thermal_zone_device *tz, if (!result) { list_add_tail(&dev->tz_node, &tz->thermal_instances); list_add_tail(&dev->cdev_node, &cdev->thermal_instances); + atomic_set(&tz->need_update, 1); } mutex_unlock(&cdev->lock); mutex_unlock(&tz->lock); @@ -1430,6 +1455,7 @@ __thermal_cooling_device_register(struct device_node *np, const struct thermal_cooling_device_ops *ops) { struct thermal_cooling_device *cdev; + struct thermal_zone_device *pos = NULL; int result; if (type && strlen(type) >= THERMAL_NAME_LENGTH) @@ -1474,6 +1500,12 @@ __thermal_cooling_device_register(struct device_node *np, /* Update binding information for 'this' new cdev */ bind_cdev(cdev); + mutex_lock(&thermal_list_lock); + list_for_each_entry(pos, &thermal_tz_list, node) + if (atomic_cmpxchg(&pos->need_update, 1, 0)) + thermal_zone_device_update(pos); + mutex_unlock(&thermal_list_lock); + return cdev; } @@ -1806,6 +1838,8 @@ struct thermal_zone_device *thermal_zone_device_register(const char *type, tz->trips = trips; tz->passive_delay = passive_delay; tz->polling_delay = polling_delay; + /* A new thermal zone needs to be updated anyway. */ + atomic_set(&tz->need_update, 1); dev_set_name(&tz->device, "thermal_zone%d", tz->id); result = device_register(&tz->device); @@ -1900,7 +1934,10 @@ struct thermal_zone_device *thermal_zone_device_register(const char *type, INIT_DELAYED_WORK(&(tz->poll_queue), thermal_zone_device_check); - thermal_zone_device_update(tz); + thermal_zone_device_reset(tz); + /* Update the new thermal zone and mark it as already updated. */ + if (atomic_cmpxchg(&tz->need_update, 1, 0)) + thermal_zone_device_update(tz); return tz; @@ -2140,6 +2177,36 @@ static void thermal_unregister_governors(void) thermal_gov_power_allocator_unregister(); } +static int thermal_pm_notify(struct notifier_block *nb, + unsigned long mode, void *_unused) +{ + struct thermal_zone_device *tz; + + switch (mode) { + case PM_HIBERNATION_PREPARE: + case PM_RESTORE_PREPARE: + case PM_SUSPEND_PREPARE: + atomic_set(&in_suspend, 1); + break; + case PM_POST_HIBERNATION: + case PM_POST_RESTORE: + case PM_POST_SUSPEND: + atomic_set(&in_suspend, 0); + list_for_each_entry(tz, &thermal_tz_list, node) { + thermal_zone_device_reset(tz); + thermal_zone_device_update(tz); + } + break; + default: + break; + } + return 0; +} + +static struct notifier_block thermal_pm_nb = { + .notifier_call = thermal_pm_notify, +}; + static int __init thermal_init(void) { int result; @@ -2160,6 +2227,11 @@ static int __init thermal_init(void) if (result) goto exit_netlink; + result = register_pm_notifier(&thermal_pm_nb); + if (result) + pr_warn("Thermal: Can not register suspend notifier, return %d\n", + result); + return 0; exit_netlink: @@ -2179,6 +2251,7 @@ error: static void __exit thermal_exit(void) { + unregister_pm_notifier(&thermal_pm_nb); of_thermal_destroy_zones(); genetlink_exit(); class_unregister(&thermal_class); diff --git a/drivers/thermal/thermal_core.h b/drivers/thermal/thermal_core.h index d7ac1fccd659..749d41abfbab 100644 --- a/drivers/thermal/thermal_core.h +++ b/drivers/thermal/thermal_core.h @@ -41,6 +41,7 @@ struct thermal_instance { struct thermal_zone_device *tz; struct thermal_cooling_device *cdev; int trip; + bool initialized; unsigned long upper; /* Highest cooling state for this trip point */ unsigned long lower; /* Lowest cooling state for this trip point */ unsigned long target; /* expected 
cooling state */ diff --git a/drivers/usb/gadget/function/f_printer.c b/drivers/usb/gadget/function/f_printer.c index 0fbfb2b2aa08..26ccad5d8680 100644 --- a/drivers/usb/gadget/function/f_printer.c +++ b/drivers/usb/gadget/function/f_printer.c @@ -673,7 +673,7 @@ printer_fsync(struct file *fd, loff_t start, loff_t end, int datasync) unsigned long flags; int tx_list_empty; - mutex_lock(&inode->i_mutex); + inode_lock(inode); spin_lock_irqsave(&dev->lock, flags); tx_list_empty = (likely(list_empty(&dev->tx_reqs))); spin_unlock_irqrestore(&dev->lock, flags); @@ -683,7 +683,7 @@ printer_fsync(struct file *fd, loff_t start, loff_t end, int datasync) wait_event_interruptible(dev->tx_flush_wait, (likely(list_empty(&dev->tx_reqs_active)))); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return 0; } diff --git a/drivers/usb/gadget/legacy/inode.c b/drivers/usb/gadget/legacy/inode.c index 365afd7e14f8..7e179f81d05c 100644 --- a/drivers/usb/gadget/legacy/inode.c +++ b/drivers/usb/gadget/legacy/inode.c @@ -1521,10 +1521,10 @@ static void destroy_ep_files (struct dev_data *dev) spin_unlock_irq (&dev->lock); /* break link to dcache */ - mutex_lock (&parent->i_mutex); + inode_lock(parent); d_delete (dentry); dput (dentry); - mutex_unlock (&parent->i_mutex); + inode_unlock(parent); spin_lock_irq (&dev->lock); } diff --git a/drivers/usb/gadget/udc/atmel_usba_udc.c b/drivers/usb/gadget/udc/atmel_usba_udc.c index f92f5aff0dd5..8755b2c2aada 100644 --- a/drivers/usb/gadget/udc/atmel_usba_udc.c +++ b/drivers/usb/gadget/udc/atmel_usba_udc.c @@ -91,7 +91,7 @@ static ssize_t queue_dbg_read(struct file *file, char __user *buf, if (!access_ok(VERIFY_WRITE, buf, nbytes)) return -EFAULT; - mutex_lock(&file_inode(file)->i_mutex); + inode_lock(file_inode(file)); list_for_each_entry_safe(req, tmp_req, queue, queue) { len = snprintf(tmpbuf, sizeof(tmpbuf), "%8p %08x %c%c%c %5d %c%c%c\n", @@ -118,7 +118,7 @@ static ssize_t queue_dbg_read(struct file *file, char __user *buf, nbytes -= len; buf += len; } - mutex_unlock(&file_inode(file)->i_mutex); + inode_unlock(file_inode(file)); return actual; } @@ -143,7 +143,7 @@ static int regs_dbg_open(struct inode *inode, struct file *file) u32 *data; int ret = -ENOMEM; - mutex_lock(&inode->i_mutex); + inode_lock(inode); udc = inode->i_private; data = kmalloc(inode->i_size, GFP_KERNEL); if (!data) @@ -158,7 +158,7 @@ static int regs_dbg_open(struct inode *inode, struct file *file) ret = 0; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -169,11 +169,11 @@ static ssize_t regs_dbg_read(struct file *file, char __user *buf, struct inode *inode = file_inode(file); int ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = simple_read_from_buffer(buf, nbytes, ppos, file->private_data, file_inode(file)->i_size); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/drivers/usb/host/Kconfig b/drivers/usb/host/Kconfig index daa563ff1fa0..1f117c360ebb 100644 --- a/drivers/usb/host/Kconfig +++ b/drivers/usb/host/Kconfig @@ -229,6 +229,8 @@ config USB_EHCI_TEGRA depends on ARCH_TEGRA select USB_EHCI_ROOT_HUB_TT select USB_PHY + select USB_ULPI + select USB_ULPI_VIEWPORT help This driver enables support for the internal USB Host Controllers found in NVIDIA Tegra SoCs. The controllers are EHCI compliant. 
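The mutex_lock(&inode->i_mutex) conversions in the hunks below (and in f_printer.c above) rely on the inode locking wrappers added to include/linux/fs.h in this cycle, presumably along these lines at this point, so that callers no longer depend on the lock's concrete type:

static inline void inode_lock(struct inode *inode)
{
	mutex_lock(&inode->i_mutex);
}

static inline void inode_unlock(struct inode *inode)
{
	mutex_unlock(&inode->i_mutex);
}

static inline void inode_lock_nested(struct inode *inode, unsigned subclass)
{
	mutex_lock_nested(&inode->i_mutex, subclass);
}

static inline int inode_is_locked(struct inode *inode)
{
	return mutex_is_locked(&inode->i_mutex);
}

Decoupling call sites this way is what later allowed the underlying i_mutex to be swapped for a different locking primitive without touching every filesystem again.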
diff --git a/drivers/video/fbdev/core/fb_defio.c b/drivers/video/fbdev/core/fb_defio.c index 3fc63c208d08..57721c73177f 100644 --- a/drivers/video/fbdev/core/fb_defio.c +++ b/drivers/video/fbdev/core/fb_defio.c @@ -78,13 +78,13 @@ int fb_deferred_io_fsync(struct file *file, loff_t start, loff_t end, int datasy if (!info->fbdefio) return 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Kill off the delayed work */ cancel_delayed_work_sync(&info->deferred_work); /* Run it immediately */ schedule_delayed_work(&info->deferred_work, 0); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return 0; } diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 7bf835f85bc8..eadc894faea2 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -449,14 +449,14 @@ static int v9fs_file_fsync(struct file *filp, loff_t start, loff_t end, if (retval) return retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); fid = filp->private_data; v9fs_blank_wstat(&wstat); retval = p9_client_wstat(fid, &wstat); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return retval; } @@ -472,13 +472,13 @@ int v9fs_file_fsync_dotl(struct file *filp, loff_t start, loff_t end, if (retval) return retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); p9_debug(P9_DEBUG_VFS, "filp %p datasync %x\n", filp, datasync); fid = filp->private_data; retval = p9_client_fsync(fid, datasync); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return retval; } diff --git a/fs/affs/file.c b/fs/affs/file.c index 659c579c4588..0548c53f41d5 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -33,11 +33,11 @@ affs_file_release(struct inode *inode, struct file *filp) inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt)); if (atomic_dec_and_test(&AFFS_I(inode)->i_opencnt)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (inode->i_size != AFFS_I(inode)->mmu_private) affs_truncate(inode); affs_free_prealloc(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; @@ -958,12 +958,12 @@ int affs_file_fsync(struct file *filp, loff_t start, loff_t end, int datasync) if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = write_inode_now(inode, 0); err = sync_blockdev(inode->i_sb->s_bdev); if (!ret) ret = err; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } const struct file_operations affs_file_operations = { diff --git a/fs/afs/flock.c b/fs/afs/flock.c index 4baf1d2b39e4..d91a9c9cfbd0 100644 --- a/fs/afs/flock.c +++ b/fs/afs/flock.c @@ -483,7 +483,7 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl) fl->fl_type = F_UNLCK; - mutex_lock(&vnode->vfs_inode.i_mutex); + inode_lock(&vnode->vfs_inode); /* check local lock records first */ ret = 0; @@ -505,7 +505,7 @@ static int afs_do_getlk(struct file *file, struct file_lock *fl) } error: - mutex_unlock(&vnode->vfs_inode.i_mutex); + inode_unlock(&vnode->vfs_inode); _leave(" = %d [%hd]", ret, fl->fl_type); return ret; } diff --git a/fs/afs/write.c b/fs/afs/write.c index 0714abcd7f32..dfef94f70667 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -693,7 +693,7 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* use a writeback record as a marker in the queue - when this reaches * the front of the queue, all the outstanding writes are either @@ -735,7 +735,7 @@ int 
afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) afs_put_writeback(wb); _leave(" = %d", ret); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/attr.c b/fs/attr.c index 6530ced19697..25b24d0f6c88 100644 --- a/fs/attr.c +++ b/fs/attr.c @@ -195,7 +195,7 @@ int notify_change(struct dentry * dentry, struct iattr * attr, struct inode **de struct timespec now; unsigned int ia_valid = attr->ia_valid; - WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); + WARN_ON_ONCE(!inode_is_locked(inode)); if (ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID | ATTR_TIMES_SET)) { if (IS_IMMUTABLE(inode) || IS_APPEND(inode)) diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 78f005f37847..3a3ced779fc7 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -638,11 +638,11 @@ static ssize_t bm_entry_write(struct file *file, const char __user *buffer, case 3: /* Delete this handler. */ root = dget(file->f_path.dentry->d_sb->s_root); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); kill_node(e); - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); dput(root); break; default: @@ -675,7 +675,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, return PTR_ERR(e); root = dget(sb->s_root); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); dentry = lookup_one_len(e->name, root, strlen(e->name)); err = PTR_ERR(dentry); if (IS_ERR(dentry)) @@ -711,7 +711,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, out2: dput(dentry); out: - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); dput(root); if (err) { @@ -754,12 +754,12 @@ static ssize_t bm_status_write(struct file *file, const char __user *buffer, case 3: /* Delete all handlers. 
*/ root = dget(file->f_path.dentry->d_sb->s_root); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); while (!list_empty(&entries)) kill_node(list_entry(entries.next, Node, list)); - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); dput(root); break; default: diff --git a/fs/block_dev.c b/fs/block_dev.c index ba762ea07f67..7b9cd49622b1 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -75,7 +75,7 @@ void kill_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_inode->i_mapping; - if (mapping->nrpages == 0 && mapping->nrshadows == 0) + if (mapping->nrpages == 0 && mapping->nrexceptional == 0) return; invalidate_bh_lrus(); @@ -346,9 +346,9 @@ static loff_t block_llseek(struct file *file, loff_t offset, int whence) struct inode *bd_inode = bdev_file_inode(file); loff_t retval; - mutex_lock(&bd_inode->i_mutex); + inode_lock(bd_inode); retval = fixed_size_llseek(file, offset, whence, i_size_read(bd_inode)); - mutex_unlock(&bd_inode->i_mutex); + inode_unlock(bd_inode); return retval; } @@ -1142,9 +1142,9 @@ void bd_set_size(struct block_device *bdev, loff_t size) { unsigned bsize = bdev_logical_block_size(bdev); - mutex_lock(&bdev->bd_inode->i_mutex); + inode_lock(bdev->bd_inode); i_size_write(bdev->bd_inode, size); - mutex_unlock(&bdev->bd_inode->i_mutex); + inode_unlock(bdev->bd_inode); while (bsize < PAGE_CACHE_SIZE) { if (size & bsize) break; @@ -1741,9 +1741,9 @@ static void blkdev_vm_open(struct vm_area_struct *vma) struct inode *bd_inode = bdev_file_inode(vma->vm_file); struct block_device *bdev = I_BDEV(bd_inode); - mutex_lock(&bd_inode->i_mutex); + inode_lock(bd_inode); bdev->bd_map_count++; - mutex_unlock(&bd_inode->i_mutex); + inode_unlock(bd_inode); } static void blkdev_vm_close(struct vm_area_struct *vma) @@ -1751,9 +1751,9 @@ static void blkdev_vm_close(struct vm_area_struct *vma) struct inode *bd_inode = bdev_file_inode(vma->vm_file); struct block_device *bdev = I_BDEV(bd_inode); - mutex_lock(&bd_inode->i_mutex); + inode_lock(bd_inode); bdev->bd_map_count--; - mutex_unlock(&bd_inode->i_mutex); + inode_unlock(bd_inode); } static const struct vm_operations_struct blkdev_dax_vm_ops = { @@ -1777,7 +1777,7 @@ static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) struct block_device *bdev = I_BDEV(bd_inode); file_accessed(file); - mutex_lock(&bd_inode->i_mutex); + inode_lock(bd_inode); bdev->bd_map_count++; if (IS_DAX(bd_inode)) { vma->vm_ops = &blkdev_dax_vm_ops; @@ -1785,7 +1785,7 @@ static int blkdev_mmap(struct file *file, struct vm_area_struct *vma) } else { vma->vm_ops = &blkdev_default_vm_ops; } - mutex_unlock(&bd_inode->i_mutex); + inode_unlock(bd_inode); return 0; } diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 08405a3da6b1..b90cd3776f8e 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -560,13 +560,13 @@ static int __add_missing_keys(struct btrfs_fs_info *fs_info, */ static void __merge_refs(struct list_head *head, int mode) { - struct __prelim_ref *ref1; + struct __prelim_ref *pos1; - list_for_each_entry(ref1, head, list) { - struct __prelim_ref *ref2 = ref1, *tmp; + list_for_each_entry(pos1, head, list) { + struct __prelim_ref *pos2 = pos1, *tmp; - list_for_each_entry_safe_continue(ref2, tmp, head, list) { - struct __prelim_ref *xchg; + list_for_each_entry_safe_continue(pos2, tmp, head, list) { + struct __prelim_ref *xchg, *ref1 = pos1, *ref2 = pos2; struct extent_inode_elem *eie; if (!ref_for_same_block(ref1, ref2)) diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 
97ad9bbeb35d..bfe4a337fb4d 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -1614,7 +1614,7 @@ struct btrfs_fs_info { spinlock_t delayed_iput_lock; struct list_head delayed_iputs; - struct rw_semaphore delayed_iput_sem; + struct mutex cleaner_delayed_iput_mutex; /* this protects tree_mod_seq_list */ spinlock_t tree_mod_seq_lock; @@ -3641,6 +3641,7 @@ int btrfs_delayed_refs_qgroup_accounting(struct btrfs_trans_handle *trans, int __get_raid_index(u64 flags); int btrfs_start_write_no_snapshoting(struct btrfs_root *root); void btrfs_end_write_no_snapshoting(struct btrfs_root *root); +void btrfs_wait_for_snapshot_creation(struct btrfs_root *root); void check_system_chunk(struct btrfs_trans_handle *trans, struct btrfs_root *root, const u64 type); diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 1e668fb7dd4c..cbb7dbfb3fff 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -614,7 +614,7 @@ static void btrfs_dev_replace_update_device_in_mapping_tree( em = lookup_extent_mapping(em_tree, start, (u64)-1); if (!em) break; - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) if (srcdev == map->stripes[i].dev) map->stripes[i].dev = tgtdev; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index e99ccd6ffb2c..dd08e29f5117 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -55,6 +55,12 @@ #include <asm/cpufeature.h> #endif +#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ + BTRFS_HEADER_FLAG_RELOC |\ + BTRFS_SUPER_FLAG_ERROR |\ + BTRFS_SUPER_FLAG_SEEDING |\ + BTRFS_SUPER_FLAG_METADUMP) + static const struct extent_io_ops btree_extent_io_ops; static void end_workqueue_fn(struct btrfs_work *work); static void free_fs_root(struct btrfs_root *root); @@ -1583,8 +1589,23 @@ int btrfs_init_fs_root(struct btrfs_root *root) ret = get_anon_bdev(&root->anon_dev); if (ret) goto free_writers; + + mutex_lock(&root->objectid_mutex); + ret = btrfs_find_highest_objectid(root, + &root->highest_objectid); + if (ret) { + mutex_unlock(&root->objectid_mutex); + goto free_root_dev; + } + + ASSERT(root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); + + mutex_unlock(&root->objectid_mutex); + return 0; +free_root_dev: + free_anon_bdev(root->anon_dev); free_writers: btrfs_free_subvolume_writers(root->subv_writers); fail: @@ -1786,7 +1807,10 @@ static int cleaner_kthread(void *arg) goto sleep; } + mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex); btrfs_run_delayed_iputs(root); + mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex); + again = btrfs_clean_one_deleted_snapshot(root); mutex_unlock(&root->fs_info->cleaner_mutex); @@ -2556,8 +2580,8 @@ int open_ctree(struct super_block *sb, mutex_init(&fs_info->delete_unused_bgs_mutex); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); + mutex_init(&fs_info->cleaner_delayed_iput_mutex); seqlock_init(&fs_info->profiles_lock); - init_rwsem(&fs_info->delayed_iput_sem); INIT_LIST_HEAD(&fs_info->dirty_cowonly_roots); INIT_LIST_HEAD(&fs_info->space_info); @@ -2742,26 +2766,6 @@ int open_ctree(struct super_block *sb, goto fail_alloc; } - /* - * Leafsize and nodesize were always equal, this is only a sanity check. - */ - if (le32_to_cpu(disk_super->__unused_leafsize) != - btrfs_super_nodesize(disk_super)) { - printk(KERN_ERR "BTRFS: couldn't mount because metadata " - "blocksizes don't match. 
node %d leaf %d\n", - btrfs_super_nodesize(disk_super), - le32_to_cpu(disk_super->__unused_leafsize)); - err = -EINVAL; - goto fail_alloc; - } - if (btrfs_super_nodesize(disk_super) > BTRFS_MAX_METADATA_BLOCKSIZE) { - printk(KERN_ERR "BTRFS: couldn't mount because metadata " - "blocksize (%d) was too large\n", - btrfs_super_nodesize(disk_super)); - err = -EINVAL; - goto fail_alloc; - } - features = btrfs_super_incompat_flags(disk_super); features |= BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF; if (tree_root->fs_info->compress_type == BTRFS_COMPRESS_LZO) @@ -2833,17 +2837,6 @@ int open_ctree(struct super_block *sb, sb->s_blocksize = sectorsize; sb->s_blocksize_bits = blksize_bits(sectorsize); - if (btrfs_super_magic(disk_super) != BTRFS_MAGIC) { - printk(KERN_ERR "BTRFS: valid FS not found on %s\n", sb->s_id); - goto fail_sb_buffer; - } - - if (sectorsize != PAGE_SIZE) { - printk(KERN_ERR "BTRFS: incompatible sector size (%lu) " - "found on %s\n", (unsigned long)sectorsize, sb->s_id); - goto fail_sb_buffer; - } - mutex_lock(&fs_info->chunk_mutex); ret = btrfs_read_sys_array(tree_root); mutex_unlock(&fs_info->chunk_mutex); @@ -2915,6 +2908,18 @@ retry_root_backup: tree_root->commit_root = btrfs_root_node(tree_root); btrfs_set_root_refs(&tree_root->root_item, 1); + mutex_lock(&tree_root->objectid_mutex); + ret = btrfs_find_highest_objectid(tree_root, + &tree_root->highest_objectid); + if (ret) { + mutex_unlock(&tree_root->objectid_mutex); + goto recovery_tree_root; + } + + ASSERT(tree_root->highest_objectid <= BTRFS_LAST_FREE_OBJECTID); + + mutex_unlock(&tree_root->objectid_mutex); + ret = btrfs_read_roots(fs_info, tree_root); if (ret) goto recovery_tree_root; @@ -4018,8 +4023,17 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, int read_only) { struct btrfs_super_block *sb = fs_info->super_copy; + u64 nodesize = btrfs_super_nodesize(sb); + u64 sectorsize = btrfs_super_sectorsize(sb); int ret = 0; + if (btrfs_super_magic(sb) != BTRFS_MAGIC) { + printk(KERN_ERR "BTRFS: no valid FS found\n"); + ret = -EINVAL; + } + if (btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP) + printk(KERN_WARNING "BTRFS: unrecognized super flag: %llu\n", + btrfs_super_flags(sb) & ~BTRFS_SUPER_FLAG_SUPP); if (btrfs_super_root_level(sb) >= BTRFS_MAX_LEVEL) { printk(KERN_ERR "BTRFS: tree_root level too big: %d >= %d\n", btrfs_super_root_level(sb), BTRFS_MAX_LEVEL);
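The sectorsize rule enforced in the next hunk (a power of two between 4K and BTRFS_MAX_METADATA_BLOCKSIZE, i.e. 64K) reduces to a check like the following stand-alone sketch (not the kernel's is_power_of_2() helper itself):

#include <stdbool.h>
#include <stdint.h>

#define MAX_METADATA_BLOCKSIZE 65536	/* BTRFS_MAX_METADATA_BLOCKSIZE */

static bool sectorsize_valid(uint64_t s)
{
	bool pow2 = s != 0 && (s & (s - 1)) == 0;	/* is_power_of_2() */

	return pow2 && s >= 4096 && s <= MAX_METADATA_BLOCKSIZE;
}

/* sectorsize_valid(4096) == true; sectorsize_valid(12288) == false */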
@@ -4037,31 +4051,46 @@ static int btrfs_check_super_valid(struct btrfs_fs_info *fs_info, } /* - * The common minimum, we don't know if we can trust the nodesize/sectorsize - * items yet, they'll be verified later. Issue just a warning. + * Check sectorsize and nodesize first; other checks will need them. + * Check all possible sectorsizes (4K, 8K, 16K, 32K, 64K) here. */ - if (!IS_ALIGNED(btrfs_super_root(sb), 4096)) + if (!is_power_of_2(sectorsize) || sectorsize < 4096 || + sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) { + printk(KERN_ERR "BTRFS: invalid sectorsize %llu\n", sectorsize); + ret = -EINVAL; + } + /* Only PAGE_SIZE is supported for now */ + if (sectorsize != PAGE_CACHE_SIZE) { + printk(KERN_ERR "BTRFS: sectorsize %llu not supported yet, only %lu is supported\n", + sectorsize, PAGE_CACHE_SIZE); + ret = -EINVAL; + } + if (!is_power_of_2(nodesize) || nodesize < sectorsize || + nodesize > BTRFS_MAX_METADATA_BLOCKSIZE) { + printk(KERN_ERR "BTRFS: invalid nodesize %llu\n", nodesize); + ret = -EINVAL; + } + if (nodesize != le32_to_cpu(sb->__unused_leafsize)) { + printk(KERN_ERR "BTRFS: invalid leafsize %u, should be %llu\n", + le32_to_cpu(sb->__unused_leafsize), + nodesize); + ret = -EINVAL; + } + + /* Root alignment check */ + if (!IS_ALIGNED(btrfs_super_root(sb), sectorsize)) { printk(KERN_WARNING "BTRFS: tree_root block unaligned: %llu\n", btrfs_super_root(sb)); - if (!IS_ALIGNED(btrfs_super_chunk_root(sb), 4096)) + ret = -EINVAL; + } + if (!IS_ALIGNED(btrfs_super_chunk_root(sb), sectorsize)) { printk(KERN_WARNING "BTRFS: chunk_root block unaligned: %llu\n", btrfs_super_chunk_root(sb)); - if (!IS_ALIGNED(btrfs_super_log_root(sb), 4096)) - printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n", - btrfs_super_log_root(sb)); - - /* - * Check the lower bound, the alignment and other constraints are - * checked later. - */ - if (btrfs_super_nodesize(sb) < 4096) { - printk(KERN_ERR "BTRFS: nodesize too small: %u < 4096\n", - btrfs_super_nodesize(sb)); ret = -EINVAL; } - if (btrfs_super_sectorsize(sb) < 4096) { - printk(KERN_ERR "BTRFS: sectorsize too small: %u < 4096\n", - btrfs_super_sectorsize(sb)); + if (!IS_ALIGNED(btrfs_super_log_root(sb), sectorsize)) { + printk(KERN_WARNING "BTRFS: log_root block unaligned: %llu\n", + btrfs_super_log_root(sb)); ret = -EINVAL; } diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 60cc1399c64f..e2287c7c10be 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -4139,8 +4139,10 @@ commit_trans: !atomic_read(&root->fs_info->open_ioctl_trans)) { need_commit--; - if (need_commit > 0) + if (need_commit > 0) { + btrfs_start_delalloc_roots(fs_info, 0, -1); btrfs_wait_ordered_roots(fs_info, -1); + } trans = btrfs_join_transaction(root); if (IS_ERR(trans)) @@ -4153,11 +4155,12 @@ commit_trans: if (ret) return ret; /* - * make sure that all running delayed iput are - * done + * The cleaner kthread might still be doing iput + * operations. Wait for it to finish so that + * more space is released. */ - down_write(&root->fs_info->delayed_iput_sem); - up_write(&root->fs_info->delayed_iput_sem); + mutex_lock(&root->fs_info->cleaner_delayed_iput_mutex); + mutex_unlock(&root->fs_info->cleaner_delayed_iput_mutex); goto again; } else { btrfs_end_transaction(trans, root); @@ -10399,7 +10402,7 @@ btrfs_start_trans_remove_block_group(struct btrfs_fs_info *fs_info, * more device items and remove one chunk item), but this is done at * btrfs_remove_chunk() through a call to check_system_chunk(). 
*/ - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; num_items = 3 + map->num_stripes; free_extent_map(em); @@ -10586,7 +10589,7 @@ int btrfs_init_space_info(struct btrfs_fs_info *fs_info) disk_super = fs_info->super_copy; if (!btrfs_super_root(disk_super)) - return 1; + return -EINVAL; features = btrfs_super_incompat_flags(disk_super); if (features & BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS) @@ -10816,3 +10819,23 @@ int btrfs_start_write_no_snapshoting(struct btrfs_root *root) } return 1; } + +static int wait_snapshoting_atomic_t(atomic_t *a) +{ + schedule(); + return 0; +} + +void btrfs_wait_for_snapshot_creation(struct btrfs_root *root) +{ + while (true) { + int ret; + + ret = btrfs_start_write_no_snapshoting(root); + if (ret) + break; + wait_on_atomic_t(&root->will_be_snapshoted, + wait_snapshoting_atomic_t, + TASK_UNINTERRUPTIBLE); + } +} diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 6a98bddd8f33..84fb56d5c018 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -76,7 +76,7 @@ void free_extent_map(struct extent_map *em) WARN_ON(extent_map_in_tree(em)); WARN_ON(!list_empty(&em->list)); if (test_bit(EXTENT_FLAG_FS_MAPPING, &em->flags)) - kfree(em->bdev); + kfree(em->map_lookup); kmem_cache_free(extent_map_cache, em); } } diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index b2991fd8583e..eb8b8fae036b 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -32,7 +32,15 @@ struct extent_map { u64 block_len; u64 generation; unsigned long flags; - struct block_device *bdev; + union { + struct block_device *bdev; + + /* + * used for chunk mappings + * flags & EXTENT_FLAG_FS_MAPPING must be set + */ + struct map_lookup *map_lookup; + }; atomic_t refs; unsigned int compress_type; struct list_head list; diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 83d7859d7619..098bb8f690c9 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -406,8 +406,7 @@ int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info) /* simple helper to fault in pages and copy. This should go away * and be replaced with calls into generic code. */ -static noinline int btrfs_copy_from_user(loff_t pos, int num_pages, - size_t write_bytes, +static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes, struct page **prepared_pages, struct iov_iter *i) { @@ -1588,8 +1587,7 @@ again: ret = 0; } - copied = btrfs_copy_from_user(pos, num_pages, - write_bytes, pages, i); + copied = btrfs_copy_from_user(pos, write_bytes, pages, i); /* * if we have trouble faulting in the pages, fall @@ -1764,17 +1762,17 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, loff_t pos; size_t count; - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = generic_write_checks(iocb, from); if (err <= 0) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } current->backing_dev_info = inode_to_bdi(inode); err = file_remove_privs(file); if (err) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } @@ -1785,7 +1783,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, * to stop this write operation to ensure FS consistency. 
*/ if (test_bit(BTRFS_FS_STATE_ERROR, &root->fs_info->fs_state)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); err = -EROFS; goto out; } @@ -1806,7 +1804,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, end_pos = round_up(pos + count, root->sectorsize); err = btrfs_cont_expand(inode, i_size_read(inode), end_pos); if (err) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } } @@ -1822,7 +1820,7 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, iocb->ki_pos = pos + num_written; } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * We also have to set last_sub_trans to the current log transid, @@ -1911,7 +1909,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); atomic_inc(&root->log_batch); full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); @@ -1963,7 +1961,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) ret = start_ordered_ops(inode, start, end); } if (ret) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } atomic_inc(&root->log_batch); @@ -2009,7 +2007,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } @@ -2033,7 +2031,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } trans->sync = true; @@ -2056,7 +2054,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. */ - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * If any of the ordered extents had an error, just return it to user @@ -2305,7 +2303,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ino_size = round_up(inode->i_size, PAGE_CACHE_SIZE); ret = find_first_non_hole(inode, &offset, &len); if (ret < 0) @@ -2345,7 +2343,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) truncated_page = true; ret = btrfs_truncate_page(inode, offset, 0, 0); if (ret) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } } @@ -2421,7 +2419,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) ret = btrfs_wait_ordered_range(inode, lockstart, lockend - lockstart + 1); if (ret) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } } @@ -2576,7 +2574,7 @@ out_only_mutex: ret = btrfs_end_transaction(trans, root); } } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret && !err) err = ret; return err; @@ -2660,7 +2658,7 @@ static long btrfs_fallocate(struct file *file, int mode, if (ret < 0) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = inode_newsize_ok(inode, alloc_end); if (ret) goto out; @@ -2818,7 +2816,7 @@ out: * So this is completely used as cleanup. */ btrfs_qgroup_free_data(inode, alloc_start, alloc_end - alloc_start); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* Let go of our reservation. 
*/ btrfs_free_reserved_data_space(inode, alloc_start, alloc_end - alloc_start); @@ -2894,7 +2892,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file->f_mapping->host; int ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); switch (whence) { case SEEK_END: case SEEK_CUR: @@ -2903,20 +2901,20 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) case SEEK_DATA: case SEEK_HOLE: if (offset >= i_size_read(inode)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -ENXIO; } ret = find_desired_extent(inode, &offset, whence); if (ret) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } } offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return offset; } diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c index 8b57c17b3fb3..e50316c4af15 100644 --- a/fs/btrfs/inode-map.c +++ b/fs/btrfs/inode-map.c @@ -515,7 +515,7 @@ out: return ret; } -static int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) +int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid) { struct btrfs_path *path; int ret; @@ -555,13 +555,6 @@ int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid) int ret; mutex_lock(&root->objectid_mutex); - if (unlikely(root->highest_objectid < BTRFS_FIRST_FREE_OBJECTID)) { - ret = btrfs_find_highest_objectid(root, - &root->highest_objectid); - if (ret) - goto out; - } - if (unlikely(root->highest_objectid >= BTRFS_LAST_FREE_OBJECTID)) { ret = -ENOSPC; goto out; diff --git a/fs/btrfs/inode-map.h b/fs/btrfs/inode-map.h index ddb347bfee23..c8e864b2d530 100644 --- a/fs/btrfs/inode-map.h +++ b/fs/btrfs/inode-map.h @@ -9,5 +9,6 @@ int btrfs_save_ino_cache(struct btrfs_root *root, struct btrfs_trans_handle *trans); int btrfs_find_free_objectid(struct btrfs_root *root, u64 *objectid); +int btrfs_find_highest_objectid(struct btrfs_root *root, u64 *objectid); #endif diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 247830107686..e28f3d4691af 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -3134,7 +3134,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - down_read(&fs_info->delayed_iput_sem); spin_lock(&fs_info->delayed_iput_lock); while (!list_empty(&fs_info->delayed_iputs)) { struct btrfs_inode *inode; @@ -3153,7 +3152,6 @@ void btrfs_run_delayed_iputs(struct btrfs_root *root) spin_lock(&fs_info->delayed_iput_lock); } spin_unlock(&fs_info->delayed_iput_lock); - up_read(&root->fs_info->delayed_iput_sem); } /* @@ -4874,26 +4872,6 @@ next: return err; } -static int wait_snapshoting_atomic_t(atomic_t *a) -{ - schedule(); - return 0; -} - -static void wait_for_snapshot_creation(struct btrfs_root *root) -{ - while (true) { - int ret; - - ret = btrfs_start_write_no_snapshoting(root); - if (ret) - break; - wait_on_atomic_t(&root->will_be_snapshoted, - wait_snapshoting_atomic_t, - TASK_UNINTERRUPTIBLE); - } -} - static int btrfs_setsize(struct inode *inode, struct iattr *attr) { struct btrfs_root *root = BTRFS_I(inode)->root; @@ -4925,7 +4903,7 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr) * truncation, it must capture all writes that happened before * this truncation. 
*/ - wait_for_snapshot_creation(root); + btrfs_wait_for_snapshot_creation(root); ret = btrfs_cont_expand(inode, oldsize, newsize); if (ret) { btrfs_end_write_no_snapshoting(root); @@ -8469,7 +8447,7 @@ static ssize_t btrfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, * not unlock the i_mutex at this case. */ if (offset + count <= inode->i_size) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); relock = true; } ret = btrfs_delalloc_reserve_space(inode, offset, count); @@ -8526,7 +8504,7 @@ out: if (wakeup) inode_dio_end(inode); if (relock) - mutex_lock(&inode->i_mutex); + inode_lock(inode); return ret; } diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 2a47a3148ec8..952172ca7e45 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -240,7 +240,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ip_oldflags = ip->flags; i_oldflags = inode->i_flags; @@ -358,7 +358,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mnt_drop_write_file(file); return ret; } @@ -568,6 +568,10 @@ static noinline int create_subvol(struct inode *dir, goto fail; } + mutex_lock(&new_root->objectid_mutex); + new_root->highest_objectid = new_dirid; + mutex_unlock(&new_root->objectid_mutex); + /* * insert the directory item */ @@ -877,7 +881,7 @@ out_up_read: out_dput: dput(dentry); out_unlock: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return error; } @@ -1389,18 +1393,18 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ra_index += cluster; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) BTRFS_I(inode)->force_compress = compress_type; ret = cluster_pages_for_defrag(inode, pages, i, cluster); if (ret < 0) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out_ra; } defrag_count += ret; balance_dirty_pages_ratelimited(inode->i_mapping); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (newer_than) { if (newer_off == (u64)-1) @@ -1461,9 +1465,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, out_ra: if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); BTRFS_I(inode)->force_compress = BTRFS_COMPRESS_NONE; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } if (!file) kfree(ra); @@ -2426,7 +2430,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out_dput; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * Don't allow to delete a subvolume with send in progress. 
This is @@ -2539,7 +2543,7 @@ out_up_write: spin_unlock(&dest->root_item_lock); } out_unlock_inode: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!err) { d_invalidate(dentry); btrfs_invalidate_inodes(dest); @@ -2555,7 +2559,7 @@ out_unlock_inode: out_dput: dput(dentry); out_unlock_dir: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); out_drop_write: mnt_drop_write_file(file); out: @@ -2853,8 +2857,8 @@ static inline void lock_extent_range(struct inode *inode, u64 off, u64 len) static void btrfs_double_inode_unlock(struct inode *inode1, struct inode *inode2) { - mutex_unlock(&inode1->i_mutex); - mutex_unlock(&inode2->i_mutex); + inode_unlock(inode1); + inode_unlock(inode2); } static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) @@ -2862,8 +2866,8 @@ static void btrfs_double_inode_lock(struct inode *inode1, struct inode *inode2) if (inode1 < inode2) swap(inode1, inode2); - mutex_lock_nested(&inode1->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(inode1, I_MUTEX_PARENT); + inode_lock_nested(inode2, I_MUTEX_CHILD); } static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1, @@ -3022,7 +3026,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, return 0; if (same_inode) { - mutex_lock(&src->i_mutex); + inode_lock(src); ret = extent_same_check_offsets(src, loff, &len, olen); if (ret) @@ -3097,7 +3101,7 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, btrfs_cmp_data_free(&cmp); out_unlock: if (same_inode) - mutex_unlock(&src->i_mutex); + inode_unlock(src); else btrfs_double_inode_unlock(src, dst); @@ -3745,7 +3749,7 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, if (!same_inode) { btrfs_double_inode_lock(src, inode); } else { - mutex_lock(&src->i_mutex); + inode_lock(src); } /* determine range to clone */ @@ -3816,7 +3820,7 @@ out_unlock: if (!same_inode) btrfs_double_inode_unlock(src, inode); else - mutex_unlock(&src->i_mutex); + inode_unlock(src); return ret; } diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 6d707545f775..55161369fab1 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -609,13 +609,28 @@ static int rbio_can_merge(struct btrfs_raid_bio *last, return 1; } +static int rbio_stripe_page_index(struct btrfs_raid_bio *rbio, int stripe, + int index) +{ + return stripe * rbio->stripe_npages + index; +} + +/* + * these are just the pages from the rbio array, not from anything + * the FS sent down to us + */ +static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, + int index) +{ + return rbio->stripe_pages[rbio_stripe_page_index(rbio, stripe, index)]; +} + /* * helper to index into the pstripe */ static struct page *rbio_pstripe_page(struct btrfs_raid_bio *rbio, int index) { - index += (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; - return rbio->stripe_pages[index]; + return rbio_stripe_page(rbio, rbio->nr_data, index); } /* @@ -626,10 +641,7 @@ static struct page *rbio_qstripe_page(struct btrfs_raid_bio *rbio, int index) { if (rbio->nr_data + 1 == rbio->real_stripes) return NULL; - - index += ((rbio->nr_data + 1) * rbio->stripe_len) >> - PAGE_CACHE_SHIFT; - return rbio->stripe_pages[index]; + return rbio_stripe_page(rbio, rbio->nr_data + 1, index); } /* @@ -889,6 +901,7 @@ static void raid_write_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; int err = bio->bi_error; + int max_errors; if (err) fail_bio_stripe(rbio, bio); @@ -901,7 
+914,9 @@ static void raid_write_end_io(struct bio *bio) err = 0; /* OK, we have read all the stripes we need to. */ - if (atomic_read(&rbio->error) > rbio->bbio->max_errors) + max_errors = (rbio->operation == BTRFS_RBIO_PARITY_SCRUB) ? + 0 : rbio->bbio->max_errors; + if (atomic_read(&rbio->error) > max_errors) err = -EIO; rbio_orig_end_io(rbio, err); @@ -947,8 +962,7 @@ static struct page *page_in_rbio(struct btrfs_raid_bio *rbio, */ static unsigned long rbio_nr_pages(unsigned long stripe_len, int nr_stripes) { - unsigned long nr = stripe_len * nr_stripes; - return DIV_ROUND_UP(nr, PAGE_CACHE_SIZE); + return DIV_ROUND_UP(stripe_len, PAGE_CACHE_SIZE) * nr_stripes; } /* @@ -966,8 +980,8 @@ static struct btrfs_raid_bio *alloc_rbio(struct btrfs_root *root, void *p; rbio = kzalloc(sizeof(*rbio) + num_pages * sizeof(struct page *) * 2 + - DIV_ROUND_UP(stripe_npages, BITS_PER_LONG / 8), - GFP_NOFS); + DIV_ROUND_UP(stripe_npages, BITS_PER_LONG) * + sizeof(long), GFP_NOFS); if (!rbio) return ERR_PTR(-ENOMEM); @@ -1021,18 +1035,17 @@ static int alloc_rbio_pages(struct btrfs_raid_bio *rbio) if (!page) return -ENOMEM; rbio->stripe_pages[i] = page; - ClearPageUptodate(page); } return 0; } -/* allocate pages for just the p/q stripes */ +/* only allocate pages for p/q stripes */ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio) { int i; struct page *page; - i = (rbio->nr_data * rbio->stripe_len) >> PAGE_CACHE_SHIFT; + i = rbio_stripe_page_index(rbio, rbio->nr_data, 0); for (; i < rbio->nr_pages; i++) { if (rbio->stripe_pages[i]) @@ -1121,18 +1134,6 @@ static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio) } /* - * these are just the pages from the rbio array, not from anything - * the FS sent down to us - */ -static struct page *rbio_stripe_page(struct btrfs_raid_bio *rbio, int stripe, int page) -{ - int index; - index = stripe * (rbio->stripe_len >> PAGE_CACHE_SHIFT); - index += page; - return rbio->stripe_pages[index]; -} - -/* * helper function to walk our bio list and populate the bio_pages array with * the result. This seems expensive, but it is faster than constantly * searching through the bio list as we setup the IO in finish_rmw or stripe @@ -1175,7 +1176,6 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) { struct btrfs_bio *bbio = rbio->bbio; void *pointers[rbio->real_stripes]; - int stripe_len = rbio->stripe_len; int nr_data = rbio->nr_data; int stripe; int pagenr; @@ -1183,7 +1183,6 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) int q_stripe = -1; struct bio_list bio_list; struct bio *bio; - int pages_per_stripe = stripe_len >> PAGE_CACHE_SHIFT; int ret; bio_list_init(&bio_list); @@ -1226,7 +1225,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) else clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *p; /* first collect one page from each data stripe */ for (stripe = 0; stripe < nr_data; stripe++) { @@ -1268,7 +1267,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) * everything else. 
*/ for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *page; if (stripe < rbio->nr_data) { page = page_in_rbio(rbio, stripe, pagenr, 1); @@ -1292,7 +1291,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) if (!bbio->tgtdev_map[stripe]) continue; - for (pagenr = 0; pagenr < pages_per_stripe; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *page; if (stripe < rbio->nr_data) { page = page_in_rbio(rbio, stripe, pagenr, 1); @@ -1506,7 +1505,6 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) int bios_to_read = 0; struct bio_list bio_list; int ret; - int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); int pagenr; int stripe; struct bio *bio; @@ -1525,7 +1523,7 @@ static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio) * stripe */ for (stripe = 0; stripe < rbio->nr_data; stripe++) { - for (pagenr = 0; pagenr < nr_pages; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *page; /* * we want to find all the pages missing from @@ -1801,7 +1799,6 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) int pagenr, stripe; void **pointers; int faila = -1, failb = -1; - int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); struct page *page; int err; int i; @@ -1824,7 +1821,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) index_rbio_pages(rbio); - for (pagenr = 0; pagenr < nr_pages; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { /* * Now we just use bitmap to mark the horizontal stripes in * which we have data when doing parity scrub. @@ -1935,7 +1932,7 @@ pstripe: * other endio functions will fiddle the uptodate bits */ if (rbio->operation == BTRFS_RBIO_WRITE) { - for (i = 0; i < nr_pages; i++) { + for (i = 0; i < rbio->stripe_npages; i++) { if (faila != -1) { page = rbio_stripe_page(rbio, faila, i); SetPageUptodate(page); @@ -2031,7 +2028,6 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) int bios_to_read = 0; struct bio_list bio_list; int ret; - int nr_pages = DIV_ROUND_UP(rbio->stripe_len, PAGE_CACHE_SIZE); int pagenr; int stripe; struct bio *bio; @@ -2055,7 +2051,7 @@ static int __raid56_parity_recover(struct btrfs_raid_bio *rbio) continue; } - for (pagenr = 0; pagenr < nr_pages; pagenr++) { + for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) { struct page *p; /* @@ -2279,37 +2275,11 @@ static int alloc_rbio_essential_pages(struct btrfs_raid_bio *rbio) if (!page) return -ENOMEM; rbio->stripe_pages[index] = page; - ClearPageUptodate(page); } } return 0; } -/* - * end io function used by finish_rmw. 
When we finally - * get here, we've written a full stripe - */ -static void raid_write_parity_end_io(struct bio *bio) -{ - struct btrfs_raid_bio *rbio = bio->bi_private; - int err = bio->bi_error; - - if (bio->bi_error) - fail_bio_stripe(rbio, bio); - - bio_put(bio); - - if (!atomic_dec_and_test(&rbio->stripes_pending)) - return; - - err = 0; - - if (atomic_read(&rbio->error)) - err = -EIO; - - rbio_orig_end_io(rbio, err); -} - static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, int need_check) { @@ -2462,7 +2432,7 @@ submit_write: break; bio->bi_private = rbio; - bio->bi_end_io = raid_write_parity_end_io; + bio->bi_end_io = raid_write_end_io; submit_bio(WRITE, bio); } return; diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index ef6d8fc85853..fd1c4d982463 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -3030,7 +3030,7 @@ int prealloc_file_extent_cluster(struct inode *inode, int ret = 0; BUG_ON(cluster->start != cluster->boundary[0]); - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = btrfs_check_data_free_space(inode, cluster->start, cluster->end + 1 - cluster->start); @@ -3057,7 +3057,7 @@ int prealloc_file_extent_cluster(struct inode *inode, btrfs_free_reserved_data_space(inode, cluster->start, cluster->end + 1 - cluster->start); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 0c981ebe2acb..92bf5ee732fb 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -2813,7 +2813,7 @@ out: static inline int scrub_calc_parity_bitmap_len(int nsectors) { - return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * (BITS_PER_LONG / 8); + return DIV_ROUND_UP(nsectors, BITS_PER_LONG) * sizeof(long); } static void scrub_parity_get(struct scrub_parity *sparity) @@ -3458,7 +3458,7 @@ static noinline_for_stack int scrub_chunk(struct scrub_ctx *sctx, return ret; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; if (em->start != chunk_offset) goto out; @@ -4279,7 +4279,7 @@ static int copy_nocow_pages_for_inode(u64 inum, u64 offset, u64 root, return PTR_ERR(inode); /* Avoid truncate/dio/punch hole.. */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); inode_dio_wait(inode); physical_for_dev_replace = nocow_ctx->physical_for_dev_replace; @@ -4358,7 +4358,7 @@ next_page: } ret = COPY_COMPLETE; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); iput(inode); return ret; } diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index 9b9eab6d048e..d41e09fe8e38 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -383,6 +383,9 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) int ret = 0; char *compress_type; bool compress_force = false; + enum btrfs_compression_type saved_compress_type; + bool saved_compress_force; + int no_compress = 0; cache_gen = btrfs_super_cache_generation(root->fs_info->super_copy); if (btrfs_fs_compat_ro(root->fs_info, FREE_SPACE_TREE)) @@ -462,6 +465,10 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) /* Fallthrough */ case Opt_compress: case Opt_compress_type: + saved_compress_type = btrfs_test_opt(root, COMPRESS) ? 
+ info->compress_type : BTRFS_COMPRESS_NONE; + saved_compress_force = + btrfs_test_opt(root, FORCE_COMPRESS); if (token == Opt_compress || token == Opt_compress_force || strcmp(args[0].from, "zlib") == 0) { @@ -470,6 +477,7 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_set_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, NODATACOW); btrfs_clear_opt(info->mount_opt, NODATASUM); + no_compress = 0; } else if (strcmp(args[0].from, "lzo") == 0) { compress_type = "lzo"; info->compress_type = BTRFS_COMPRESS_LZO; @@ -477,25 +485,21 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) btrfs_clear_opt(info->mount_opt, NODATACOW); btrfs_clear_opt(info->mount_opt, NODATASUM); btrfs_set_fs_incompat(info, COMPRESS_LZO); + no_compress = 0; } else if (strncmp(args[0].from, "no", 2) == 0) { compress_type = "no"; btrfs_clear_opt(info->mount_opt, COMPRESS); btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); compress_force = false; + no_compress++; } else { ret = -EINVAL; goto out; } if (compress_force) { - btrfs_set_and_info(root, FORCE_COMPRESS, - "force %s compression", - compress_type); + btrfs_set_opt(info->mount_opt, FORCE_COMPRESS); } else { - if (!btrfs_test_opt(root, COMPRESS)) - btrfs_info(root->fs_info, - "btrfs: use %s compression", - compress_type); /* * If we remount from compress-force=xxx to * compress=xxx, we need clear FORCE_COMPRESS @@ -504,6 +508,17 @@ int btrfs_parse_options(struct btrfs_root *root, char *options) */ btrfs_clear_opt(info->mount_opt, FORCE_COMPRESS); } + if ((btrfs_test_opt(root, COMPRESS) && + (info->compress_type != saved_compress_type || + compress_force != saved_compress_force)) || + (!btrfs_test_opt(root, COMPRESS) && + no_compress == 1)) { + btrfs_info(root->fs_info, + "%s %s compression", + (compress_force) ? 
"force" : "use", + compress_type); + } + compress_force = false; break; case Opt_ssd: btrfs_set_and_info(root, SSD, diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index c32abbca9d77..366b335946fa 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -108,7 +108,7 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = { }, }; -const u64 const btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { +const u64 btrfs_raid_group[BTRFS_NR_RAID_TYPES] = { [BTRFS_RAID_RAID10] = BTRFS_BLOCK_GROUP_RAID10, [BTRFS_RAID_RAID1] = BTRFS_BLOCK_GROUP_RAID1, [BTRFS_RAID_DUP] = BTRFS_BLOCK_GROUP_DUP, @@ -233,6 +233,7 @@ static struct btrfs_device *__alloc_device(void) spin_lock_init(&dev->reada_lock); atomic_set(&dev->reada_in_flight, 0); atomic_set(&dev->dev_stats_ccnt, 0); + btrfs_device_data_ordered_init(dev); INIT_RADIX_TREE(&dev->reada_zones, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); INIT_RADIX_TREE(&dev->reada_extents, GFP_NOFS & ~__GFP_DIRECT_RECLAIM); @@ -1183,7 +1184,7 @@ again: struct map_lookup *map; int i; - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { u64 end; @@ -2755,7 +2756,7 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, free_extent_map(em); return -EINVAL; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; lock_chunks(root->fs_info->chunk_root); check_system_chunk(trans, extent_root, map->type); unlock_chunks(root->fs_info->chunk_root); @@ -3751,7 +3752,7 @@ int btrfs_balance(struct btrfs_balance_control *bctl, if (btrfs_get_num_tolerated_disk_barrier_failures(bctl->meta.target) < btrfs_get_num_tolerated_disk_barrier_failures(bctl->data.target)) { btrfs_warn(fs_info, - "metatdata profile 0x%llx has lower redundancy than data profile 0x%llx", + "metadata profile 0x%llx has lower redundancy than data profile 0x%llx", bctl->meta.target, bctl->data.target); } @@ -4718,7 +4719,7 @@ static int __btrfs_alloc_chunk(struct btrfs_trans_handle *trans, goto error; } set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); - em->bdev = (struct block_device *)map; + em->map_lookup = map; em->start = start; em->len = num_bytes; em->block_start = 0; @@ -4813,7 +4814,7 @@ int btrfs_finish_chunk_alloc(struct btrfs_trans_handle *trans, return -EINVAL; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; item_size = btrfs_chunk_item_size(map->num_stripes); stripe_size = em->orig_block_len; @@ -4968,7 +4969,7 @@ int btrfs_chunk_readonly(struct btrfs_root *root, u64 chunk_offset) if (!em) return 1; - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { if (map->stripes[i].dev->missing) { miss_ndevs++; @@ -5048,7 +5049,7 @@ int btrfs_num_copies(struct btrfs_fs_info *fs_info, u64 logical, u64 len) return 1; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; if (map->type & (BTRFS_BLOCK_GROUP_DUP | BTRFS_BLOCK_GROUP_RAID1)) ret = map->num_stripes; else if (map->type & BTRFS_BLOCK_GROUP_RAID10) @@ -5084,7 +5085,7 @@ unsigned long btrfs_full_stripe_len(struct btrfs_root *root, BUG_ON(!em); BUG_ON(em->start > logical || em->start + em->len < logical); - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) len = map->stripe_len * nr_data_stripes(map); free_extent_map(em); @@ -5105,7 +5106,7 @@ int btrfs_is_parity_mirror(struct btrfs_mapping_tree *map_tree, BUG_ON(!em); BUG_ON(em->start > logical || em->start + em->len < logical); - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; if (map->type & 
BTRFS_BLOCK_GROUP_RAID56_MASK) ret = 1; free_extent_map(em); @@ -5264,7 +5265,7 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, return -EINVAL; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; offset = logical - em->start; stripe_len = map->stripe_len; @@ -5378,35 +5379,33 @@ static int __btrfs_map_block(struct btrfs_fs_info *fs_info, int rw, * target drive. */ for (i = 0; i < tmp_num_stripes; i++) { - if (tmp_bbio->stripes[i].dev->devid == srcdev_devid) { - /* - * In case of DUP, in order to keep it - * simple, only add the mirror with the - * lowest physical address - */ - if (found && - physical_of_found <= - tmp_bbio->stripes[i].physical) - continue; - index_srcdev = i; - found = 1; - physical_of_found = - tmp_bbio->stripes[i].physical; - } + if (tmp_bbio->stripes[i].dev->devid != srcdev_devid) + continue; + + /* + * In case of DUP, in order to keep it simple, only add + * the mirror with the lowest physical address + */ + if (found && + physical_of_found <= tmp_bbio->stripes[i].physical) + continue; + + index_srcdev = i; + found = 1; + physical_of_found = tmp_bbio->stripes[i].physical; } - if (found) { - mirror_num = index_srcdev + 1; - patch_the_first_stripe_for_dev_replace = 1; - physical_to_patch_in_first_stripe = physical_of_found; - } else { + btrfs_put_bbio(tmp_bbio); + + if (!found) { WARN_ON(1); ret = -EIO; - btrfs_put_bbio(tmp_bbio); goto out; } - btrfs_put_bbio(tmp_bbio); + mirror_num = index_srcdev + 1; + patch_the_first_stripe_for_dev_replace = 1; + physical_to_patch_in_first_stripe = physical_of_found; } else if (mirror_num > map->num_stripes) { mirror_num = 0; } @@ -5806,7 +5805,7 @@ int btrfs_rmap_block(struct btrfs_mapping_tree *map_tree, free_extent_map(em); return -EIO; } - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; length = em->len; rmap_len = map->stripe_len; @@ -6069,7 +6068,8 @@ int btrfs_map_bio(struct btrfs_root *root, int rw, struct bio *bio, bbio->fs_info = root->fs_info; atomic_set(&bbio->stripes_pending, bbio->num_stripes); - if (bbio->raid_map) { + if ((bbio->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) && + ((rw & WRITE) || (mirror_num > 1))) { /* In this case, map_length has been set to the length of a single stripe; not the whole write */ if (rw & WRITE) { @@ -6210,6 +6210,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, struct extent_map *em; u64 logical; u64 length; + u64 stripe_len; u64 devid; u8 uuid[BTRFS_UUID_SIZE]; int num_stripes; @@ -6218,6 +6219,37 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, logical = key->offset; length = btrfs_chunk_length(leaf, chunk); + stripe_len = btrfs_chunk_stripe_len(leaf, chunk); + num_stripes = btrfs_chunk_num_stripes(leaf, chunk); + /* Validation check */ + if (!num_stripes) { + btrfs_err(root->fs_info, "invalid chunk num_stripes: %u", + num_stripes); + return -EIO; + } + if (!IS_ALIGNED(logical, root->sectorsize)) { + btrfs_err(root->fs_info, + "invalid chunk logical %llu", logical); + return -EIO; + } + if (!length || !IS_ALIGNED(length, root->sectorsize)) { + btrfs_err(root->fs_info, + "invalid chunk length %llu", length); + return -EIO; + } + if (!is_power_of_2(stripe_len)) { + btrfs_err(root->fs_info, "invalid chunk stripe length: %llu", + stripe_len); + return -EIO; + } + if (~(BTRFS_BLOCK_GROUP_TYPE_MASK | BTRFS_BLOCK_GROUP_PROFILE_MASK) & + btrfs_chunk_type(leaf, chunk)) { + btrfs_err(root->fs_info, "unrecognized chunk type: %llu", + ~(BTRFS_BLOCK_GROUP_TYPE_MASK | + 
BTRFS_BLOCK_GROUP_PROFILE_MASK) & + btrfs_chunk_type(leaf, chunk)); + return -EIO; + } read_lock(&map_tree->map_tree.lock); em = lookup_extent_mapping(&map_tree->map_tree, logical, 1); @@ -6234,7 +6266,6 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, em = alloc_extent_map(); if (!em) return -ENOMEM; - num_stripes = btrfs_chunk_num_stripes(leaf, chunk); map = kmalloc(map_lookup_size(num_stripes), GFP_NOFS); if (!map) { free_extent_map(em); @@ -6242,7 +6273,7 @@ static int read_one_chunk(struct btrfs_root *root, struct btrfs_key *key, } set_bit(EXTENT_FLAG_FS_MAPPING, &em->flags); - em->bdev = (struct block_device *)map; + em->map_lookup = map; em->start = logical; em->len = length; em->orig_start = 0; @@ -6944,7 +6975,7 @@ void btrfs_update_commit_device_bytes_used(struct btrfs_root *root, /* In order to kick the device replace finish process */ lock_chunks(root); list_for_each_entry(em, &transaction->pending_chunks, list) { - map = (struct map_lookup *)em->bdev; + map = em->map_lookup; for (i = 0; i < map->num_stripes; i++) { dev = map->stripes[i].dev; diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c index fd953c361a43..6c68d6356197 100644 --- a/fs/btrfs/xattr.c +++ b/fs/btrfs/xattr.c @@ -126,7 +126,7 @@ static int do_setxattr(struct btrfs_trans_handle *trans, * locks the inode's i_mutex before calling setxattr or removexattr. */ if (flags & XATTR_REPLACE) { - ASSERT(mutex_is_locked(&inode->i_mutex)); + ASSERT(inode_is_locked(inode)); di = btrfs_lookup_xattr(NULL, root, path, btrfs_ino(inode), name, name_len, 0); if (!di) diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index afa023dded5b..675a3332d72f 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -446,7 +446,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object) return 0; cachefiles_begin_secure(cache, &saved_cred); - mutex_lock(&d_inode(object->backer)->i_mutex); + inode_lock(d_inode(object->backer)); /* if there's an extension to a partial page at the end of the backing * file, we need to discard the partial page so that we pick up new @@ -465,7 +465,7 @@ static int cachefiles_attr_changed(struct fscache_object *_object) ret = notify_change(object->backer, &newattrs, NULL); truncate_failed: - mutex_unlock(&d_inode(object->backer)->i_mutex); + inode_unlock(d_inode(object->backer)); cachefiles_end_secure(cache, saved_cred); if (ret == -EIO) { diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index c4b893453e0e..1c2334c163dd 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -295,7 +295,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, cachefiles_mark_object_buried(cache, rep, why); } - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); if (ret == -EIO) cachefiles_io_error(cache, "Unlink failed"); @@ -306,7 +306,7 @@ static int cachefiles_bury_object(struct cachefiles_cache *cache, /* directories have to be moved to the graveyard */ _debug("move stale object to graveyard"); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); try_again: /* first step is to make up a grave dentry in the graveyard */ @@ -423,13 +423,13 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, dir = dget_parent(object->dentry); - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); if (test_bit(FSCACHE_OBJECT_KILLED_BY_CACHE, &object->fscache.flags)) { /* object allocation for the same key preemptively deleted this * object's 
file so that it could create its own file */ _debug("object preemptively buried"); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); ret = 0; } else { /* we need to check that our parent is _still_ our parent - it @@ -442,7 +442,7 @@ int cachefiles_delete_object(struct cachefiles_cache *cache, /* it got moved, presumably by cachefilesd culling it, * so it's no longer in the key path and we can ignore * it */ - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); ret = 0; } } @@ -501,7 +501,7 @@ lookup_again: /* search the current directory for the element name */ _debug("lookup '%s'", name); - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); start = jiffies; next = lookup_one_len(name, dir, nlen); @@ -585,7 +585,7 @@ lookup_again: /* process the next component */ if (key) { _debug("advance"); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(dir); dir = next; next = NULL; @@ -623,7 +623,7 @@ lookup_again: /* note that we're now using this object */ ret = cachefiles_mark_object_active(cache, object); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(dir); dir = NULL; @@ -705,7 +705,7 @@ lookup_error: cachefiles_io_error(cache, "Lookup failed"); next = NULL; error: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(next); error_out2: dput(dir); @@ -729,7 +729,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, _enter(",,%s", dirname); /* search the current directory for the element name */ - mutex_lock(&d_inode(dir)->i_mutex); + inode_lock(d_inode(dir)); start = jiffies; subdir = lookup_one_len(dirname, dir, strlen(dirname)); @@ -768,7 +768,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, d_backing_inode(subdir)->i_ino); } - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); /* we need to make sure the subdir is a directory */ ASSERT(d_backing_inode(subdir)); @@ -800,19 +800,19 @@ check_error: return ERR_PTR(ret); mkdir_error: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(subdir); pr_err("mkdir %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); lookup_error: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); ret = PTR_ERR(subdir); pr_err("Lookup %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); nomem_d_alloc: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); _leave(" = -ENOMEM"); return ERR_PTR(-ENOMEM); } @@ -837,7 +837,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, // dir, filename); /* look up the victim */ - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); start = jiffies; victim = lookup_one_len(filename, dir, strlen(filename)); @@ -852,7 +852,7 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, * at the netfs's request whilst the cull was in progress */ if (d_is_negative(victim)) { - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(victim); _leave(" = -ENOENT [absent]"); return ERR_PTR(-ENOENT); @@ -881,13 +881,13 @@ static struct dentry *cachefiles_check_active(struct cachefiles_cache *cache, object_in_use: read_unlock(&cache->active_lock); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(victim); //_leave(" = -EBUSY [in use]"); return ERR_PTR(-EBUSY); lookup_error: - 
mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); ret = PTR_ERR(victim); if (ret == -ENOENT) { /* file or dir now absent - probably retired by netfs */ @@ -947,7 +947,7 @@ int cachefiles_cull(struct cachefiles_cache *cache, struct dentry *dir, return 0; error_unlock: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); error: dput(victim); if (ret == -ENOENT) { @@ -982,7 +982,7 @@ int cachefiles_check_in_use(struct cachefiles_cache *cache, struct dentry *dir, if (IS_ERR(victim)) return PTR_ERR(victim); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(victim); //_leave(" = 0"); return 0; diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index b7d218a168fb..c22213789090 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -1108,7 +1108,7 @@ retry_locked: return 0; /* past end of file? */ - i_size = inode->i_size; /* caller holds i_mutex */ + i_size = i_size_read(inode); if (page_off >= i_size || (pos_in_page == 0 && (pos+len) >= i_size && @@ -1149,7 +1149,6 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, page = grab_cache_page_write_begin(mapping, index, 0); if (!page) return -ENOMEM; - *pagep = page; dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len); @@ -1184,8 +1183,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, zero_user_segment(page, from+copied, len); /* did file size increase? */ - /* (no need for i_size_read(); we caller holds i_mutex */ - if (pos+copied > inode->i_size) + if (pos+copied > i_size_read(inode)) check_cap = ceph_inode_set_size(inode, pos+copied); if (!PageUptodate(page)) @@ -1378,11 +1376,13 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = VM_FAULT_NOPAGE; if ((off > size) || - (page->mapping != inode->i_mapping)) + (page->mapping != inode->i_mapping)) { + unlock_page(page); goto out; + } ret = ceph_update_writeable_page(vma->vm_file, off, len, page); - if (ret == 0) { + if (ret >= 0) { /* success. we'll keep the page locked. 
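The mutex_lock(&inode->i_mutex) to inode_lock(inode) conversions running through these cachefiles and ceph hunks (and the cifs ones further down) all target the then-new VFS wrapper API. At introduction the wrappers were thin shims over i_mutex, roughly as sketched below; later kernels could swap the backing lock without touching any of these call sites, which is the point of the abstraction:

static inline void inode_lock(struct inode *inode)
{
	mutex_lock(&inode->i_mutex);
}

static inline void inode_unlock(struct inode *inode)
{
	mutex_unlock(&inode->i_mutex);
}

static inline void inode_lock_nested(struct inode *inode, unsigned subclass)
{
	mutex_lock_nested(&inode->i_mutex, subclass);
}

static inline int inode_is_locked(struct inode *inode)
{
	return mutex_is_locked(&inode->i_mutex);
}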
*/ set_page_dirty(page); ret = VM_FAULT_LOCKED; @@ -1393,8 +1393,6 @@ static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) ret = VM_FAULT_SIGBUS; } out: - if (ret != VM_FAULT_LOCKED) - unlock_page(page); if (ret == VM_FAULT_LOCKED || ci->i_inline_version != CEPH_INLINE_NONE) { int dirty; diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index a4766ded1ba7..a351480dbabc 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -106,7 +106,7 @@ static uint16_t ceph_fscache_inode_get_aux(const void *cookie_netfs_data, memset(&aux, 0, sizeof(aux)); aux.mtime = inode->i_mtime; - aux.size = inode->i_size; + aux.size = i_size_read(inode); memcpy(buffer, &aux, sizeof(aux)); @@ -117,9 +117,7 @@ static void ceph_fscache_inode_get_attr(const void *cookie_netfs_data, uint64_t *size) { const struct ceph_inode_info* ci = cookie_netfs_data; - const struct inode* inode = &ci->vfs_inode; - - *size = inode->i_size; + *size = i_size_read(&ci->vfs_inode); } static enum fscache_checkaux ceph_fscache_inode_check_aux( @@ -134,7 +132,7 @@ static enum fscache_checkaux ceph_fscache_inode_check_aux( memset(&aux, 0, sizeof(aux)); aux.mtime = inode->i_mtime; - aux.size = inode->i_size; + aux.size = i_size_read(inode); if (memcmp(data, &aux, sizeof(aux)) != 0) return FSCACHE_CHECKAUX_OBSOLETE; @@ -197,7 +195,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, return; /* Avoid multiple racing open requests */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (ci->fscache) goto done; @@ -207,7 +205,7 @@ void ceph_fscache_register_inode_cookie(struct ceph_fs_client* fsc, ci, true); fscache_check_consistency(ci->fscache); done: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index c69e1253b47b..cdbf8cf3d52c 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -2030,7 +2030,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (datasync) goto out; - mutex_lock(&inode->i_mutex); + inode_lock(inode); dirty = try_flush_caps(inode, &flush_tid); dout("fsync dirty caps are %s\n", ceph_cap_string(dirty)); @@ -2046,7 +2046,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = wait_event_interruptible(ci->i_cap_wq, caps_are_flushed(inode, flush_tid)); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out: dout("fsync %p%s result=%d\n", inode, datasync ? 
" datasync" : "", ret); return ret; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 9314b4ea2375..fd11fb231a2e 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -507,7 +507,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) loff_t old_offset = ceph_make_fpos(fi->frag, fi->next_offset); loff_t retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = -EINVAL; switch (whence) { case SEEK_CUR: @@ -542,7 +542,7 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) } } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return retval; } diff --git a/fs/ceph/export.c b/fs/ceph/export.c index fe02ae7f056a..3b3172357326 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -215,7 +215,7 @@ static int ceph_get_name(struct dentry *parent, char *name, if (IS_ERR(req)) return PTR_ERR(req); - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); req->r_inode = d_inode(child); ihold(d_inode(child)); @@ -224,7 +224,7 @@ static int ceph_get_name(struct dentry *parent, char *name, req->r_num_caps = 2; err = ceph_mdsc_do_request(mdsc, NULL, req); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); if (!err) { struct ceph_mds_reply_info_parsed *rinfo = &req->r_reply_info; diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 3c68e6aee2f0..86a9c383955e 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -397,8 +397,9 @@ int ceph_release(struct inode *inode, struct file *file) } enum { - CHECK_EOF = 1, - READ_INLINE = 2, + HAVE_RETRIED = 1, + CHECK_EOF = 2, + READ_INLINE = 3, }; /* @@ -411,17 +412,15 @@ enum { static int striped_read(struct inode *inode, u64 off, u64 len, struct page **pages, int num_pages, - int *checkeof, bool o_direct, - unsigned long buf_align) + int *checkeof) { struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); u64 pos, this_len, left; - int io_align, page_align; - int pages_left; - int read; + loff_t i_size; + int page_align, pages_left; + int read, ret; struct page **page_pos; - int ret; bool hit_stripe, was_short; /* @@ -432,13 +431,9 @@ static int striped_read(struct inode *inode, page_pos = pages; pages_left = num_pages; read = 0; - io_align = off & ~PAGE_MASK; more: - if (o_direct) - page_align = (pos - io_align + buf_align) & ~PAGE_MASK; - else - page_align = pos & ~PAGE_MASK; + page_align = pos & ~PAGE_MASK; this_len = left; ret = ceph_osdc_readpages(&fsc->client->osdc, ceph_vino(inode), &ci->i_layout, pos, &this_len, @@ -452,13 +447,12 @@ more: dout("striped_read %llu~%llu (read %u) got %d%s%s\n", pos, left, read, ret, hit_stripe ? " HITSTRIPE" : "", was_short ? " SHORT" : ""); + i_size = i_size_read(inode); if (ret >= 0) { int didpages; - if (was_short && (pos + ret < inode->i_size)) { - int zlen = min(this_len - ret, - inode->i_size - pos - ret); - int zoff = (o_direct ? buf_align : io_align) + - read + ret; + if (was_short && (pos + ret < i_size)) { + int zlen = min(this_len - ret, i_size - pos - ret); + int zoff = (off & ~PAGE_MASK) + read + ret; dout(" zero gap %llu to %llu\n", pos + ret, pos + ret + zlen); ceph_zero_page_vector_range(zoff, zlen, pages); @@ -473,14 +467,14 @@ more: pages_left -= didpages; /* hit stripe and need continue*/ - if (left && hit_stripe && pos < inode->i_size) + if (left && hit_stripe && pos < i_size) goto more; } if (read > 0) { ret = read; /* did we bounce off eof? 
*/ - if (pos + left > inode->i_size) + if (pos + left > i_size) *checkeof = CHECK_EOF; } @@ -521,54 +515,28 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, if (ret < 0) return ret; - if (iocb->ki_flags & IOCB_DIRECT) { - while (iov_iter_count(i)) { - size_t start; - ssize_t n; - - n = dio_get_pagev_size(i); - pages = dio_get_pages_alloc(i, n, &start, &num_pages); - if (IS_ERR(pages)) - return PTR_ERR(pages); - - ret = striped_read(inode, off, n, - pages, num_pages, checkeof, - 1, start); - - ceph_put_page_vector(pages, num_pages, true); - - if (ret <= 0) - break; - off += ret; - iov_iter_advance(i, ret); - if (ret < n) + num_pages = calc_pages_for(off, len); + pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); + if (IS_ERR(pages)) + return PTR_ERR(pages); + ret = striped_read(inode, off, len, pages, + num_pages, checkeof); + if (ret > 0) { + int l, k = 0; + size_t left = ret; + + while (left) { + size_t page_off = off & ~PAGE_MASK; + size_t copy = min_t(size_t, left, + PAGE_SIZE - page_off); + l = copy_page_to_iter(pages[k++], page_off, copy, i); + off += l; + left -= l; + if (l < copy) break; } - } else { - num_pages = calc_pages_for(off, len); - pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL); - if (IS_ERR(pages)) - return PTR_ERR(pages); - ret = striped_read(inode, off, len, pages, - num_pages, checkeof, 0, 0); - if (ret > 0) { - int l, k = 0; - size_t left = ret; - - while (left) { - size_t page_off = off & ~PAGE_MASK; - size_t copy = min_t(size_t, - PAGE_SIZE - page_off, left); - l = copy_page_to_iter(pages[k++], page_off, - copy, i); - off += l; - left -= l; - if (l < copy) - break; - } - } - ceph_release_page_vector(pages, num_pages); } + ceph_release_page_vector(pages, num_pages); if (off > iocb->ki_pos) { ret = off - iocb->ki_pos; @@ -579,6 +547,193 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *i, return ret; } +struct ceph_aio_request { + struct kiocb *iocb; + size_t total_len; + int write; + int error; + struct list_head osd_reqs; + unsigned num_reqs; + atomic_t pending_reqs; + struct timespec mtime; + struct ceph_cap_flush *prealloc_cf; +}; + +struct ceph_aio_work { + struct work_struct work; + struct ceph_osd_request *req; +}; + +static void ceph_aio_retry_work(struct work_struct *work); + +static void ceph_aio_complete(struct inode *inode, + struct ceph_aio_request *aio_req) +{ + struct ceph_inode_info *ci = ceph_inode(inode); + int ret; + + if (!atomic_dec_and_test(&aio_req->pending_reqs)) + return; + + ret = aio_req->error; + if (!ret) + ret = aio_req->total_len; + + dout("ceph_aio_complete %p rc %d\n", inode, ret); + + if (ret >= 0 && aio_req->write) { + int dirty; + + loff_t endoff = aio_req->iocb->ki_pos + aio_req->total_len; + if (endoff > i_size_read(inode)) { + if (ceph_inode_set_size(inode, endoff)) + ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL); + } + + spin_lock(&ci->i_ceph_lock); + ci->i_inline_version = CEPH_INLINE_NONE; + dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, + &aio_req->prealloc_cf); + spin_unlock(&ci->i_ceph_lock); + if (dirty) + __mark_inode_dirty(inode, dirty); + + } + + ceph_put_cap_refs(ci, (aio_req->write ? 
CEPH_CAP_FILE_WR : + CEPH_CAP_FILE_RD)); + + aio_req->iocb->ki_complete(aio_req->iocb, ret, 0); + + ceph_free_cap_flush(aio_req->prealloc_cf); + kfree(aio_req); +} + +static void ceph_aio_complete_req(struct ceph_osd_request *req, + struct ceph_msg *msg) +{ + int rc = req->r_result; + struct inode *inode = req->r_inode; + struct ceph_aio_request *aio_req = req->r_priv; + struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); + int num_pages = calc_pages_for((u64)osd_data->alignment, + osd_data->length); + + dout("ceph_aio_complete_req %p rc %d bytes %llu\n", + inode, rc, osd_data->length); + + if (rc == -EOLDSNAPC) { + struct ceph_aio_work *aio_work; + BUG_ON(!aio_req->write); + + aio_work = kmalloc(sizeof(*aio_work), GFP_NOFS); + if (aio_work) { + INIT_WORK(&aio_work->work, ceph_aio_retry_work); + aio_work->req = req; + queue_work(ceph_inode_to_client(inode)->wb_wq, + &aio_work->work); + return; + } + rc = -ENOMEM; + } else if (!aio_req->write) { + if (rc == -ENOENT) + rc = 0; + if (rc >= 0 && osd_data->length > rc) { + int zoff = osd_data->alignment + rc; + int zlen = osd_data->length - rc; + /* + * If read is satisfied by single OSD request, + * it can pass EOF. Otherwise read is within + * i_size. + */ + if (aio_req->num_reqs == 1) { + loff_t i_size = i_size_read(inode); + loff_t endoff = aio_req->iocb->ki_pos + rc; + if (endoff < i_size) + zlen = min_t(size_t, zlen, + i_size - endoff); + aio_req->total_len = rc + zlen; + } + + if (zlen > 0) + ceph_zero_page_vector_range(zoff, zlen, + osd_data->pages); + } + } + + ceph_put_page_vector(osd_data->pages, num_pages, false); + ceph_osdc_put_request(req); + + if (rc < 0) + cmpxchg(&aio_req->error, 0, rc); + + ceph_aio_complete(inode, aio_req); + return; +} + +static void ceph_aio_retry_work(struct work_struct *work) +{ + struct ceph_aio_work *aio_work = + container_of(work, struct ceph_aio_work, work); + struct ceph_osd_request *orig_req = aio_work->req; + struct ceph_aio_request *aio_req = orig_req->r_priv; + struct inode *inode = orig_req->r_inode; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_snap_context *snapc; + struct ceph_osd_request *req; + int ret; + + spin_lock(&ci->i_ceph_lock); + if (__ceph_have_pending_cap_snap(ci)) { + struct ceph_cap_snap *capsnap = + list_last_entry(&ci->i_cap_snaps, + struct ceph_cap_snap, + ci_item); + snapc = ceph_get_snap_context(capsnap->context); + } else { + BUG_ON(!ci->i_head_snapc); + snapc = ceph_get_snap_context(ci->i_head_snapc); + } + spin_unlock(&ci->i_ceph_lock); + + req = ceph_osdc_alloc_request(orig_req->r_osdc, snapc, 2, + false, GFP_NOFS); + if (IS_ERR(req)) { + ret = PTR_ERR(req); + req = orig_req; + goto out; + } + + req->r_flags = CEPH_OSD_FLAG_ORDERSNAP | + CEPH_OSD_FLAG_ONDISK | + CEPH_OSD_FLAG_WRITE; + req->r_base_oloc = orig_req->r_base_oloc; + req->r_base_oid = orig_req->r_base_oid; + + req->r_ops[0] = orig_req->r_ops[0]; + osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); + + ceph_osdc_build_request(req, req->r_ops[0].extent.offset, + snapc, CEPH_NOSNAP, &aio_req->mtime); + + ceph_put_snap_context(snapc); + ceph_osdc_put_request(orig_req); + + req->r_callback = ceph_aio_complete_req; + req->r_inode = inode; + req->r_priv = aio_req; + + ret = ceph_osdc_start_request(req->r_osdc, req, false); +out: + if (ret < 0) { + BUG_ON(ret == -EOLDSNAPC); + req->r_result = ret; + ceph_aio_complete_req(req, NULL); + } + + kfree(aio_work); +} + /* * Write commit request unsafe callback, called to tell us when a * request is unsafe (that is, in flight--has been 
handed to the @@ -612,16 +767,10 @@ static void ceph_sync_write_unsafe(struct ceph_osd_request *req, bool unsafe) } -/* - * Synchronous write, straight from __user pointer or user pages. - * - * If write spans object boundary, just do multiple writes. (For a - * correct atomic write, we should e.g. take write locks on all - * objects, rollback on failure, etc.) - */ static ssize_t -ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, - struct ceph_snap_context *snapc) +ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, + struct ceph_snap_context *snapc, + struct ceph_cap_flush **pcf) { struct file *file = iocb->ki_filp; struct inode *inode = file_inode(file); @@ -630,44 +779,52 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, struct ceph_vino vino; struct ceph_osd_request *req; struct page **pages; - int num_pages; - int written = 0; + struct ceph_aio_request *aio_req = NULL; + int num_pages = 0; int flags; - int check_caps = 0; int ret; struct timespec mtime = CURRENT_TIME; - size_t count = iov_iter_count(from); + size_t count = iov_iter_count(iter); + loff_t pos = iocb->ki_pos; + bool write = iov_iter_rw(iter) == WRITE; - if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) + if (write && ceph_snap(file_inode(file)) != CEPH_NOSNAP) return -EROFS; - dout("sync_direct_write on file %p %lld~%u\n", file, pos, - (unsigned)count); + dout("sync_direct_read_write (%s) on file %p %lld~%u\n", + (write ? "write" : "read"), file, pos, (unsigned)count); ret = filemap_write_and_wait_range(inode->i_mapping, pos, pos + count); if (ret < 0) return ret; - ret = invalidate_inode_pages2_range(inode->i_mapping, - pos >> PAGE_CACHE_SHIFT, - (pos + count) >> PAGE_CACHE_SHIFT); - if (ret < 0) - dout("invalidate_inode_pages2_range returned %d\n", ret); + if (write) { + ret = invalidate_inode_pages2_range(inode->i_mapping, + pos >> PAGE_CACHE_SHIFT, + (pos + count) >> PAGE_CACHE_SHIFT); + if (ret < 0) + dout("invalidate_inode_pages2_range returned %d\n", ret); - flags = CEPH_OSD_FLAG_ORDERSNAP | - CEPH_OSD_FLAG_ONDISK | - CEPH_OSD_FLAG_WRITE; + flags = CEPH_OSD_FLAG_ORDERSNAP | + CEPH_OSD_FLAG_ONDISK | + CEPH_OSD_FLAG_WRITE; + } else { + flags = CEPH_OSD_FLAG_READ; + } - while (iov_iter_count(from) > 0) { - u64 len = dio_get_pagev_size(from); - size_t start; - ssize_t n; + while (iov_iter_count(iter) > 0) { + u64 size = dio_get_pagev_size(iter); + size_t start = 0; + ssize_t len; vino = ceph_vino(inode); req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, - vino, pos, &len, 0, - 2,/*include a 'startsync' command*/ - CEPH_OSD_OP_WRITE, flags, snapc, + vino, pos, &size, 0, + /*include a 'startsync' command*/ + write ? 2 : 1, + write ? CEPH_OSD_OP_WRITE : + CEPH_OSD_OP_READ, + flags, snapc, ci->i_truncate_seq, ci->i_truncate_size, false); @@ -676,10 +833,8 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, break; } - osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); - - n = len; - pages = dio_get_pages_alloc(from, len, &start, &num_pages); + len = size; + pages = dio_get_pages_alloc(iter, len, &start, &num_pages); if (IS_ERR(pages)) { ceph_osdc_put_request(req); ret = PTR_ERR(pages); @@ -687,47 +842,128 @@ ceph_sync_direct_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, } /* - * throw out any page cache pages in this range. this - * may block. + * To simplify error handling, allow AIO when IO within i_size + * or IO can be satisfied by single OSD request. 
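The condition this comment describes is tested a few lines further down; restated as a standalone predicate with simplified types (the helper name is illustrative):

/* AIO is attempted only for the first OSD request of a genuinely
 * asynchronous kiocb, and only if one request covers the whole iter
 * or the whole range lies below i_size; anything else falls back to
 * the synchronous path */
static int aio_eligible(long long pos, long long ki_pos, int is_sync,
			long long len, long long count, long long i_size)
{
	return pos == ki_pos && !is_sync &&
	       (len == count || pos + count <= i_size);
}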
*/ - truncate_inode_pages_range(inode->i_mapping, pos, - (pos+n) | (PAGE_CACHE_SIZE-1)); - osd_req_op_extent_osd_data_pages(req, 0, pages, n, start, - false, false); + if (pos == iocb->ki_pos && !is_sync_kiocb(iocb) && + (len == count || pos + count <= i_size_read(inode))) { + aio_req = kzalloc(sizeof(*aio_req), GFP_KERNEL); + if (aio_req) { + aio_req->iocb = iocb; + aio_req->write = write; + INIT_LIST_HEAD(&aio_req->osd_reqs); + if (write) { + aio_req->mtime = mtime; + swap(aio_req->prealloc_cf, *pcf); + } + } + /* ignore error */ + } + + if (write) { + /* + * throw out any page cache pages in this range. this + * may block. + */ + truncate_inode_pages_range(inode->i_mapping, pos, + (pos+len) | (PAGE_CACHE_SIZE - 1)); + + osd_req_op_init(req, 1, CEPH_OSD_OP_STARTSYNC, 0); + } + + + osd_req_op_extent_osd_data_pages(req, 0, pages, len, start, + false, false); - /* BUG_ON(vino.snap != CEPH_NOSNAP); */ ceph_osdc_build_request(req, pos, snapc, vino.snap, &mtime); - ret = ceph_osdc_start_request(&fsc->client->osdc, req, false); + if (aio_req) { + aio_req->total_len += len; + aio_req->num_reqs++; + atomic_inc(&aio_req->pending_reqs); + + req->r_callback = ceph_aio_complete_req; + req->r_inode = inode; + req->r_priv = aio_req; + list_add_tail(&req->r_unsafe_item, &aio_req->osd_reqs); + + pos += len; + iov_iter_advance(iter, len); + continue; + } + + ret = ceph_osdc_start_request(req->r_osdc, req, false); if (!ret) ret = ceph_osdc_wait_request(&fsc->client->osdc, req); + size = i_size_read(inode); + if (!write) { + if (ret == -ENOENT) + ret = 0; + if (ret >= 0 && ret < len && pos + ret < size) { + int zlen = min_t(size_t, len - ret, + size - pos - ret); + ceph_zero_page_vector_range(start + ret, zlen, + pages); + ret += zlen; + } + if (ret >= 0) + len = ret; + } + ceph_put_page_vector(pages, num_pages, false); ceph_osdc_put_request(req); - if (ret) + if (ret < 0) break; - pos += n; - written += n; - iov_iter_advance(from, n); - if (pos > i_size_read(inode)) { - check_caps = ceph_inode_set_size(inode, pos); - if (check_caps) + pos += len; + iov_iter_advance(iter, len); + + if (!write && pos >= size) + break; + + if (write && pos > size) { + if (ceph_inode_set_size(inode, pos)) ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL); } } - if (ret != -EOLDSNAPC && written > 0) { + if (aio_req) { + if (aio_req->num_reqs == 0) { + kfree(aio_req); + return ret; + } + + ceph_get_cap_refs(ci, write ? CEPH_CAP_FILE_WR : + CEPH_CAP_FILE_RD); + + while (!list_empty(&aio_req->osd_reqs)) { + req = list_first_entry(&aio_req->osd_reqs, + struct ceph_osd_request, + r_unsafe_item); + list_del_init(&req->r_unsafe_item); + if (ret >= 0) + ret = ceph_osdc_start_request(req->r_osdc, + req, false); + if (ret < 0) { + BUG_ON(ret == -EOLDSNAPC); + req->r_result = ret; + ceph_aio_complete_req(req, NULL); + } + } + return -EIOCBQUEUED; + } + + if (ret != -EOLDSNAPC && pos > iocb->ki_pos) { + ret = pos - iocb->ki_pos; iocb->ki_pos = pos; - ret = written; } return ret; } - /* * Synchronous write, straight from __user pointer or user pages. * @@ -897,8 +1133,14 @@ again: ceph_cap_string(got)); if (ci->i_inline_version == CEPH_INLINE_NONE) { - /* hmm, this isn't really async... 
*/ - ret = ceph_sync_read(iocb, to, &retry_op); + if (!retry_op && (iocb->ki_flags & IOCB_DIRECT)) { + ret = ceph_direct_read_write(iocb, to, + NULL, NULL); + if (ret >= 0 && ret < len) + retry_op = CHECK_EOF; + } else { + ret = ceph_sync_read(iocb, to, &retry_op); + } } else { retry_op = READ_INLINE; } @@ -916,7 +1158,7 @@ again: pinned_page = NULL; } ceph_put_cap_refs(ci, got); - if (retry_op && ret >= 0) { + if (retry_op > HAVE_RETRIED && ret >= 0) { int statret; struct page *page = NULL; loff_t i_size; @@ -968,12 +1210,11 @@ again: if (retry_op == CHECK_EOF && iocb->ki_pos < i_size && ret < len) { dout("sync_read hit hole, ppos %lld < size %lld" - ", reading more\n", iocb->ki_pos, - inode->i_size); + ", reading more\n", iocb->ki_pos, i_size); read += ret; len -= ret; - retry_op = 0; + retry_op = HAVE_RETRIED; goto again; } } @@ -1014,7 +1255,7 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from) if (!prealloc_cf) return -ENOMEM; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); @@ -1052,7 +1293,7 @@ retry_snap: } dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n", - inode, ceph_vinop(inode), pos, count, inode->i_size); + inode, ceph_vinop(inode), pos, count, i_size_read(inode)); if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_BUFFER | CEPH_CAP_FILE_LAZYIO; else @@ -1070,7 +1311,7 @@ retry_snap: (iocb->ki_flags & IOCB_DIRECT) || (fi->flags & CEPH_F_SYNC)) { struct ceph_snap_context *snapc; struct iov_iter data; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); spin_lock(&ci->i_ceph_lock); if (__ceph_have_pending_cap_snap(ci)) { @@ -1088,8 +1329,8 @@ retry_snap: /* we might need to revert back to that point */ data = *from; if (iocb->ki_flags & IOCB_DIRECT) - written = ceph_sync_direct_write(iocb, &data, pos, - snapc); + written = ceph_direct_read_write(iocb, &data, snapc, + &prealloc_cf); else written = ceph_sync_write(iocb, &data, pos, snapc); if (written == -EOLDSNAPC) { @@ -1097,14 +1338,14 @@ retry_snap: "got EOLDSNAPC, retrying\n", inode, ceph_vinop(inode), pos, (unsigned)count); - mutex_lock(&inode->i_mutex); + inode_lock(inode); goto retry_snap; } if (written > 0) iov_iter_advance(from, written); ceph_put_snap_context(snapc); } else { - loff_t old_size = inode->i_size; + loff_t old_size = i_size_read(inode); /* * No need to acquire the i_truncate_mutex. 
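The parallel inode->i_size to i_size_read()/i_size_write() substitutions in these ceph hunks are not cosmetic: on 32-bit SMP a 64-bit loff_t load can tear, so i_size_read() retries under a sequence counter. A simplified single-writer model of that retry loop (a sketch only, not the fs.h implementation):

struct isize_model {
	unsigned int seq;	/* odd while a writer is mid-update */
	long long size;
};

static long long model_i_size_read(const volatile struct isize_model *s)
{
	unsigned int seq;
	long long v;

	do {
		seq = s->seq;	/* like read_seqcount_begin() */
		v = s->size;
	} while ((seq & 1) || seq != s->seq);	/* retry torn reads */
	return v;
}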
Because * the MDS revokes Fwb caps before sending truncate @@ -1115,9 +1356,9 @@ retry_snap: written = generic_perform_write(file, from, pos); if (likely(written >= 0)) iocb->ki_pos = pos + written; - if (inode->i_size > old_size) + if (i_size_read(inode) > old_size) ceph_fscache_update_objectsize(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } if (written >= 0) { @@ -1147,7 +1388,7 @@ retry_snap: goto out_unlocked; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out_unlocked: ceph_free_cap_flush(prealloc_cf); current->backing_dev_info = NULL; @@ -1160,9 +1401,10 @@ out_unlocked: static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; + loff_t i_size; int ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (whence == SEEK_END || whence == SEEK_DATA || whence == SEEK_HOLE) { ret = ceph_do_getattr(inode, CEPH_STAT_CAP_SIZE, false); @@ -1172,9 +1414,10 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) } } + i_size = i_size_read(inode); switch (whence) { case SEEK_END: - offset += inode->i_size; + offset += i_size; break; case SEEK_CUR: /* @@ -1190,24 +1433,24 @@ static loff_t ceph_llseek(struct file *file, loff_t offset, int whence) offset += file->f_pos; break; case SEEK_DATA: - if (offset >= inode->i_size) { + if (offset >= i_size) { ret = -ENXIO; goto out; } break; case SEEK_HOLE: - if (offset >= inode->i_size) { + if (offset >= i_size) { ret = -ENXIO; goto out; } - offset = inode->i_size; + offset = i_size; break; } offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return offset; } @@ -1363,7 +1606,7 @@ static long ceph_fallocate(struct file *file, int mode, if (!prealloc_cf) return -ENOMEM; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (ceph_snap(inode) != CEPH_NOSNAP) { ret = -EROFS; @@ -1418,7 +1661,7 @@ static long ceph_fallocate(struct file *file, int mode, ceph_put_cap_refs(ci, got); unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ceph_free_cap_flush(prealloc_cf); return ret; } diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index da55eb8bcffa..fb4ba2e4e2a5 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -548,7 +548,7 @@ int ceph_fill_file_size(struct inode *inode, int issued, if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 || (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) { dout("size %lld -> %llu\n", inode->i_size, size); - inode->i_size = size; + i_size_write(inode, size); inode->i_blocks = (size + (1<<9) - 1) >> 9; ci->i_reported_size = size; if (truncate_seq != ci->i_truncate_seq) { @@ -808,7 +808,7 @@ static int fill_inode(struct inode *inode, struct page *locked_page, spin_unlock(&ci->i_ceph_lock); err = -EINVAL; - if (WARN_ON(symlen != inode->i_size)) + if (WARN_ON(symlen != i_size_read(inode))) goto out; err = -ENOMEM; @@ -1549,7 +1549,7 @@ int ceph_inode_set_size(struct inode *inode, loff_t size) spin_lock(&ci->i_ceph_lock); dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); - inode->i_size = size; + i_size_write(inode, size); inode->i_blocks = (size + (1 << 9) - 1) >> 9; /* tell the MDS if we are approaching max_size */ @@ -1911,7 +1911,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) inode->i_size, attr->ia_size); if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size > inode->i_size) { - inode->i_size = attr->ia_size; + i_size_write(inode, attr->ia_size); inode->i_blocks = (attr->ia_size + (1 << 9) - 
1) >> 9; inode->i_ctime = attr->ia_ctime; diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 7febcf2475c5..50b268483302 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -50,7 +50,7 @@ void cifs_vfs_err(const char *fmt, ...) vaf.fmt = fmt; vaf.va = &args; - pr_err("CIFS VFS: %pV", &vaf); + pr_err_ratelimited("CIFS VFS: %pV", &vaf); va_end(args); } diff --git a/fs/cifs/cifs_debug.h b/fs/cifs/cifs_debug.h index f40fbaca1b2a..66cf0f9fff89 100644 --- a/fs/cifs/cifs_debug.h +++ b/fs/cifs/cifs_debug.h @@ -51,14 +51,13 @@ __printf(1, 2) void cifs_vfs_err(const char *fmt, ...); /* information message: e.g., configuration, major event */ #define cifs_dbg(type, fmt, ...) \ do { \ - if (type == FYI) { \ - if (cifsFYI & CIFS_INFO) { \ - pr_debug("%s: " fmt, __FILE__, ##__VA_ARGS__); \ - } \ + if (type == FYI && cifsFYI & CIFS_INFO) { \ + pr_debug_ratelimited("%s: " \ + fmt, __FILE__, ##__VA_ARGS__); \ } else if (type == VFS) { \ cifs_vfs_err(fmt, ##__VA_ARGS__); \ } else if (type == NOISY && type != 0) { \ - pr_debug(fmt, ##__VA_ARGS__); \ + pr_debug_ratelimited(fmt, ##__VA_ARGS__); \ } \ } while (0) diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index c4c1169814b2..c48ca13673e3 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -507,6 +507,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",rsize=%u", cifs_sb->rsize); seq_printf(s, ",wsize=%u", cifs_sb->wsize); + seq_printf(s, ",echo_interval=%lu", + tcon->ses->server->echo_interval / HZ); /* convert actimeo and display it in seconds */ seq_printf(s, ",actimeo=%lu", cifs_sb->actimeo / HZ); @@ -640,9 +642,9 @@ cifs_get_root(struct smb_vol *vol, struct super_block *sb) while (*s && *s != sep) s++; - mutex_lock(&dir->i_mutex); + inode_lock(dir); child = lookup_one_len(p, dentry, s - p); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); dput(dentry); dentry = child; } while (!IS_ERR(dentry)); @@ -752,6 +754,9 @@ cifs_loose_read_iter(struct kiocb *iocb, struct iov_iter *iter) ssize_t rc; struct inode *inode = file_inode(iocb->ki_filp); + if (iocb->ki_filp->f_flags & O_DIRECT) + return cifs_user_readv(iocb, iter); + rc = cifs_revalidate_mapping(inode); if (rc) return rc; @@ -766,6 +771,18 @@ static ssize_t cifs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ssize_t written; int rc; + if (iocb->ki_filp->f_flags & O_DIRECT) { + written = cifs_user_writev(iocb, from); + if (written > 0 && CIFS_CACHE_READ(cinode)) { + cifs_zap_mapping(inode); + cifs_dbg(FYI, + "Set no oplock for inode=%p after a write operation\n", + inode); + cinode->oplock = 0; + } + return written; + } + written = cifs_get_writer(cinode); if (written) return written; diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index 2b510c537a0d..a25b2513f146 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -70,8 +70,10 @@ #define SERVER_NAME_LENGTH 40 #define SERVER_NAME_LEN_WITH_NULL (SERVER_NAME_LENGTH + 1) -/* SMB echo "timeout" -- FIXME: tunable? 
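That FIXME is what the next hunks resolve: the echo period becomes a per-mount echo_interval option, bounded to 1..600 seconds with a 60 second default. The clamp applied when the TCP session is created, condensed into one helper (a sketch; the real code inlines this in cifs_get_tcp_session):

#define SMB_ECHO_INTERVAL_MIN		1
#define SMB_ECHO_INTERVAL_MAX		600
#define SMB_ECHO_INTERVAL_DEFAULT	60

static unsigned long echo_interval_jiffies(unsigned int secs, unsigned long hz)
{
	/* out-of-range values, including an unset 0, fall back to the
	 * 60 second default */
	if (secs >= SMB_ECHO_INTERVAL_MIN && secs <= SMB_ECHO_INTERVAL_MAX)
		return secs * hz;
	return SMB_ECHO_INTERVAL_DEFAULT * hz;
}

Note that match_server() below also compares the interval, so mounts requesting different echo_interval values will not share a TCP session.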
*/ -#define SMB_ECHO_INTERVAL (60 * HZ) +/* echo interval in seconds */ +#define SMB_ECHO_INTERVAL_MIN 1 +#define SMB_ECHO_INTERVAL_MAX 600 +#define SMB_ECHO_INTERVAL_DEFAULT 60 #include "cifspdu.h" @@ -225,7 +227,7 @@ struct smb_version_operations { void (*print_stats)(struct seq_file *m, struct cifs_tcon *); void (*dump_share_caps)(struct seq_file *, struct cifs_tcon *); /* verify the message */ - int (*check_message)(char *, unsigned int); + int (*check_message)(char *, unsigned int, struct TCP_Server_Info *); bool (*is_oplock_break)(char *, struct TCP_Server_Info *); void (*downgrade_oplock)(struct TCP_Server_Info *, struct cifsInodeInfo *, bool); @@ -507,6 +509,7 @@ struct smb_vol { struct sockaddr_storage dstaddr; /* destination address */ struct sockaddr_storage srcaddr; /* allow binding to a local IP */ struct nls_table *local_nls; + unsigned int echo_interval; /* echo interval in secs */ }; #define CIFS_MOUNT_MASK (CIFS_MOUNT_NO_PERM | CIFS_MOUNT_SET_UID | \ @@ -627,7 +630,9 @@ struct TCP_Server_Info { #ifdef CONFIG_CIFS_SMB2 unsigned int max_read; unsigned int max_write; + __u8 preauth_hash[512]; #endif /* CONFIG_CIFS_SMB2 */ + unsigned long echo_interval; }; static inline unsigned int @@ -809,7 +814,10 @@ struct cifs_ses { bool need_reconnect:1; /* connection reset, uid now invalid */ #ifdef CONFIG_CIFS_SMB2 __u16 session_flags; - char smb3signingkey[SMB3_SIGN_KEY_SIZE]; /* for signing smb3 packets */ + __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE]; + __u8 smb3encryptionkey[SMB3_SIGN_KEY_SIZE]; + __u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE]; + __u8 preauth_hash[512]; #endif /* CONFIG_CIFS_SMB2 */ }; diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index c63fd1dde25b..eed7ff50faf0 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -102,7 +102,7 @@ extern int SendReceiveBlockingLock(const unsigned int xid, struct smb_hdr *out_buf, int *bytes_returned); extern int cifs_reconnect(struct TCP_Server_Info *server); -extern int checkSMB(char *buf, unsigned int length); +extern int checkSMB(char *buf, unsigned int len, struct TCP_Server_Info *srvr); extern bool is_valid_oplock_break(char *, struct TCP_Server_Info *); extern bool backup_cred(struct cifs_sb_info *); extern bool is_size_safe_to_change(struct cifsInodeInfo *, __u64 eof); @@ -439,7 +439,8 @@ extern int setup_ntlm_response(struct cifs_ses *, const struct nls_table *); extern int setup_ntlmv2_rsp(struct cifs_ses *, const struct nls_table *); extern void cifs_crypto_shash_release(struct TCP_Server_Info *); extern int calc_seckey(struct cifs_ses *); -extern int generate_smb3signingkey(struct cifs_ses *); +extern int generate_smb30signingkey(struct cifs_ses *); +extern int generate_smb311signingkey(struct cifs_ses *); #ifdef CONFIG_CIFS_WEAK_PW_HASH extern int calc_lanman_hash(const char *password, const char *cryptkey, diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index ecb0803bdb0e..4fbd92d2e113 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -95,6 +95,7 @@ enum { Opt_cruid, Opt_gid, Opt_file_mode, Opt_dirmode, Opt_port, Opt_rsize, Opt_wsize, Opt_actimeo, + Opt_echo_interval, /* Mount options which take string value */ Opt_user, Opt_pass, Opt_ip, @@ -188,6 +189,7 @@ static const match_table_t cifs_mount_option_tokens = { { Opt_rsize, "rsize=%s" }, { Opt_wsize, "wsize=%s" }, { Opt_actimeo, "actimeo=%s" }, + { Opt_echo_interval, "echo_interval=%s" }, { Opt_blank_user, "user=" }, { Opt_blank_user, "username=" }, @@ -368,7 +370,6 @@ cifs_reconnect(struct TCP_Server_Info *server) server->session_key.response = 
NULL; server->session_key.len = 0; server->lstrp = jiffies; - mutex_unlock(&server->srv_mutex); /* mark submitted MIDs for retry and issue callback */ INIT_LIST_HEAD(&retry_list); @@ -381,6 +382,7 @@ cifs_reconnect(struct TCP_Server_Info *server) list_move(&mid_entry->qhead, &retry_list); } spin_unlock(&GlobalMid_Lock); + mutex_unlock(&server->srv_mutex); cifs_dbg(FYI, "%s: issuing mid callbacks\n", __func__); list_for_each_safe(tmp, tmp2, &retry_list) { @@ -418,6 +420,7 @@ cifs_echo_request(struct work_struct *work) int rc; struct TCP_Server_Info *server = container_of(work, struct TCP_Server_Info, echo.work); + unsigned long echo_interval = server->echo_interval; /* * We cannot send an echo if it is disabled or until the @@ -427,7 +430,7 @@ cifs_echo_request(struct work_struct *work) */ if (!server->ops->need_neg || server->ops->need_neg(server) || (server->ops->can_echo && !server->ops->can_echo(server)) || - time_before(jiffies, server->lstrp + SMB_ECHO_INTERVAL - HZ)) + time_before(jiffies, server->lstrp + echo_interval - HZ)) goto requeue_echo; rc = server->ops->echo ? server->ops->echo(server) : -ENOSYS; @@ -436,7 +439,7 @@ cifs_echo_request(struct work_struct *work) server->hostname); requeue_echo: - queue_delayed_work(cifsiod_wq, &server->echo, SMB_ECHO_INTERVAL); + queue_delayed_work(cifsiod_wq, &server->echo, echo_interval); } static bool @@ -487,9 +490,9 @@ server_unresponsive(struct TCP_Server_Info *server) * a response in >60s. */ if (server->tcpStatus == CifsGood && - time_after(jiffies, server->lstrp + 2 * SMB_ECHO_INTERVAL)) { - cifs_dbg(VFS, "Server %s has not responded in %d seconds. Reconnecting...\n", - server->hostname, (2 * SMB_ECHO_INTERVAL) / HZ); + time_after(jiffies, server->lstrp + 2 * server->echo_interval)) { + cifs_dbg(VFS, "Server %s has not responded in %lu seconds. Reconnecting...\n", + server->hostname, (2 * server->echo_interval) / HZ); cifs_reconnect(server); wake_up(&server->response_q); return true; @@ -828,7 +831,7 @@ standard_receive3(struct TCP_Server_Info *server, struct mid_q_entry *mid) * 48 bytes is enough to display the header and a little bit * into the payload for debugging purposes. */ - length = server->ops->check_message(buf, server->total_read); + length = server->ops->check_message(buf, server->total_read, server); if (length != 0) cifs_dump_mem("Bad SMB: ", buf, min_t(unsigned int, server->total_read, 48)); @@ -1624,6 +1627,14 @@ cifs_parse_mount_options(const char *mountdata, const char *devname, goto cifs_parse_mount_err; } break; + case Opt_echo_interval: + if (get_option_ul(args, &option)) { + cifs_dbg(VFS, "%s: Invalid echo interval value\n", + __func__); + goto cifs_parse_mount_err; + } + vol->echo_interval = option; + break; /* String Arguments */ @@ -2089,6 +2100,9 @@ static int match_server(struct TCP_Server_Info *server, struct smb_vol *vol) if (!match_security(server, vol)) return 0; + if (server->echo_interval != vol->echo_interval) + return 0; + return 1; } @@ -2208,6 +2222,12 @@ cifs_get_tcp_session(struct smb_vol *volume_info) tcp_ses->tcpStatus = CifsNew; ++tcp_ses->srv_count; + if (volume_info->echo_interval >= SMB_ECHO_INTERVAL_MIN && + volume_info->echo_interval <= SMB_ECHO_INTERVAL_MAX) + tcp_ses->echo_interval = volume_info->echo_interval * HZ; + else + tcp_ses->echo_interval = SMB_ECHO_INTERVAL_DEFAULT * HZ; + rc = ip_connect(tcp_ses); if (rc < 0) { cifs_dbg(VFS, "Error connecting to socket. 
Aborting operation.\n"); @@ -2237,7 +2257,7 @@ cifs_get_tcp_session(struct smb_vol *volume_info) cifs_fscache_get_client_cookie(tcp_ses); /* queue echo request delayed work */ - queue_delayed_work(cifsiod_wq, &tcp_ses->echo, SMB_ECHO_INTERVAL); + queue_delayed_work(cifsiod_wq, &tcp_ses->echo, tcp_ses->echo_interval); return tcp_ses; diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 0a2752b79e72..ff882aeaccc6 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -2267,7 +2267,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, rc = filemap_write_and_wait_range(inode->i_mapping, start, end); if (rc) return rc; - mutex_lock(&inode->i_mutex); + inode_lock(inode); xid = get_xid(); @@ -2292,7 +2292,7 @@ int cifs_strict_fsync(struct file *file, loff_t start, loff_t end, } free_xid(xid); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return rc; } @@ -2309,7 +2309,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) rc = filemap_write_and_wait_range(inode->i_mapping, start, end); if (rc) return rc; - mutex_lock(&inode->i_mutex); + inode_lock(inode); xid = get_xid(); @@ -2326,7 +2326,7 @@ int cifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) } free_xid(xid); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return rc; } @@ -2672,7 +2672,7 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) * with a brlock that prevents writing. */ down_read(&cinode->lock_sem); - mutex_lock(&inode->i_mutex); + inode_lock(inode); rc = generic_write_checks(iocb, from); if (rc <= 0) @@ -2685,7 +2685,7 @@ cifs_writev(struct kiocb *iocb, struct iov_iter *from) else rc = -EACCES; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (rc > 0) { ssize_t err = generic_write_sync(file, iocb->ki_pos - rc, rc); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index a329f5ba35aa..aeb26dbfa1bf 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -814,8 +814,21 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, } } else fattr.cf_uniqueid = iunique(sb, ROOT_I); - } else - fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid; + } else { + if ((cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM) && + validinum == false && server->ops->get_srv_inum) { + /* + * Pass a NULL tcon to ensure we don't make a round + * trip to the server. This only works for SMB2+. 
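Two related hardening changes land in cifs_get_inode_info here: with server inode numbers enabled, the unique id is re-queried from the server, and the already-have-inode path just below treats a mismatch as a stale inode. The mismatch test, reduced to a toy predicate (names illustrative):

#include <errno.h>

static int inode_still_valid(int server_inum_enabled,
			     unsigned long long cached_uniqueid,
			     unsigned long long fresh_uniqueid)
{
	/* a changed server-side unique id means the cached inode no
	 * longer describes the same on-server object */
	if (server_inum_enabled && cached_uniqueid != fresh_uniqueid)
		return -ESTALE;
	return 0;
}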
+ */ + tmprc = server->ops->get_srv_inum(xid, + NULL, cifs_sb, full_path, + &fattr.cf_uniqueid, data); + if (tmprc) + fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid; + } else + fattr.cf_uniqueid = CIFS_I(*inode)->uniqueid; + } /* query for SFU type info if supported and needed */ if (fattr.cf_cifsattrs & ATTR_SYSTEM && @@ -856,6 +869,13 @@ cifs_get_inode_info(struct inode **inode, const char *full_path, } else { /* we already have inode, update it */ + /* if uniqueid is different, return error */ + if (unlikely(cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SERVER_INUM && + CIFS_I(*inode)->uniqueid != fattr.cf_uniqueid)) { + rc = -ESTALE; + goto cgii_exit; + } + /* if filetype is different, return error */ if (unlikely(((*inode)->i_mode & S_IFMT) != (fattr.cf_mode & S_IFMT))) { diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 8442b8b8e0be..813fe13c2ae1 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -310,7 +310,7 @@ check_smb_hdr(struct smb_hdr *smb) } int -checkSMB(char *buf, unsigned int total_read) +checkSMB(char *buf, unsigned int total_read, struct TCP_Server_Info *server) { struct smb_hdr *smb = (struct smb_hdr *)buf; __u32 rfclen = be32_to_cpu(smb->smb_buf_length); diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 0557c45e9c33..b30a4a6d98a0 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -847,6 +847,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) * if buggy server returns . and .. late do we want to * check for that here? */ + *tmp_buf = 0; rc = cifs_filldir(current_entry, file, ctx, tmp_buf, max_len); if (rc) { diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index 1c5907019045..389fb9f8c84e 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -38,7 +38,7 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid) * Make sure that this really is an SMB, that it is a response, * and that the message ids match. 
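The ProtocolId rework below replaces the four-byte array with a single __le32: SMB2_PROTO_NUMBER is cpu_to_le32(0x424d53fe), which lays down exactly the old 0xFE 'S' 'M' 'B' sequence on the wire. A quick user-space check (assumes a little-endian host, as the comment notes):

#include <stdio.h>
#include <stdint.h>
#include <string.h>

int main(void)
{
	uint32_t proto = 0x424d53feU;	/* raw value behind SMB2_PROTO_NUMBER */
	unsigned char wire[4];

	memcpy(wire, &proto, sizeof(wire));	/* little-endian host assumed */
	printf("%02x '%c' '%c' '%c'\n", wire[0], wire[1], wire[2], wire[3]);
	/* prints: fe 'S' 'M' 'B' */
	return 0;
}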
*/ - if ((*(__le32 *)hdr->ProtocolId == SMB2_PROTO_NUMBER) && + if ((hdr->ProtocolId == SMB2_PROTO_NUMBER) && (mid == wire_mid)) { if (hdr->Flags & SMB2_FLAGS_SERVER_TO_REDIR) return 0; @@ -50,9 +50,9 @@ check_smb2_hdr(struct smb2_hdr *hdr, __u64 mid) cifs_dbg(VFS, "Received Request not response\n"); } } else { /* bad signature or mid */ - if (*(__le32 *)hdr->ProtocolId != SMB2_PROTO_NUMBER) + if (hdr->ProtocolId != SMB2_PROTO_NUMBER) cifs_dbg(VFS, "Bad protocol string signature header %x\n", - *(unsigned int *) hdr->ProtocolId); + le32_to_cpu(hdr->ProtocolId)); if (mid != wire_mid) cifs_dbg(VFS, "Mids do not match: %llu and %llu\n", mid, wire_mid); @@ -93,11 +93,11 @@ static const __le16 smb2_rsp_struct_sizes[NUMBER_OF_SMB2_COMMANDS] = { }; int -smb2_check_message(char *buf, unsigned int length) +smb2_check_message(char *buf, unsigned int length, struct TCP_Server_Info *srvr) { struct smb2_hdr *hdr = (struct smb2_hdr *)buf; struct smb2_pdu *pdu = (struct smb2_pdu *)hdr; - __u64 mid = le64_to_cpu(hdr->MessageId); + __u64 mid; __u32 len = get_rfc1002_length(buf); __u32 clc_len; /* calculated length */ int command; @@ -111,6 +111,30 @@ smb2_check_message(char *buf, unsigned int length) * ie Validate the wct via smb2_struct_sizes table above */ + if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) { + struct smb2_transform_hdr *thdr = + (struct smb2_transform_hdr *)buf; + struct cifs_ses *ses = NULL; + struct list_head *tmp; + + /* decrypt frame now that it is completely read in */ + spin_lock(&cifs_tcp_ses_lock); + list_for_each(tmp, &srvr->smb_ses_list) { + ses = list_entry(tmp, struct cifs_ses, smb_ses_list); + if (ses->Suid == thdr->SessionId) + break; + + ses = NULL; + } + spin_unlock(&cifs_tcp_ses_lock); + if (ses == NULL) { + cifs_dbg(VFS, "no decryption - session id not found\n"); + return 1; + } + } + + + mid = le64_to_cpu(hdr->MessageId); if (length < sizeof(struct smb2_pdu)) { if ((length >= sizeof(struct smb2_hdr)) && (hdr->Status != 0)) { pdu->StructureSize2 = 0; @@ -322,7 +346,7 @@ smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr) /* return pointer to beginning of data area, ie offset from SMB start */ if ((*off != 0) && (*len != 0)) - return (char *)(&hdr->ProtocolId[0]) + *off; + return (char *)(&hdr->ProtocolId) + *off; else return NULL; } diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index 53ccdde6ff18..3525ed756173 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -182,6 +182,11 @@ smb2_find_mid(struct TCP_Server_Info *server, char *buf) struct smb2_hdr *hdr = (struct smb2_hdr *)buf; __u64 wire_mid = le64_to_cpu(hdr->MessageId); + if (hdr->ProtocolId == SMB2_TRANSFORM_PROTO_NUM) { + cifs_dbg(VFS, "encrypted frame parsing not supported yet"); + return NULL; + } + spin_lock(&GlobalMid_Lock); list_for_each_entry(mid, &server->pending_mid_q, qhead) { if ((mid->mid == wire_mid) && @@ -1692,7 +1697,7 @@ struct smb_version_operations smb30_operations = { .get_lease_key = smb2_get_lease_key, .set_lease_key = smb2_set_lease_key, .new_lease_key = smb2_new_lease_key, - .generate_signingkey = generate_smb3signingkey, + .generate_signingkey = generate_smb30signingkey, .calc_signature = smb3_calc_signature, .set_integrity = smb3_set_integrity, .is_read_op = smb21_is_read_op, @@ -1779,7 +1784,7 @@ struct smb_version_operations smb311_operations = { .get_lease_key = smb2_get_lease_key, .set_lease_key = smb2_set_lease_key, .new_lease_key = smb2_new_lease_key, - .generate_signingkey = generate_smb3signingkey, + .generate_signingkey = generate_smb311signingkey, 
.calc_signature = smb3_calc_signature, .set_integrity = smb3_set_integrity, .is_read_op = smb21_is_read_op, @@ -1838,7 +1843,7 @@ struct smb_version_values smb21_values = { struct smb_version_values smb30_values = { .version_string = SMB30_VERSION_STRING, .protocol_id = SMB30_PROT_ID, - .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES, + .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION, .large_lock_type = 0, .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, @@ -1858,7 +1863,7 @@ struct smb_version_values smb30_values = { struct smb_version_values smb302_values = { .version_string = SMB302_VERSION_STRING, .protocol_id = SMB302_PROT_ID, - .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES, + .req_capabilities = SMB2_GLOBAL_CAP_DFS | SMB2_GLOBAL_CAP_LEASING | SMB2_GLOBAL_CAP_LARGE_MTU | SMB2_GLOBAL_CAP_PERSISTENT_HANDLES | SMB2_GLOBAL_CAP_ENCRYPTION, .large_lock_type = 0, .exclusive_lock_type = SMB2_LOCKFLAG_EXCLUSIVE_LOCK, .shared_lock_type = SMB2_LOCKFLAG_SHARED_LOCK, diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 767555518d40..10f8d5cf5681 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -97,10 +97,7 @@ smb2_hdr_assemble(struct smb2_hdr *hdr, __le16 smb2_cmd /* command */ , hdr->smb2_buf_length = cpu_to_be32(parmsize + sizeof(struct smb2_hdr) - 4 /* RFC 1001 length field itself not counted */); - hdr->ProtocolId[0] = 0xFE; - hdr->ProtocolId[1] = 'S'; - hdr->ProtocolId[2] = 'M'; - hdr->ProtocolId[3] = 'B'; + hdr->ProtocolId = SMB2_PROTO_NUMBER; hdr->StructureSize = cpu_to_le16(64); hdr->Command = smb2_cmd; hdr->CreditRequest = cpu_to_le16(2); /* BB make this dynamic */ @@ -1573,7 +1570,8 @@ SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, goto ioctl_exit; } - memcpy(*out_data, rsp->hdr.ProtocolId + le32_to_cpu(rsp->OutputOffset), + memcpy(*out_data, + (char *)&rsp->hdr.ProtocolId + le32_to_cpu(rsp->OutputOffset), *plen); ioctl_exit: free_rsp_buf(resp_buftype, rsp); @@ -2093,7 +2091,7 @@ SMB2_read(const unsigned int xid, struct cifs_io_parms *io_parms, } if (*buf) { - memcpy(*buf, (char *)rsp->hdr.ProtocolId + rsp->DataOffset, + memcpy(*buf, (char *)&rsp->hdr.ProtocolId + rsp->DataOffset, *nbytes); free_rsp_buf(resp_buftype, iov[0].iov_base); } else if (resp_buftype != CIFS_NO_BUFFER) { diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index 4af52780ec35..ff88d9feb01e 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -86,6 +86,7 @@ #define MAX_SMB2_HDR_SIZE 0x78 /* 4 len + 64 hdr + (2*24 wct) + 2 bct + 2 pad */ #define SMB2_PROTO_NUMBER cpu_to_le32(0x424d53fe) +#define SMB2_TRANSFORM_PROTO_NUM cpu_to_le32(0x424d53fd) /* * SMB2 Header Definition @@ -102,7 +103,7 @@ struct smb2_hdr { __be32 smb2_buf_length; /* big endian on wire */ /* length is only two or three bytes - with one or two byte type preceding it that MBZ */ - __u8 ProtocolId[4]; /* 0xFE 'S' 'M' 'B' */ + __le32 ProtocolId; /* 0xFE 'S' 'M' 'B' */ __le16 StructureSize; /* 64 */ __le16 CreditCharge; /* MBZ */ __le32 Status; /* Error from server */ @@ -128,11 +129,10 @@ struct smb2_transform_hdr { one or two byte type preceding it that MBZ */ __u8 ProtocolId[4]; /* 0xFD 'S' 'M' 'B' */ __u8 Signature[16]; - __u8 Nonce[11]; - __u8 Reserved[5]; + __u8 Nonce[16]; 
__le32 OriginalMessageSize; __u16 Reserved1; - __le16 EncryptionAlgorithm; + __le16 Flags; /* EncryptionAlgorithm */ __u64 SessionId; } __packed; diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index 79dc650c18b2..4f07dc93608d 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -34,7 +34,8 @@ struct smb_rqst; ***************************************************************** */ extern int map_smb2_to_linux_error(char *buf, bool log_err); -extern int smb2_check_message(char *buf, unsigned int length); +extern int smb2_check_message(char *buf, unsigned int length, + struct TCP_Server_Info *server); extern unsigned int smb2_calc_size(void *buf); extern char *smb2_get_data_area_len(int *off, int *len, struct smb2_hdr *hdr); extern __le16 *cifs_convert_path_to_utf16(const char *from, diff --git a/fs/cifs/smb2transport.c b/fs/cifs/smb2transport.c index d4c5b6f109a7..8732a43b1008 100644 --- a/fs/cifs/smb2transport.c +++ b/fs/cifs/smb2transport.c @@ -222,8 +222,8 @@ smb2_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) return rc; } -int -generate_smb3signingkey(struct cifs_ses *ses) +static int generate_key(struct cifs_ses *ses, struct kvec label, + struct kvec context, __u8 *key, unsigned int key_size) { unsigned char zero = 0x0; __u8 i[4] = {0, 0, 0, 1}; @@ -233,7 +233,7 @@ generate_smb3signingkey(struct cifs_ses *ses) unsigned char *hashptr = prfhash; memset(prfhash, 0x0, SMB2_HMACSHA256_SIZE); - memset(ses->smb3signingkey, 0x0, SMB3_SIGNKEY_SIZE); + memset(key, 0x0, key_size); rc = smb3_crypto_shash_allocate(ses->server); if (rc) { @@ -262,7 +262,7 @@ generate_smb3signingkey(struct cifs_ses *ses) } rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash, - "SMB2AESCMAC", 12); + label.iov_base, label.iov_len); if (rc) { cifs_dbg(VFS, "%s: Could not update with label\n", __func__); goto smb3signkey_ret; @@ -276,7 +276,7 @@ generate_smb3signingkey(struct cifs_ses *ses) } rc = crypto_shash_update(&ses->server->secmech.sdeschmacsha256->shash, - "SmbSign", 8); + context.iov_base, context.iov_len); if (rc) { cifs_dbg(VFS, "%s: Could not update with context\n", __func__); goto smb3signkey_ret; @@ -296,12 +296,102 @@ generate_smb3signingkey(struct cifs_ses *ses) goto smb3signkey_ret; } - memcpy(ses->smb3signingkey, hashptr, SMB3_SIGNKEY_SIZE); + memcpy(key, hashptr, key_size); smb3signkey_ret: return rc; } +struct derivation { + struct kvec label; + struct kvec context; +}; + +struct derivation_triplet { + struct derivation signing; + struct derivation encryption; + struct derivation decryption; +}; + +static int +generate_smb3signingkey(struct cifs_ses *ses, + const struct derivation_triplet *ptriplet) +{ + int rc; + + rc = generate_key(ses, ptriplet->signing.label, + ptriplet->signing.context, ses->smb3signingkey, + SMB3_SIGN_KEY_SIZE); + if (rc) + return rc; + + rc = generate_key(ses, ptriplet->encryption.label, + ptriplet->encryption.context, ses->smb3encryptionkey, + SMB3_SIGN_KEY_SIZE); + if (rc) + return rc; + + return generate_key(ses, ptriplet->decryption.label, + ptriplet->decryption.context, + ses->smb3decryptionkey, SMB3_SIGN_KEY_SIZE); +} + +int +generate_smb30signingkey(struct cifs_ses *ses) + +{ + struct derivation_triplet triplet; + struct derivation *d; + + d = &triplet.signing; + d->label.iov_base = "SMB2AESCMAC"; + d->label.iov_len = 12; + d->context.iov_base = "SmbSign"; + d->context.iov_len = 8; + + d = &triplet.encryption; + d->label.iov_base = "SMB2AESCCM"; + d->label.iov_len = 11; + d->context.iov_base = "ServerIn "; + 
d->context.iov_len = 10; + + d = &triplet.decryption; + d->label.iov_base = "SMB2AESCCM"; + d->label.iov_len = 11; + d->context.iov_base = "ServerOut"; + d->context.iov_len = 10; + + return generate_smb3signingkey(ses, &triplet); +} + +int +generate_smb311signingkey(struct cifs_ses *ses) + +{ + struct derivation_triplet triplet; + struct derivation *d; + + d = &triplet.signing; + d->label.iov_base = "SMB2AESCMAC"; + d->label.iov_len = 12; + d->context.iov_base = "SmbSign"; + d->context.iov_len = 8; + + d = &triplet.encryption; + d->label.iov_base = "SMB2AESCCM"; + d->label.iov_len = 11; + d->context.iov_base = "ServerIn "; + d->context.iov_len = 10; + + d = &triplet.decryption; + d->label.iov_base = "SMB2AESCCM"; + d->label.iov_len = 11; + d->context.iov_base = "ServerOut"; + d->context.iov_len = 10; + + return generate_smb3signingkey(ses, &triplet); +} + int smb3_calc_signature(struct smb_rqst *rqst, struct TCP_Server_Info *server) { diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c index 2a24c524fb9a..87abe8ed074c 100644 --- a/fs/cifs/transport.c +++ b/fs/cifs/transport.c @@ -576,14 +576,16 @@ cifs_call_async(struct TCP_Server_Info *server, struct smb_rqst *rqst, cifs_in_send_dec(server); cifs_save_when_sent(mid); - if (rc < 0) + if (rc < 0) { server->sequence_number -= 2; + cifs_delete_mid(mid); + } + mutex_unlock(&server->srv_mutex); if (rc == 0) return 0; - cifs_delete_mid(mid); add_credits_and_wake_if(server, credits, optype); return rc; } diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h index f829fe963f5b..5104d84c4f64 100644 --- a/fs/coda/coda_linux.h +++ b/fs/coda/coda_linux.h @@ -72,8 +72,7 @@ void coda_sysctl_clean(void); } while (0) -#define CODA_FREE(ptr,size) \ - do { if (size < PAGE_SIZE) kfree((ptr)); else vfree((ptr)); } while (0) +#define CODA_FREE(ptr, size) kvfree((ptr)) /* inode to cnode access functions */ diff --git a/fs/coda/dir.c b/fs/coda/dir.c index fda9f4311212..42e731b8c80a 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -427,13 +427,13 @@ static int coda_readdir(struct file *coda_file, struct dir_context *ctx) if (host_file->f_op->iterate) { struct inode *host_inode = file_inode(host_file); - mutex_lock(&host_inode->i_mutex); + inode_lock(host_inode); ret = -ENOENT; if (!IS_DEADDIR(host_inode)) { ret = host_file->f_op->iterate(host_file, ctx); file_accessed(host_file); } - mutex_unlock(&host_inode->i_mutex); + inode_unlock(host_inode); return ret; } /* Venus: we must read Venus dirents from a file */ diff --git a/fs/coda/file.c b/fs/coda/file.c index 1da3805f3ddc..f47c7483863b 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -71,12 +71,12 @@ coda_file_write_iter(struct kiocb *iocb, struct iov_iter *to) host_file = cfi->cfi_container; file_start_write(host_file); - mutex_lock(&coda_inode->i_mutex); + inode_lock(coda_inode); ret = vfs_iter_write(cfi->cfi_container, to, &iocb->ki_pos); coda_inode->i_size = file_inode(host_file)->i_size; coda_inode->i_blocks = (coda_inode->i_size + 511) >> 9; coda_inode->i_mtime = coda_inode->i_ctime = CURRENT_TIME_SEC; - mutex_unlock(&coda_inode->i_mutex); + inode_unlock(coda_inode); file_end_write(host_file); return ret; } @@ -203,7 +203,7 @@ int coda_fsync(struct file *coda_file, loff_t start, loff_t end, int datasync) err = filemap_write_and_wait_range(coda_inode->i_mapping, start, end); if (err) return err; - mutex_lock(&coda_inode->i_mutex); + inode_lock(coda_inode); cfi = CODA_FTOC(coda_file); BUG_ON(!cfi || cfi->cfi_magic != CODA_MAGIC); @@ -212,7 +212,7 @@ int coda_fsync(struct file *coda_file, loff_t 
start, loff_t end, int datasync) err = vfs_fsync(host_file, datasync); if (!err && !datasync) err = venus_fsync(coda_inode->i_sb, coda_i2f(coda_inode)); - mutex_unlock(&coda_inode->i_mutex); + inode_unlock(coda_inode); return err; } diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index cab612b2ae76..f419519ec41f 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -640,13 +640,13 @@ static void detach_groups(struct config_group *group) child = sd->s_dentry; - mutex_lock(&d_inode(child)->i_mutex); + inode_lock(d_inode(child)); configfs_detach_group(sd->s_element); d_inode(child)->i_flags |= S_DEAD; dont_mount(child); - mutex_unlock(&d_inode(child)->i_mutex); + inode_unlock(d_inode(child)); d_delete(child); dput(child); @@ -834,11 +834,11 @@ static int configfs_attach_item(struct config_item *parent_item, * the VFS may already have hit and used them. Thus, * we must lock them as rmdir() would. */ - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); configfs_remove_dir(item); d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); d_delete(dentry); } } @@ -874,7 +874,7 @@ static int configfs_attach_group(struct config_item *parent_item, * We must also lock the inode to remove it safely in case of * error, as rmdir() would. */ - mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); configfs_adjust_dir_dirent_depth_before_populate(sd); ret = populate_groups(to_config_group(item)); if (ret) { @@ -883,7 +883,7 @@ static int configfs_attach_group(struct config_item *parent_item, dont_mount(dentry); } configfs_adjust_dir_dirent_depth_after_populate(sd); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); if (ret) d_delete(dentry); } @@ -1135,7 +1135,7 @@ int configfs_depend_item(struct configfs_subsystem *subsys, * subsystem is really registered, and so we need to lock out * configfs_[un]register_subsystem(). */ - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); subsys_sd = configfs_find_subsys_dentry(root->d_fsdata, s_item); if (!subsys_sd) { @@ -1147,7 +1147,7 @@ int configfs_depend_item(struct configfs_subsystem *subsys, ret = configfs_do_depend_item(subsys_sd->s_dentry, target); out_unlock_fs: - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); /* * If we succeeded, the fs is pinned via other methods. 
If not, @@ -1230,7 +1230,7 @@ int configfs_depend_item_unlocked(struct configfs_subsystem *caller_subsys, * additional locking to prevent other subsystem from being * unregistered */ - mutex_lock(&d_inode(root->cg_item.ci_dentry)->i_mutex); + inode_lock(d_inode(root->cg_item.ci_dentry)); /* * As we are trying to depend item from other subsystem @@ -1254,7 +1254,7 @@ out_root_unlock: * We were called from subsystem other than our target so we * took some locks so now it's time to release them */ - mutex_unlock(&d_inode(root->cg_item.ci_dentry)->i_mutex); + inode_unlock(d_inode(root->cg_item.ci_dentry)); return ret; } @@ -1561,7 +1561,7 @@ int configfs_rename_dir(struct config_item * item, const char *new_name) down_write(&configfs_rename_sem); parent = item->parent->dentry; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); new_dentry = lookup_one_len(new_name, parent, strlen(new_name)); if (!IS_ERR(new_dentry)) { @@ -1577,7 +1577,7 @@ int configfs_rename_dir(struct config_item * item, const char *new_name) error = -EEXIST; dput(new_dentry); } - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); up_write(&configfs_rename_sem); return error; @@ -1590,7 +1590,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file) struct configfs_dirent * parent_sd = dentry->d_fsdata; int err; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); /* * Fake invisibility if dir belongs to a group/default groups hierarchy * being attached @@ -1603,7 +1603,7 @@ static int configfs_dir_open(struct inode *inode, struct file *file) else err = 0; } - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return err; } @@ -1613,11 +1613,11 @@ static int configfs_dir_close(struct inode *inode, struct file *file) struct dentry * dentry = file->f_path.dentry; struct configfs_dirent * cursor = file->private_data; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); spin_lock(&configfs_dirent_lock); list_del_init(&cursor->s_sibling); spin_unlock(&configfs_dirent_lock); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); release_configfs_dirent(cursor); @@ -1698,7 +1698,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry * dentry = file->f_path.dentry; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); switch (whence) { case 1: offset += file->f_pos; @@ -1706,7 +1706,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) if (offset >= 0) break; default: - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return -EINVAL; } if (offset != file->f_pos) { @@ -1732,7 +1732,7 @@ static loff_t configfs_dir_lseek(struct file *file, loff_t offset, int whence) spin_unlock(&configfs_dirent_lock); } } - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return offset; } @@ -1767,14 +1767,14 @@ int configfs_register_group(struct config_group *parent_group, parent = parent_group->cg_item.ci_dentry; - mutex_lock_nested(&d_inode(parent)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); ret = create_default_group(parent_group, group); if (!ret) { spin_lock(&configfs_dirent_lock); configfs_dir_set_ready(group->cg_item.ci_dentry->d_fsdata); spin_unlock(&configfs_dirent_lock); } - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); return ret; } EXPORT_SYMBOL(configfs_register_group); @@ -1791,7 +1791,7 @@ void 
configfs_unregister_group(struct config_group *group) struct dentry *dentry = group->cg_item.ci_dentry; struct dentry *parent = group->cg_item.ci_parent->ci_dentry; - mutex_lock_nested(&d_inode(parent)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(parent), I_MUTEX_PARENT); spin_lock(&configfs_dirent_lock); configfs_detach_prep(dentry, NULL); spin_unlock(&configfs_dirent_lock); @@ -1800,7 +1800,7 @@ void configfs_unregister_group(struct config_group *group) d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); d_delete(dentry); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); dput(dentry); @@ -1872,7 +1872,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) sd = root->d_fsdata; link_group(to_config_group(sd->s_element), group); - mutex_lock_nested(&d_inode(root)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(root), I_MUTEX_PARENT); err = -ENOMEM; dentry = d_alloc_name(root, group->cg_item.ci_name); @@ -1892,7 +1892,7 @@ int configfs_register_subsystem(struct configfs_subsystem *subsys) } } - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); if (err) { unlink_group(group); @@ -1913,9 +1913,9 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) return; } - mutex_lock_nested(&d_inode(root)->i_mutex, + inode_lock_nested(d_inode(root), I_MUTEX_PARENT); - mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); mutex_lock(&configfs_symlink_mutex); spin_lock(&configfs_dirent_lock); if (configfs_detach_prep(dentry, NULL)) { @@ -1926,11 +1926,11 @@ void configfs_unregister_subsystem(struct configfs_subsystem *subsys) configfs_detach_group(&group->cg_item); d_inode(dentry)->i_flags |= S_DEAD; dont_mount(dentry); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); d_delete(dentry); - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); dput(dentry); diff --git a/fs/configfs/file.c b/fs/configfs/file.c index 3687187c8ea5..33b7ee34eda5 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -540,10 +540,10 @@ int configfs_create_file(struct config_item * item, const struct configfs_attrib umode_t mode = (attr->ca_mode & S_IALLUGO) | S_IFREG; int error = 0; - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_NORMAL); + inode_lock_nested(d_inode(dir), I_MUTEX_NORMAL); error = configfs_make_dirent(parent_sd, NULL, (void *) attr, mode, CONFIGFS_ITEM_ATTR); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); return error; } @@ -562,10 +562,10 @@ int configfs_create_bin_file(struct config_item *item, umode_t mode = (bin_attr->cb_attr.ca_mode & S_IALLUGO) | S_IFREG; int error = 0; - mutex_lock_nested(&dir->d_inode->i_mutex, I_MUTEX_NORMAL); + inode_lock_nested(dir->d_inode, I_MUTEX_NORMAL); error = configfs_make_dirent(parent_sd, NULL, (void *) bin_attr, mode, CONFIGFS_ITEM_BIN_ATTR); - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); return error; } diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 0cc810e9dccc..cee087d8f7e0 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -255,7 +255,7 @@ void configfs_hash_and_remove(struct dentry * dir, const char * name) /* no inode means this hasn't been made visible yet */ return; - mutex_lock(&d_inode(dir)->i_mutex); + inode_lock(d_inode(dir)); list_for_each_entry(sd, &parent_sd->s_children, s_sibling) { if (!sd->s_element) continue; @@ -268,5 +268,5 @@ void configfs_hash_and_remove(struct dentry * dir, 
const char * name) break; } } - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); } diff --git a/fs/dax.c b/fs/dax.c --- a/fs/dax.c +++ b/fs/dax.c @@ -24,6 +24,7 @@ #include <linux/memcontrol.h> #include <linux/mm.h> #include <linux/mutex.h> +#include <linux/pagevec.h> #include <linux/pmem.h> #include <linux/sched.h> #include <linux/uio.h> @@ -245,13 +246,14 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, loff_t end = pos + iov_iter_count(iter); memset(&bh, 0, sizeof(bh)); + bh.b_bdev = inode->i_sb->s_bdev; if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) { struct address_space *mapping = inode->i_mapping; - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = filemap_write_and_wait_range(mapping, pos, end - 1); if (retval) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); goto out; } } @@ -263,7 +265,7 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, retval = dax_io(inode, iter, pos, end, get_block, &bh); if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if ((retval > 0) && end_io) end_io(iocb, pos, retval, bh.b_private); @@ -324,6 +326,199 @@ static int copy_user_bh(struct page *to, struct inode *inode, return 0; } +#define NO_SECTOR -1 +#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_CACHE_SHIFT)) + +static int dax_radix_entry(struct address_space *mapping, pgoff_t index, + sector_t sector, bool pmd_entry, bool dirty) +{ + struct radix_tree_root *page_tree = &mapping->page_tree; + pgoff_t pmd_index = DAX_PMD_INDEX(index); + int type, error = 0; + void *entry; + + WARN_ON_ONCE(pmd_entry && !dirty); + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + + spin_lock_irq(&mapping->tree_lock); + + entry = radix_tree_lookup(page_tree, pmd_index); + if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) { + index = pmd_index; + goto dirty; + } + + entry = radix_tree_lookup(page_tree, index); + if (entry) { + type = RADIX_DAX_TYPE(entry); + if (WARN_ON_ONCE(type != RADIX_DAX_PTE && + type != RADIX_DAX_PMD)) { + error = -EIO; + goto unlock; + } + + if (!pmd_entry || type == RADIX_DAX_PMD) + goto dirty; + + /* + * We only insert dirty PMD entries into the radix tree. This + * means we don't need to worry about removing a dirty PTE + * entry and inserting a clean PMD entry, thus reducing the + * range we would flush with a follow-up fsync/msync call. + */ + radix_tree_delete(&mapping->page_tree, index); + mapping->nrexceptional--; + } + + if (sector == NO_SECTOR) { + /* + * This can happen during correct operation if our pfn_mkwrite + * fault raced against a hole punch operation. If this + * happens the pte that was hole punched will have been + * unmapped and the radix tree entry will have been removed by + * the time we are called, but the call will still happen. We + * will return all the way up to wp_pfn_shared(), where the + * pte_same() check will fail, eventually causing page fault + * to be retried by the CPU.
+ */ + goto unlock; + } + + error = radix_tree_insert(page_tree, index, + RADIX_DAX_ENTRY(sector, pmd_entry)); + if (error) + goto unlock; + + mapping->nrexceptional++; + dirty: + if (dirty) + radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); + unlock: + spin_unlock_irq(&mapping->tree_lock); + return error; +} + +static int dax_writeback_one(struct block_device *bdev, + struct address_space *mapping, pgoff_t index, void *entry) +{ + struct radix_tree_root *page_tree = &mapping->page_tree; + int type = RADIX_DAX_TYPE(entry); + struct radix_tree_node *node; + struct blk_dax_ctl dax; + void **slot; + int ret = 0; + + spin_lock_irq(&mapping->tree_lock); + /* + * Regular page slots are stabilized by the page lock even + * without the tree itself locked. These unlocked entries + * need verification under the tree lock. + */ + if (!__radix_tree_lookup(page_tree, index, &node, &slot)) + goto unlock; + if (*slot != entry) + goto unlock; + + /* another fsync thread may have already written back this entry */ + if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) + goto unlock; + + if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) { + ret = -EIO; + goto unlock; + } + + dax.sector = RADIX_DAX_SECTOR(entry); + dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE); + spin_unlock_irq(&mapping->tree_lock); + + /* + * We cannot hold tree_lock while calling dax_map_atomic() because it + * eventually calls cond_resched(). + */ + ret = dax_map_atomic(bdev, &dax); + if (ret < 0) + return ret; + + if (WARN_ON_ONCE(ret < dax.size)) { + ret = -EIO; + goto unmap; + } + + wb_cache_pmem(dax.addr, dax.size); + + spin_lock_irq(&mapping->tree_lock); + radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); + spin_unlock_irq(&mapping->tree_lock); + unmap: + dax_unmap_atomic(bdev, &dax); + return ret; + + unlock: + spin_unlock_irq(&mapping->tree_lock); + return ret; +} + +/* + * Flush the mapping to the persistent domain within the byte range of [start, + * end]. This is required by data integrity operations to ensure file data is + * on persistent storage prior to completion of the operation. 
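In userspace terms, this range flush is what lets an ordinary msync()/fsync() provide durability for stores made through a DAX mapping. A rough sketch of that contract (illustrative only; the path is hypothetical and error handling is trimmed):

#include <fcntl.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        int fd = open("/mnt/pmem/data", O_RDWR);  /* hypothetical DAX-mounted file */
        char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                       MAP_SHARED, fd, 0);

        memcpy(p, "hello", 5);          /* store through the mapping */
        msync(p, 4096, MS_SYNC);        /* ends up walking the dirtied range above */
        munmap(p, 4096);
        close(fd);
        return 0;
}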
+ */ +int dax_writeback_mapping_range(struct address_space *mapping, loff_t start, + loff_t end) +{ + struct inode *inode = mapping->host; + struct block_device *bdev = inode->i_sb->s_bdev; + pgoff_t start_index, end_index, pmd_index; + pgoff_t indices[PAGEVEC_SIZE]; + struct pagevec pvec; + bool done = false; + int i, ret = 0; + void *entry; + + if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) + return -EIO; + + start_index = start >> PAGE_CACHE_SHIFT; + end_index = end >> PAGE_CACHE_SHIFT; + pmd_index = DAX_PMD_INDEX(start_index); + + rcu_read_lock(); + entry = radix_tree_lookup(&mapping->page_tree, pmd_index); + rcu_read_unlock(); + + /* see if the start of our range is covered by a PMD entry */ + if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) + start_index = pmd_index; + + tag_pages_for_writeback(mapping, start_index, end_index); + + pagevec_init(&pvec, 0); + while (!done) { + pvec.nr = find_get_entries_tag(mapping, start_index, + PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE, + pvec.pages, indices); + + if (pvec.nr == 0) + break; + + for (i = 0; i < pvec.nr; i++) { + if (indices[i] > end_index) { + done = true; + break; + } + + ret = dax_writeback_one(bdev, mapping, indices[i], + pvec.pages[i]); + if (ret < 0) + return ret; + } + } + wmb_pmem(); + return 0; +} +EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); + static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, struct vm_area_struct *vma, struct vm_fault *vmf) { @@ -363,6 +558,11 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, } dax_unmap_atomic(bdev, &dax); + error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false, + vmf->flags & FAULT_FLAG_WRITE); + if (error) + goto out; + error = vm_insert_mixed(vma, vaddr, dax.pfn); out: @@ -408,6 +608,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, memset(&bh, 0, sizeof(bh)); block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); + bh.b_bdev = inode->i_sb->s_bdev; bh.b_size = PAGE_SIZE; repeat: @@ -487,6 +688,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, delete_from_page_cache(page); unlock_page(page); page_cache_release(page); + page = NULL; } /* @@ -590,7 +792,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, struct block_device *bdev; pgoff_t size, pgoff; sector_t block; - int result = 0; + int error, result = 0; + bool alloc = false; /* dax pmd mappings require pfn_t_devmap() */ if (!IS_ENABLED(CONFIG_FS_DAX_PMD)) @@ -624,13 +827,21 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, } memset(&bh, 0, sizeof(bh)); + bh.b_bdev = inode->i_sb->s_bdev; block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); bh.b_size = PMD_SIZE; - if (get_block(inode, block, &bh, write) != 0) + + if (get_block(inode, block, &bh, 0) != 0) return VM_FAULT_SIGBUS; + + if (!buffer_mapped(&bh) && write) { + if (get_block(inode, block, &bh, 1) != 0) + return VM_FAULT_SIGBUS; + alloc = true; + } + bdev = bh.b_bdev; - i_mmap_lock_read(mapping); /* * If the filesystem isn't willing to tell us the length of a hole, @@ -639,19 +850,22 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, */ if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) { dax_pmd_dbg(&bh, address, "allocated block too small"); - goto fallback; + return VM_FAULT_FALLBACK; } /* * If we allocated new storage, make sure no process has any * zero pages covering this hole */ - if (buffer_new(&bh)) { - i_mmap_unlock_read(mapping); - unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 
0); - i_mmap_lock_read(mapping); + if (alloc) { + loff_t lstart = pgoff << PAGE_SHIFT; + loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */ + + truncate_pagecache_range(inode, lstart, lend); } + i_mmap_lock_read(mapping); + /* * If a truncate happened while we were allocating blocks, we may * leave blocks allocated to the file that are beyond EOF. We can't @@ -664,7 +878,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, goto out; } if ((pgoff | PG_PMD_COLOUR) >= size) { - dax_pmd_dbg(&bh, address, "pgoff unaligned"); + dax_pmd_dbg(&bh, address, + "offset + huge page size > file size"); goto fallback; } @@ -732,6 +947,31 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, } dax_unmap_atomic(bdev, &dax); + /* + * For PTE faults we insert a radix tree entry for reads, and + * leave it clean. Then on the first write we dirty the radix + * tree entry via the dax_pfn_mkwrite() path. This sequence + * allows the dax_pfn_mkwrite() call to be simpler and avoid a + * call into get_block() to translate the pgoff to a sector in + * order to be able to create a new radix tree entry. + * + * The PMD path doesn't have an equivalent to + * dax_pfn_mkwrite(), though, so for a read followed by a + * write we traverse all the way through __dax_pmd_fault() + * twice. This means we can just skip inserting a radix tree + * entry completely on the initial read and just wait until + * the write to insert a dirty entry. + */ + if (write) { + error = dax_radix_entry(mapping, pgoff, dax.sector, + true, true); + if (error) { + dax_pmd_dbg(&bh, address, + "PMD radix insertion failed"); + goto fallback; + } + } + dev_dbg(part_to_dev(bdev->bd_part), "%s: %s addr: %lx pfn: %lx sect: %llx\n", __func__, current->comm, address, @@ -790,15 +1030,20 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault); * dax_pfn_mkwrite - handle first write to DAX page * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault - * */ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { - struct super_block *sb = file_inode(vma->vm_file)->i_sb; + struct file *file = vma->vm_file; - sb_start_pagefault(sb); - file_update_time(vma->vm_file); - sb_end_pagefault(sb); + /* + * We pass NO_SECTOR to dax_radix_entry() because we expect that a + * RADIX_DAX_PTE entry already exists in the radix tree from a + * previous call to __dax_fault(). We just want to look up that PTE + * entry using vmf->pgoff and make sure the dirty tag is set. This + * saves us from having to make a call to get_block() here to look + * up the sector. 
+ */ + dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); @@ -835,6 +1080,7 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, BUG_ON((offset + length) > PAGE_CACHE_SIZE); memset(&bh, 0, sizeof(bh)); + bh.b_bdev = inode->i_sb->s_bdev; bh.b_size = PAGE_CACHE_SIZE; err = get_block(inode, index, &bh, 0); if (err < 0) diff --git a/fs/dcache.c b/fs/dcache.c index b4539e84e577..92d5140de851 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2462,7 +2462,7 @@ EXPORT_SYMBOL(d_rehash); */ void dentry_update_name_case(struct dentry *dentry, struct qstr *name) { - BUG_ON(!mutex_is_locked(&dentry->d_parent->d_inode->i_mutex)); + BUG_ON(!inode_is_locked(dentry->d_parent->d_inode)); BUG_ON(dentry->d_name.len != name->len); /* d_lookup gives this */ spin_lock(&dentry->d_lock); @@ -2738,7 +2738,7 @@ static int __d_unalias(struct inode *inode, if (!mutex_trylock(&dentry->d_sb->s_vfs_rename_mutex)) goto out_err; m1 = &dentry->d_sb->s_vfs_rename_mutex; - if (!mutex_trylock(&alias->d_parent->d_inode->i_mutex)) + if (!inode_trylock(alias->d_parent->d_inode)) goto out_err; m2 = &alias->d_parent->d_inode->i_mutex; out_unalias: diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index b7fcc0de0b2f..bece948b363d 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -265,7 +265,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) if (!parent) parent = debugfs_mount->mnt_root; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry) && d_really_is_positive(dentry)) { dput(dentry); @@ -273,7 +273,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) } if (IS_ERR(dentry)) { - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); simple_release_fs(&debugfs_mount, &debugfs_mount_count); } @@ -282,7 +282,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) static struct dentry *failed_creating(struct dentry *dentry) { - mutex_unlock(&d_inode(dentry->d_parent)->i_mutex); + inode_unlock(d_inode(dentry->d_parent)); dput(dentry); simple_release_fs(&debugfs_mount, &debugfs_mount_count); return NULL; @@ -290,7 +290,7 @@ static struct dentry *failed_creating(struct dentry *dentry) static struct dentry *end_creating(struct dentry *dentry) { - mutex_unlock(&d_inode(dentry->d_parent)->i_mutex); + inode_unlock(d_inode(dentry->d_parent)); return dentry; } @@ -560,9 +560,9 @@ void debugfs_remove(struct dentry *dentry) if (!parent || d_really_is_negative(parent)) return; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); ret = __debugfs_remove(dentry, parent); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); if (!ret) simple_release_fs(&debugfs_mount, &debugfs_mount_count); } @@ -594,7 +594,7 @@ void debugfs_remove_recursive(struct dentry *dentry) parent = dentry; down: - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); loop: /* * The parent->d_subdirs is protected by the d_lock. 
Outside that @@ -609,7 +609,7 @@ void debugfs_remove_recursive(struct dentry *dentry) /* perhaps simple_empty(child) makes more sense */ if (!list_empty(&child->d_subdirs)) { spin_unlock(&parent->d_lock); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); parent = child; goto down; } @@ -630,10 +630,10 @@ void debugfs_remove_recursive(struct dentry *dentry) } spin_unlock(&parent->d_lock); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); child = parent; parent = parent->d_parent; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); if (child != dentry) /* go up */ @@ -641,7 +641,7 @@ void debugfs_remove_recursive(struct dentry *dentry) if (!__debugfs_remove(child, parent)) simple_release_fs(&debugfs_mount, &debugfs_mount_count); - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); } EXPORT_SYMBOL_GPL(debugfs_remove_recursive); diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index c35ffdc12bba..1f107fd51328 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -255,7 +255,7 @@ static int mknod_ptmx(struct super_block *sb) if (!uid_valid(root_uid) || !gid_valid(root_gid)) return -EINVAL; - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); /* If we have already created ptmx node, return */ if (fsi->ptmx_dentry) { @@ -292,7 +292,7 @@ static int mknod_ptmx(struct super_block *sb) fsi->ptmx_dentry = dentry; rc = 0; out: - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); return rc; } @@ -615,7 +615,7 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, sprintf(s, "%d", index); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); dentry = d_alloc_name(root, s); if (dentry) { @@ -626,7 +626,7 @@ struct inode *devpts_pty_new(struct inode *ptmx_inode, dev_t device, int index, inode = ERR_PTR(-ENOMEM); } - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); return inode; } @@ -671,7 +671,7 @@ void devpts_pty_kill(struct inode *inode) BUG_ON(inode->i_rdev == MKDEV(TTYAUX_MAJOR, PTMX_MINOR)); - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); dentry = d_find_alias(inode); @@ -680,7 +680,7 @@ void devpts_pty_kill(struct inode *inode) dput(dentry); /* d_alloc_name() in devpts_pty_new() */ dput(dentry); /* d_find_alias above */ - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); } static int __init init_devpts_fs(void) diff --git a/fs/direct-io.c b/fs/direct-io.c index 602e8441bc0f..1b2f7ffc8b84 100644 --- a/fs/direct-io.c +++ b/fs/direct-io.c @@ -1157,12 +1157,12 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, iocb->ki_filp->f_mapping; /* will be released by direct_io_worker */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = filemap_write_and_wait_range(mapping, offset, end - 1); if (retval) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); kmem_cache_free(dio_cache, dio); goto out; } @@ -1173,7 +1173,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, dio->i_size = i_size_read(inode); if (iov_iter_rw(iter) == READ && offset >= dio->i_size) { if (dio->flags & DIO_LOCKING) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); kmem_cache_free(dio_cache, dio); retval = 0; goto out; @@ -1295,7 +1295,7 @@ do_blockdev_direct_IO(struct kiocb *iocb, struct inode *inode, * of protecting us from looking up uninitialized blocks. 
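Most of the churn in this diff is the mechanical conversion from mutex_lock(&inode->i_mutex) to inode_lock(inode) and friends. At this stage the wrappers are thin aliases over the same mutex, roughly as below (a sketch for reference, mirroring their introduction in include/linux/fs.h):

static inline void inode_lock(struct inode *inode)
{
        mutex_lock(&inode->i_mutex);
}

static inline void inode_unlock(struct inode *inode)
{
        mutex_unlock(&inode->i_mutex);
}

static inline int inode_trylock(struct inode *inode)
{
        return mutex_trylock(&inode->i_mutex);
}

static inline int inode_is_locked(struct inode *inode)
{
        return mutex_is_locked(&inode->i_mutex);
}

static inline void inode_lock_nested(struct inode *inode, unsigned subclass)
{
        mutex_lock_nested(&inode->i_mutex, subclass);
}

The indirection buys the ability to change the lock's implementation later without touching every filesystem again.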
*/ if (iov_iter_rw(iter) == READ && (dio->flags & DIO_LOCKING)) - mutex_unlock(&dio->inode->i_mutex); + inode_unlock(dio->inode); /* * The only time we want to leave bios in flight is when a successful diff --git a/fs/dlm/user.c b/fs/dlm/user.c index 1925d6d222b8..58c2f4a21b7f 100644 --- a/fs/dlm/user.c +++ b/fs/dlm/user.c @@ -516,7 +516,7 @@ static ssize_t device_write(struct file *file, const char __user *buf, return -EINVAL; kbuf = memdup_user_nul(buf, count); - if (!IS_ERR(kbuf)) + if (IS_ERR(kbuf)) return PTR_ERR(kbuf); if (check_version(kbuf)) { diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 040aa879d634..4e685ac1024d 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -41,13 +41,13 @@ static struct dentry *lock_parent(struct dentry *dentry) struct dentry *dir; dir = dget_parent(dentry); - mutex_lock_nested(&(d_inode(dir)->i_mutex), I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); return dir; } static void unlock_dir(struct dentry *dir) { - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); dput(dir); } @@ -397,11 +397,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, int rc = 0; lower_dir_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry->d_parent); - mutex_lock(&d_inode(lower_dir_dentry)->i_mutex); + inode_lock(d_inode(lower_dir_dentry)); lower_dentry = lookup_one_len(ecryptfs_dentry->d_name.name, lower_dir_dentry, ecryptfs_dentry->d_name.len); - mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex); + inode_unlock(d_inode(lower_dir_dentry)); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " @@ -426,11 +426,11 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, "filename; rc = [%d]\n", __func__, rc); goto out; } - mutex_lock(&d_inode(lower_dir_dentry)->i_mutex); + inode_lock(d_inode(lower_dir_dentry)); lower_dentry = lookup_one_len(encrypted_and_encoded_name, lower_dir_dentry, encrypted_and_encoded_name_size); - mutex_unlock(&d_inode(lower_dir_dentry)->i_mutex); + inode_unlock(d_inode(lower_dir_dentry)); if (IS_ERR(lower_dentry)) { rc = PTR_ERR(lower_dentry); ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " @@ -869,9 +869,9 @@ int ecryptfs_truncate(struct dentry *dentry, loff_t new_length) if (!rc && lower_ia.ia_valid & ATTR_SIZE) { struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); - mutex_lock(&d_inode(lower_dentry)->i_mutex); + inode_lock(d_inode(lower_dentry)); rc = notify_change(lower_dentry, &lower_ia, NULL); - mutex_unlock(&d_inode(lower_dentry)->i_mutex); + inode_unlock(d_inode(lower_dentry)); } return rc; } @@ -970,9 +970,9 @@ static int ecryptfs_setattr(struct dentry *dentry, struct iattr *ia) if (lower_ia.ia_valid & (ATTR_KILL_SUID | ATTR_KILL_SGID)) lower_ia.ia_valid &= ~ATTR_MODE; - mutex_lock(&d_inode(lower_dentry)->i_mutex); + inode_lock(d_inode(lower_dentry)); rc = notify_change(lower_dentry, &lower_ia, NULL); - mutex_unlock(&d_inode(lower_dentry)->i_mutex); + inode_unlock(d_inode(lower_dentry)); out: fsstack_copy_attr_all(inode, lower_inode); return rc; @@ -1048,10 +1048,10 @@ ecryptfs_getxattr_lower(struct dentry *lower_dentry, const char *name, rc = -EOPNOTSUPP; goto out; } - mutex_lock(&d_inode(lower_dentry)->i_mutex); + inode_lock(d_inode(lower_dentry)); rc = d_inode(lower_dentry)->i_op->getxattr(lower_dentry, name, value, size); - mutex_unlock(&d_inode(lower_dentry)->i_mutex); + inode_unlock(d_inode(lower_dentry)); out: return rc; } @@ -1075,9 +1075,9 @@ 
ecryptfs_listxattr(struct dentry *dentry, char *list, size_t size) rc = -EOPNOTSUPP; goto out; } - mutex_lock(&d_inode(lower_dentry)->i_mutex); + inode_lock(d_inode(lower_dentry)); rc = d_inode(lower_dentry)->i_op->listxattr(lower_dentry, list, size); - mutex_unlock(&d_inode(lower_dentry)->i_mutex); + inode_unlock(d_inode(lower_dentry)); out: return rc; } @@ -1092,9 +1092,9 @@ static int ecryptfs_removexattr(struct dentry *dentry, const char *name) rc = -EOPNOTSUPP; goto out; } - mutex_lock(&d_inode(lower_dentry)->i_mutex); + inode_lock(d_inode(lower_dentry)); rc = d_inode(lower_dentry)->i_op->removexattr(lower_dentry, name); - mutex_unlock(&d_inode(lower_dentry)->i_mutex); + inode_unlock(d_inode(lower_dentry)); out: return rc; } diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index caba848ac763..c6ced4cbf0cf 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -436,7 +436,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) rc = -ENOMEM; goto out; } - mutex_lock(&lower_inode->i_mutex); + inode_lock(lower_inode); size = lower_inode->i_op->getxattr(lower_dentry, ECRYPTFS_XATTR_NAME, xattr_virt, PAGE_CACHE_SIZE); if (size < 0) @@ -444,7 +444,7 @@ static int ecryptfs_write_inode_size_to_xattr(struct inode *ecryptfs_inode) put_unaligned_be64(i_size_read(ecryptfs_inode), xattr_virt); rc = lower_inode->i_op->setxattr(lower_dentry, ECRYPTFS_XATTR_NAME, xattr_virt, size, 0); - mutex_unlock(&lower_inode->i_mutex); + inode_unlock(lower_inode); if (rc) printk(KERN_ERR "Error whilst attempting to write inode size " "to lower file xattr; rc = [%d]\n", rc); diff --git a/fs/efivarfs/file.c b/fs/efivarfs/file.c index 90001da9abfd..c424e4813ec8 100644 --- a/fs/efivarfs/file.c +++ b/fs/efivarfs/file.c @@ -50,9 +50,9 @@ static ssize_t efivarfs_file_write(struct file *file, d_delete(file->f_path.dentry); dput(file->f_path.dentry); } else { - mutex_lock(&inode->i_mutex); + inode_lock(inode); i_size_write(inode, datasize + sizeof(attributes)); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } bytes = count; diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 86a2121828c3..b8a564f29107 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -160,10 +160,10 @@ static int efivarfs_callback(efi_char16_t *name16, efi_guid_t vendor, efivar_entry_size(entry, &size); efivar_entry_add(entry, &efivarfs_list); - mutex_lock(&inode->i_mutex); + inode_lock(inode); inode->i_private = entry; i_size_write(inode, size + sizeof(entry->var.Attributes)); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); d_add(dentry, inode); return 0; diff --git a/fs/exec.c b/fs/exec.c index 828ec5f07de0..dcd4ac7d3f1e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -1307,13 +1307,13 @@ static void bprm_fill_uid(struct linux_binprm *bprm) return; /* Be careful if suid/sgid is set */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* reload atomically mode/uid/gid now that lock held */ mode = inode->i_mode; uid = inode->i_uid; gid = inode->i_gid; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* We ignore suid/sgid if there are no mappings for them in the ns */ if (!kuid_has_mapping(bprm->cred->user_ns, uid) || diff --git a/fs/exofs/file.c b/fs/exofs/file.c index 906de66e8e7e..28645f0640f7 100644 --- a/fs/exofs/file.c +++ b/fs/exofs/file.c @@ -52,9 +52,9 @@ static int exofs_file_fsync(struct file *filp, loff_t start, loff_t end, if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = sync_inode_metadata(filp->f_mapping->host, 1); - 
mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 714cd37a6ba3..c46f1a190b8d 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -124,10 +124,10 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, int err; parent = ERR_PTR(-EACCES); - mutex_lock(&dentry->d_inode->i_mutex); + inode_lock(dentry->d_inode); if (mnt->mnt_sb->s_export_op->get_parent) parent = mnt->mnt_sb->s_export_op->get_parent(dentry); - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); if (IS_ERR(parent)) { dprintk("%s: get_parent of %ld failed, err %d\n", @@ -143,9 +143,9 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, if (err) goto out_err; dprintk("%s: found name: %s\n", __func__, nbuf); - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); tmp = lookup_one_len(nbuf, parent, strlen(nbuf)); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); if (IS_ERR(tmp)) { dprintk("%s: lookup failed: %d\n", __func__, PTR_ERR(tmp)); goto out_err; @@ -503,10 +503,10 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, */ err = exportfs_get_name(mnt, target_dir, nbuf, result); if (!err) { - mutex_lock(&target_dir->d_inode->i_mutex); + inode_lock(target_dir->d_inode); nresult = lookup_one_len(nbuf, target_dir, strlen(nbuf)); - mutex_unlock(&target_dir->d_inode->i_mutex); + inode_unlock(target_dir->d_inode); if (!IS_ERR(nresult)) { if (nresult->d_inode) { dput(result); diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 11a42c5a09ae..2c88d683cd91 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -102,8 +102,8 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, { struct inode *inode = file_inode(vma->vm_file); struct ext2_inode_info *ei = EXT2_I(inode); - int ret = VM_FAULT_NOPAGE; loff_t size; + int ret; sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); @@ -113,6 +113,8 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) ret = VM_FAULT_SIGBUS; + else + ret = dax_pfn_mkwrite(vma, vmf); up_read(&ei->dax_sem); sb_end_pagefault(inode->i_sb); diff --git a/fs/ext2/ioctl.c b/fs/ext2/ioctl.c index 5d46c09863f0..b386af2e45f4 100644 --- a/fs/ext2/ioctl.c +++ b/fs/ext2/ioctl.c @@ -51,10 +51,10 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) flags = ext2_mask_flags(inode->i_mode, flags); - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Is it quota file? 
Do not allow user to mess with it */ if (IS_NOQUOTA(inode)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ret = -EPERM; goto setflags_out; } @@ -68,7 +68,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) */ if ((flags ^ oldflags) & (EXT2_APPEND_FL | EXT2_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ret = -EPERM; goto setflags_out; } @@ -80,7 +80,7 @@ long ext2_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) ext2_set_inode_flags(inode); inode->i_ctime = CURRENT_TIME_SEC; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mark_inode_dirty(inode); setflags_out: @@ -102,10 +102,10 @@ setflags_out: goto setversion_out; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); inode->i_ctime = CURRENT_TIME_SEC; inode->i_generation = generation; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mark_inode_dirty(inode); setversion_out: diff --git a/fs/ext4/crypto.c b/fs/ext4/crypto.c index 1a0835073663..c8021208a7eb 100644 --- a/fs/ext4/crypto.c +++ b/fs/ext4/crypto.c @@ -384,14 +384,12 @@ int ext4_decrypt(struct page *page) EXT4_DECRYPT, page->index, page, page); } -int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex) +int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, ext4_lblk_t len) { struct ext4_crypto_ctx *ctx; struct page *ciphertext_page = NULL; struct bio *bio; - ext4_lblk_t lblk = le32_to_cpu(ex->ee_block); - ext4_fsblk_t pblk = ext4_ext_pblock(ex); - unsigned int len = ext4_ext_get_actual_len(ex); int ret, err = 0; #if 0 diff --git a/fs/ext4/crypto_key.c b/fs/ext4/crypto_key.c index c5882b36e558..9a16d1e75a49 100644 --- a/fs/ext4/crypto_key.c +++ b/fs/ext4/crypto_key.c @@ -213,9 +213,11 @@ retry: res = -ENOKEY; goto out; } + down_read(&keyring_key->sem); ukp = user_key_payload(keyring_key); if (ukp->datalen != sizeof(struct ext4_encryption_key)) { res = -EINVAL; + up_read(&keyring_key->sem); goto out; } master_key = (struct ext4_encryption_key *)ukp->data; @@ -226,10 +228,12 @@ retry: "ext4: key size incorrect: %d\n", master_key->size); res = -ENOKEY; + up_read(&keyring_key->sem); goto out; } res = ext4_derive_key_aes(ctx.nonce, master_key->raw, raw_key); + up_read(&keyring_key->sem); if (res) goto out; got_key: diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index cc7ca4e87144..0662b285dc8a 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -378,14 +378,22 @@ struct flex_groups { #define EXT4_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define EXT4_RESERVED_FL 0x80000000 /* reserved for ext4 lib */ -#define EXT4_FL_USER_VISIBLE 0x004BDFFF /* User visible flags */ -#define EXT4_FL_USER_MODIFIABLE 0x004380FF /* User modifiable flags */ +#define EXT4_FL_USER_VISIBLE 0x304BDFFF /* User visible flags */ +#define EXT4_FL_USER_MODIFIABLE 0x204380FF /* User modifiable flags */ + +#define EXT4_FL_XFLAG_VISIBLE (EXT4_SYNC_FL | \ + EXT4_IMMUTABLE_FL | \ + EXT4_APPEND_FL | \ + EXT4_NODUMP_FL | \ + EXT4_NOATIME_FL | \ + EXT4_PROJINHERIT_FL) /* Flags that should be inherited by new inodes from their parent. */ #define EXT4_FL_INHERITED (EXT4_SECRM_FL | EXT4_UNRM_FL | EXT4_COMPR_FL |\ EXT4_SYNC_FL | EXT4_NODUMP_FL | EXT4_NOATIME_FL |\ EXT4_NOCOMPR_FL | EXT4_JOURNAL_DATA_FL |\ - EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL) + EXT4_NOTAIL_FL | EXT4_DIRSYNC_FL |\ + EXT4_PROJINHERIT_FL) /* Flags that are appropriate for regular files (all but dir-specific ones). 
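The fsxattr get/set pair defined above mirrors the long-standing XFS interface, so a userspace caller reads a file's xflags and project ID as below (a sketch; on kernels predating the uapi merge the struct and ioctl numbers have to be supplied locally, exactly as the #ifndef above anticipates, and argument checking is trimmed):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/fs.h>   /* FS_IOC_FSGETXATTR, struct fsxattr on newer kernels */

int main(int argc, char **argv)
{
        struct fsxattr fsx;
        int fd = open(argv[1], O_RDONLY);

        if (fd < 0 || ioctl(fd, FS_IOC_FSGETXATTR, &fsx) != 0) {
                perror("FS_IOC_FSGETXATTR");
                return 1;
        }
        printf("xflags 0x%x projid %u\n", fsx.fsx_xflags, fsx.fsx_projid);
        return 0;
}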
*/ #define EXT4_REG_FLMASK (~(EXT4_DIRSYNC_FL | EXT4_TOPDIR_FL)) @@ -555,10 +563,12 @@ enum { #define EXT4_GET_BLOCKS_NO_NORMALIZE 0x0040 /* Request will not result in inode size update (user for fallocate) */ #define EXT4_GET_BLOCKS_KEEP_SIZE 0x0080 - /* Do not take i_data_sem locking in ext4_map_blocks */ -#define EXT4_GET_BLOCKS_NO_LOCK 0x0100 /* Convert written extents to unwritten */ -#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0200 +#define EXT4_GET_BLOCKS_CONVERT_UNWRITTEN 0x0100 + /* Write zeros to newly created written extents */ +#define EXT4_GET_BLOCKS_ZERO 0x0200 +#define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\ + EXT4_GET_BLOCKS_ZERO) /* * The bit position of these flags must not overlap with any of the @@ -616,6 +626,46 @@ enum { #define EXT4_IOC_GET_ENCRYPTION_PWSALT _IOW('f', 20, __u8[16]) #define EXT4_IOC_GET_ENCRYPTION_POLICY _IOW('f', 21, struct ext4_encryption_policy) +#ifndef FS_IOC_FSGETXATTR +/* Until the uapi changes get merged for project quota... */ + +#define FS_IOC_FSGETXATTR _IOR('X', 31, struct fsxattr) +#define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr) + +/* + * Structure for FS_IOC_FSGETXATTR and FS_IOC_FSSETXATTR. + */ +struct fsxattr { + __u32 fsx_xflags; /* xflags field value (get/set) */ + __u32 fsx_extsize; /* extsize field value (get/set)*/ + __u32 fsx_nextents; /* nextents field value (get) */ + __u32 fsx_projid; /* project identifier (get/set) */ + unsigned char fsx_pad[12]; +}; + +/* + * Flags for the fsx_xflags field + */ +#define FS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */ +#define FS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */ +#define FS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */ +#define FS_XFLAG_APPEND 0x00000010 /* all writes append */ +#define FS_XFLAG_SYNC 0x00000020 /* all writes synchronous */ +#define FS_XFLAG_NOATIME 0x00000040 /* do not update access time */ +#define FS_XFLAG_NODUMP 0x00000080 /* do not include in backups */ +#define FS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */ +#define FS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ +#define FS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */ +#define FS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ +#define FS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ +#define FS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ +#define FS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ +#define FS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ +#endif /* !defined(FS_IOC_FSGETXATTR) */ + +#define EXT4_IOC_FSGETXATTR FS_IOC_FSGETXATTR +#define EXT4_IOC_FSSETXATTR FS_IOC_FSSETXATTR + #if defined(__KERNEL__) && defined(CONFIG_COMPAT) /* * ioctl commands in 32 bit emulation @@ -910,6 +960,15 @@ struct ext4_inode_info { * by other means, so we have i_data_sem. */ struct rw_semaphore i_data_sem; + /* + * i_mmap_sem is for serializing page faults with truncate / punch hole + * operations. We have to make sure that new page cannot be faulted in + * a section of the inode that is being punched. We cannot easily use + * i_data_sem for this since we need protection for the whole punch + * operation and i_data_sem ranks below transaction start so we have + * to occasionally drop it. 
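The comment describes a classic shared/exclusive split: page faults take i_mmap_sem for read, truncate and punch-hole take it for write before tearing down pages. Schematically (a sketch of the pattern only, not the exact ext4 code; the function names are illustrative):

/* fault side: many faults may proceed concurrently */
static int fault_path_sketch(struct vm_area_struct *vma, struct vm_fault *vmf)
{
        struct inode *inode = file_inode(vma->vm_file);
        int ret;

        down_read(&EXT4_I(inode)->i_mmap_sem);
        ret = filemap_fault(vma, vmf);
        up_read(&EXT4_I(inode)->i_mmap_sem);
        return ret;
}

/* punch-hole side: exclude all new faults while pages go away */
static void punch_hole_sketch(struct inode *inode, loff_t start, loff_t len)
{
        down_write(&EXT4_I(inode)->i_mmap_sem);
        truncate_pagecache_range(inode, start, start + len - 1);
        /* ... drop the blocks under i_data_sem, then ... */
        up_write(&EXT4_I(inode)->i_mmap_sem);
}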
+ */ + struct rw_semaphore i_mmap_sem; struct inode vfs_inode; struct jbd2_inode *jinode; @@ -993,6 +1052,7 @@ struct ext4_inode_info { /* Encryption params */ struct ext4_crypt_info *i_crypt_info; #endif + kprojid_t i_projid; }; /* @@ -1248,7 +1308,7 @@ struct ext4_super_block { #endif /* Number of quota types we support */ -#define EXT4_MAXQUOTAS 2 +#define EXT4_MAXQUOTAS 3 /* * fourth extended-fs super-block data in memory @@ -1754,7 +1814,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT) EXT4_FEATURE_RO_COMPAT_HUGE_FILE |\ EXT4_FEATURE_RO_COMPAT_BIGALLOC |\ EXT4_FEATURE_RO_COMPAT_METADATA_CSUM|\ - EXT4_FEATURE_RO_COMPAT_QUOTA) + EXT4_FEATURE_RO_COMPAT_QUOTA |\ + EXT4_FEATURE_RO_COMPAT_PROJECT) #define EXTN_FEATURE_FUNCS(ver) \ static inline bool ext4_has_unknown_ext##ver##_compat_features(struct super_block *sb) \ @@ -1796,6 +1857,11 @@ static inline bool ext4_has_incompat_features(struct super_block *sb) #define EXT4_DEF_RESUID 0 #define EXT4_DEF_RESGID 0 +/* + * Default project ID + */ +#define EXT4_DEF_PROJID 0 + #define EXT4_DEF_INODE_READAHEAD_BLKS 32 /* @@ -2234,7 +2300,8 @@ void ext4_restore_control_page(struct page *data_page); struct page *ext4_encrypt(struct inode *inode, struct page *plaintext_page); int ext4_decrypt(struct page *page); -int ext4_encrypted_zeroout(struct inode *inode, struct ext4_extent *ex); +int ext4_encrypted_zeroout(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, ext4_lblk_t len); #ifdef CONFIG_EXT4_FS_ENCRYPTION int ext4_init_crypto(void); @@ -2440,8 +2507,8 @@ struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_get_block_write(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); -int ext4_get_block_dax(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); +int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create); int ext4_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create); int ext4_da_get_block_prep(struct inode *inode, sector_t iblock, @@ -2484,9 +2551,13 @@ extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); extern int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf); +extern int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf); extern qsize_t *ext4_get_reserved_space(struct inode *inode); +extern int ext4_get_projid(struct inode *inode, kprojid_t *projid); extern void ext4_da_update_reserve_space(struct inode *inode, int used, int quota_claim); +extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, + ext4_fsblk_t pblk, ext4_lblk_t len); /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, @@ -2825,7 +2896,7 @@ do { \ static inline void ext4_update_i_disksize(struct inode *inode, loff_t newsize) { WARN_ON_ONCE(S_ISREG(inode->i_mode) && - !mutex_is_locked(&inode->i_mutex)); + !inode_is_locked(inode)); down_write(&EXT4_I(inode)->i_data_sem); if (newsize > EXT4_I(inode)->i_disksize) EXT4_I(inode)->i_disksize = newsize; @@ -2848,6 +2919,9 @@ static inline int ext4_update_inode_size(struct inode *inode, loff_t newsize) return changed; } +int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, + loff_t len); + struct ext4_group_info { unsigned long bb_state; 
struct rb_root bb_free_root; @@ -2986,8 +3060,7 @@ extern int ext4_da_write_inline_data_end(struct inode *inode, loff_t pos, struct page *page); extern int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, - struct dentry *dentry, - struct inode *inode); + struct inode *dir, struct inode *inode); extern int ext4_try_create_inline_dir(handle_t *handle, struct inode *parent, struct inode *inode); diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index 551353b1b17a..0ffabaf90aa5 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -3119,19 +3119,11 @@ static int ext4_ext_zeroout(struct inode *inode, struct ext4_extent *ex) { ext4_fsblk_t ee_pblock; unsigned int ee_len; - int ret; ee_len = ext4_ext_get_actual_len(ex); ee_pblock = ext4_ext_pblock(ex); - - if (ext4_encrypted_inode(inode)) - return ext4_encrypted_zeroout(inode, ex); - - ret = sb_issue_zeroout(inode->i_sb, ee_pblock, ee_len, GFP_NOFS); - if (ret > 0) - ret = 0; - - return ret; + return ext4_issue_zeroout(inode, le32_to_cpu(ex->ee_block), ee_pblock, + ee_len); } /* @@ -4052,6 +4044,14 @@ ext4_ext_handle_unwritten_extents(handle_t *handle, struct inode *inode, } /* IO end_io complete, convert the filled extent to written */ if (flags & EXT4_GET_BLOCKS_CONVERT) { + if (flags & EXT4_GET_BLOCKS_ZERO) { + if (allocated > map->m_len) + allocated = map->m_len; + err = ext4_issue_zeroout(inode, map->m_lblk, newblock, + allocated); + if (err < 0) + goto out2; + } ret = ext4_convert_unwritten_extents_endio(handle, inode, map, ppath); if (ret >= 0) { @@ -4685,10 +4685,6 @@ static int ext4_alloc_file_blocks(struct file *file, ext4_lblk_t offset, if (len <= EXT_UNWRITTEN_MAX_LEN) flags |= EXT4_GET_BLOCKS_NO_NORMALIZE; - /* Wait all existing dio workers, newcomers will block on i_mutex */ - ext4_inode_block_unlocked_dio(inode); - inode_dio_wait(inode); - /* * credits to insert 1 extent into extent tree */ @@ -4752,8 +4748,6 @@ retry: goto retry; } - ext4_inode_resume_unlocked_dio(inode); - return ret > 0 ? ret2 : ret; } @@ -4770,7 +4764,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, int partial_begin, partial_end; loff_t start, end; ext4_lblk_t lblk; - struct address_space *mapping = inode->i_mapping; unsigned int blkbits = inode->i_blkbits; trace_ext4_zero_range(inode, offset, len, mode); @@ -4786,17 +4779,6 @@ static long ext4_zero_range(struct file *file, loff_t offset, } /* - * Write out all dirty pages to avoid race conditions - * Then release them. - */ - if (mapping->nrpages && mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - ret = filemap_write_and_wait_range(mapping, offset, - offset + len - 1); - if (ret) - return ret; - } - - /* * Round up offset. 
This is not fallocate, we need to zero out * blocks, so convert interior block aligned part of the range to * unwritten and possibly manually zero out unaligned parts of the @@ -4817,7 +4799,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, else max_blocks -= lblk; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * Indirect files do not support unwritten extents @@ -4839,6 +4821,10 @@ static long ext4_zero_range(struct file *file, loff_t offset, if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + /* Preallocate the range including the unaligned edges */ if (partial_begin || partial_end) { ret = ext4_alloc_file_blocks(file, @@ -4847,7 +4833,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, round_down(offset, 1 << blkbits)) >> blkbits, new_size, flags, mode); if (ret) - goto out_mutex; + goto out_dio; } @@ -4856,16 +4842,23 @@ static long ext4_zero_range(struct file *file, loff_t offset, flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE); - /* Now release the pages and zero block aligned part of pages*/ + /* + * Prevent page faults from reinstantiating pages we have + * released from page cache. + */ + down_write(&EXT4_I(inode)->i_mmap_sem); + ret = ext4_update_disksize_before_punch(inode, offset, len); + if (ret) { + up_write(&EXT4_I(inode)->i_mmap_sem); + goto out_dio; + } + /* Now release the pages and zero block aligned part of pages */ truncate_pagecache_range(inode, start, end - 1); inode->i_mtime = inode->i_ctime = ext4_current_time(inode); - /* Wait all existing dio workers, newcomers will block on i_mutex */ - ext4_inode_block_unlocked_dio(inode); - inode_dio_wait(inode); - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags, mode); + up_write(&EXT4_I(inode)->i_mmap_sem); if (ret) goto out_dio; } @@ -4909,7 +4902,7 @@ static long ext4_zero_range(struct file *file, loff_t offset, out_dio: ext4_inode_resume_unlocked_dio(inode); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -4980,7 +4973,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) if (mode & FALLOC_FL_KEEP_SIZE) flags |= EXT4_GET_BLOCKS_KEEP_SIZE; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * We only support preallocation for extent-based files only @@ -4998,8 +4991,13 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) goto out; } + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags, mode); + ext4_inode_resume_unlocked_dio(inode); if (ret) goto out; @@ -5008,7 +5006,7 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) EXT4_I(inode)->i_sync_tid); } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); return ret; } @@ -5494,21 +5492,7 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) return ret; } - /* - * Need to round down offset to be aligned with page size boundary - * for page size > block size.
- */ - ioffset = round_down(offset, PAGE_SIZE); - - /* Write out all dirty pages */ - ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, - LLONG_MAX); - if (ret) - return ret; - - /* Take mutex lock */ - mutex_lock(&inode->i_mutex); - + inode_lock(inode); /* * There is no need to overlap collapse range with EOF, in which case * it is effectively a truncate operation @@ -5524,17 +5508,43 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) goto out_mutex; } - truncate_pagecache(inode, ioffset); - /* Wait for existing dio to complete */ ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); + /* + * Prevent page faults from reinstantiating pages we have released from + * page cache. + */ + down_write(&EXT4_I(inode)->i_mmap_sem); + /* + * Need to round down offset to be aligned with page size boundary + * for page size > block size. + */ + ioffset = round_down(offset, PAGE_SIZE); + /* + * Write tail of the last page before removed range since it will get + * removed from the page cache below. + */ + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, offset); + if (ret) + goto out_mmap; + /* + * Write data that will be shifted to preserve them when discarding + * page cache below. We are also protected from pages becoming dirty + * by i_mmap_sem. + */ + ret = filemap_write_and_wait_range(inode->i_mapping, offset + len, + LLONG_MAX); + if (ret) + goto out_mmap; + truncate_pagecache(inode, ioffset); + credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - goto out_dio; + goto out_mmap; } down_write(&EXT4_I(inode)->i_data_sem); @@ -5573,10 +5583,11 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len) out_stop: ext4_journal_stop(handle); -out_dio: +out_mmap: + up_write(&EXT4_I(inode)->i_mmap_sem); ext4_inode_resume_unlocked_dio(inode); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -5627,21 +5638,7 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) return ret; } - /* - * Need to round down to align start offset to page size boundary - * for page size > block size. - */ - ioffset = round_down(offset, PAGE_SIZE); - - /* Write out all dirty pages */ - ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, - LLONG_MAX); - if (ret) - return ret; - - /* Take mutex lock */ - mutex_lock(&inode->i_mutex); - + inode_lock(inode); /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { ret = -EOPNOTSUPP; @@ -5660,17 +5657,32 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) goto out_mutex; } - truncate_pagecache(inode, ioffset); - /* Wait for existing dio to complete */ ext4_inode_block_unlocked_dio(inode); inode_dio_wait(inode); + /* + * Prevent page faults from reinstantiating pages we have released from + * page cache. + */ + down_write(&EXT4_I(inode)->i_mmap_sem); + /* + * Need to round down to align start offset to page size boundary + * for page size > block size. 
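Collapse range, insert range, zero range, and (further down) punch hole all converge on one ordering discipline after this series. A condensed sketch of that shared skeleton, with error handling and per-operation differences (such as exactly which ranges get written back) elided:

	inode_lock(inode);			/* serialize against writers */
	ext4_inode_block_unlocked_dio(inode);	/* stop new unlocked DIO */
	inode_dio_wait(inode);			/* drain in-flight DIO */
	down_write(&EXT4_I(inode)->i_mmap_sem);	/* block page faults */
	/* write back the affected ranges, then drop the page cache */
	truncate_pagecache_range(inode, start, end - 1);
	/* start a handle and shift/zero extents under i_data_sem */
	up_write(&EXT4_I(inode)->i_mmap_sem);
	ext4_inode_resume_unlocked_dio(inode);
	inode_unlock(inode);

Holding i_mmap_sem for writing across the page-cache truncation is what closes the race these hunks describe: a fault can no longer reinstantiate a page between the truncation and the extent manipulation.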
+ */ + ioffset = round_down(offset, PAGE_SIZE); + /* Write out all dirty pages */ + ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, + LLONG_MAX); + if (ret) + goto out_mmap; + truncate_pagecache(inode, ioffset); + credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - goto out_dio; + goto out_mmap; } /* Expand file to avoid data loss if there is error while shifting */ @@ -5741,10 +5753,11 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len) out_stop: ext4_journal_stop(handle); -out_dio: +out_mmap: + up_write(&EXT4_I(inode)->i_mmap_sem); ext4_inode_resume_unlocked_dio(inode); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -5779,8 +5792,8 @@ ext4_swap_extents(handle_t *handle, struct inode *inode1, BUG_ON(!rwsem_is_locked(&EXT4_I(inode1)->i_data_sem)); BUG_ON(!rwsem_is_locked(&EXT4_I(inode2)->i_data_sem)); - BUG_ON(!mutex_is_locked(&inode1->i_mutex)); - BUG_ON(!mutex_is_locked(&inode2->i_mutex)); + BUG_ON(!inode_is_locked(inode1)); + BUG_ON(!inode_is_locked(inode2)); *erp = ext4_es_remove_extent(inode1, lblk1, count); if (unlikely(*erp)) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 113837e7ba98..1126436dada1 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -113,7 +113,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ext4_unwritten_wait(inode); } - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret <= 0) goto out; @@ -169,7 +169,7 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } ret = __generic_file_write_iter(iocb, from); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret > 0) { ssize_t err; @@ -186,50 +186,42 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) return ret; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (aio_mutex) mutex_unlock(aio_mutex); return ret; } #ifdef CONFIG_FS_DAX -static void ext4_end_io_unwritten(struct buffer_head *bh, int uptodate) -{ - struct inode *inode = bh->b_assoc_map->host; - /* XXX: breaks on 32-bit > 16TB. Is that even supported? 
*/ - loff_t offset = (loff_t)(uintptr_t)bh->b_private << inode->i_blkbits; - int err; - if (!uptodate) - return; - WARN_ON(!buffer_unwritten(bh)); - err = ext4_convert_unwritten_extents(NULL, inode, offset, bh->b_size); -} - static int ext4_dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf) { int result; handle_t *handle = NULL; - struct super_block *sb = file_inode(vma->vm_file)->i_sb; + struct inode *inode = file_inode(vma->vm_file); + struct super_block *sb = inode->i_sb; bool write = vmf->flags & FAULT_FLAG_WRITE; if (write) { sb_start_pagefault(sb); file_update_time(vma->vm_file); + down_read(&EXT4_I(inode)->i_mmap_sem); handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, EXT4_DATA_TRANS_BLOCKS(sb)); - } + } else + down_read(&EXT4_I(inode)->i_mmap_sem); if (IS_ERR(handle)) result = VM_FAULT_SIGBUS; else - result = __dax_fault(vma, vmf, ext4_get_block_dax, - ext4_end_io_unwritten); + result = __dax_fault(vma, vmf, ext4_dax_mmap_get_block, NULL); if (write) { if (!IS_ERR(handle)) ext4_journal_stop(handle); + up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(sb); - } + } else + up_read(&EXT4_I(inode)->i_mmap_sem); return result; } @@ -246,44 +238,88 @@ static int ext4_dax_pmd_fault(struct vm_area_struct *vma, unsigned long addr, if (write) { sb_start_pagefault(sb); file_update_time(vma->vm_file); + down_read(&EXT4_I(inode)->i_mmap_sem); handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, ext4_chunk_trans_blocks(inode, PMD_SIZE / PAGE_SIZE)); - } + } else + down_read(&EXT4_I(inode)->i_mmap_sem); if (IS_ERR(handle)) result = VM_FAULT_SIGBUS; else result = __dax_pmd_fault(vma, addr, pmd, flags, - ext4_get_block_dax, ext4_end_io_unwritten); + ext4_dax_mmap_get_block, NULL); if (write) { if (!IS_ERR(handle)) ext4_journal_stop(handle); + up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(sb); - } + } else + up_read(&EXT4_I(inode)->i_mmap_sem); return result; } static int ext4_dax_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { - return dax_mkwrite(vma, vmf, ext4_get_block_dax, - ext4_end_io_unwritten); + int err; + struct inode *inode = file_inode(vma->vm_file); + + sb_start_pagefault(inode->i_sb); + file_update_time(vma->vm_file); + down_read(&EXT4_I(inode)->i_mmap_sem); + err = __dax_mkwrite(vma, vmf, ext4_dax_mmap_get_block, NULL); + up_read(&EXT4_I(inode)->i_mmap_sem); + sb_end_pagefault(inode->i_sb); + + return err; +} + +/* + * Handle write fault for VM_MIXEDMAP mappings. Similarly to ext4_dax_mkwrite() + * handler we check for races against truncate. Note that since we cycle through + * i_mmap_sem, we are sure that also any hole punching that began before we + * were called is finished by now and so if it included part of the file we + * are working on, our pte will get unmapped and the check for pte_same() in + * wp_pfn_shared() fails. Thus fault gets retried and things work out as + * desired.
+ */ +static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma, + struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + struct super_block *sb = inode->i_sb; + loff_t size; + int ret; + + sb_start_pagefault(sb); + file_update_time(vma->vm_file); + down_read(&EXT4_I(inode)->i_mmap_sem); + size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; + if (vmf->pgoff >= size) + ret = VM_FAULT_SIGBUS; + else + ret = dax_pfn_mkwrite(vma, vmf); + up_read(&EXT4_I(inode)->i_mmap_sem); + sb_end_pagefault(sb); + + return ret; } static const struct vm_operations_struct ext4_dax_vm_ops = { .fault = ext4_dax_fault, .pmd_fault = ext4_dax_pmd_fault, .page_mkwrite = ext4_dax_mkwrite, - .pfn_mkwrite = dax_pfn_mkwrite, + .pfn_mkwrite = ext4_dax_pfn_mkwrite, }; #else #define ext4_dax_vm_ops ext4_file_vm_ops #endif static const struct vm_operations_struct ext4_file_vm_ops = { - .fault = filemap_fault, + .fault = ext4_filemap_fault, .map_pages = filemap_map_pages, .page_mkwrite = ext4_page_mkwrite, }; @@ -527,11 +563,11 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) int blkbits; int ret = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); if (offset >= isize) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -ENXIO; } @@ -579,7 +615,7 @@ static loff_t ext4_seek_data(struct file *file, loff_t offset, loff_t maxsize) dataoff = (loff_t)last << blkbits; } while (last <= end); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (dataoff > isize) return -ENXIO; @@ -600,11 +636,11 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) int blkbits; int ret = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); if (offset >= isize) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -ENXIO; } @@ -655,7 +691,7 @@ static loff_t ext4_seek_hole(struct file *file, loff_t offset, loff_t maxsize) break; } while (last <= end); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (holeoff > isize) holeoff = isize; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 1b8024d26f65..3fcfd50a2e8a 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -799,6 +799,13 @@ struct inode *__ext4_new_inode(handle_t *handle, struct inode *dir, inode->i_gid = dir->i_gid; } else inode_init_owner(inode, dir, mode); + + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) && + ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) + ei->i_projid = EXT4_I(dir)->i_projid; + else + ei->i_projid = make_kprojid(&init_user_ns, EXT4_DEF_PROJID); + err = dquot_initialize(inode); if (err) goto out; diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index d884989cc83d..dfe3b9bafc0d 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -995,12 +995,11 @@ void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, */ static int ext4_add_dirent_to_inline(handle_t *handle, struct ext4_filename *fname, - struct dentry *dentry, + struct inode *dir, struct inode *inode, struct ext4_iloc *iloc, void *inline_start, int inline_size) { - struct inode *dir = d_inode(dentry->d_parent); int err; struct ext4_dir_entry_2 *de; @@ -1245,12 +1244,11 @@ out: * the new created block. 
*/ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, - struct dentry *dentry, struct inode *inode) + struct inode *dir, struct inode *inode) { int ret, inline_size; void *inline_start; struct ext4_iloc iloc; - struct inode *dir = d_inode(dentry->d_parent); ret = ext4_get_inode_loc(dir, &iloc); if (ret) @@ -1264,7 +1262,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, EXT4_INLINE_DOTDOT_SIZE; inline_size = EXT4_MIN_INLINE_DATA_SIZE - EXT4_INLINE_DOTDOT_SIZE; - ret = ext4_add_dirent_to_inline(handle, fname, dentry, inode, &iloc, + ret = ext4_add_dirent_to_inline(handle, fname, dir, inode, &iloc, inline_start, inline_size); if (ret != -ENOSPC) goto out; @@ -1285,7 +1283,7 @@ int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, if (inline_size) { inline_start = ext4_get_inline_xattr_pos(dir, &iloc); - ret = ext4_add_dirent_to_inline(handle, fname, dentry, + ret = ext4_add_dirent_to_inline(handle, fname, dir, inode, &iloc, inline_start, inline_size); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index b3bd912df6bf..83bc8bfb3bea 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -383,6 +383,21 @@ static int __check_block_validity(struct inode *inode, const char *func, return 0; } +int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, + ext4_lblk_t len) +{ + int ret; + + if (ext4_encrypted_inode(inode)) + return ext4_encrypted_zeroout(inode, lblk, pblk, len); + + ret = sb_issue_zeroout(inode->i_sb, pblk, len, GFP_NOFS); + if (ret > 0) + ret = 0; + + return ret; +} + #define check_block_validity(inode, map) \ __check_block_validity((inode), __func__, __LINE__, (map)) @@ -403,8 +418,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, * out taking i_data_sem. So at the time the unwritten extent * could be converted. */ - if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) - down_read(&EXT4_I(inode)->i_data_sem); + down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, flags & EXT4_GET_BLOCKS_KEEP_SIZE); @@ -412,8 +426,7 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, retval = ext4_ind_map_blocks(handle, inode, map, flags & EXT4_GET_BLOCKS_KEEP_SIZE); } - if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) - up_read((&EXT4_I(inode)->i_data_sem)); + up_read((&EXT4_I(inode)->i_data_sem)); /* * We don't check m_len because extent will be collapsed in status @@ -509,8 +522,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * Try to see if we can get the block without requesting a new * file system block. */ - if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) - down_read(&EXT4_I(inode)->i_data_sem); + down_read(&EXT4_I(inode)->i_data_sem); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { retval = ext4_ext_map_blocks(handle, inode, map, flags & EXT4_GET_BLOCKS_KEEP_SIZE); @@ -541,8 +553,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, if (ret < 0) retval = ret; } - if (!(flags & EXT4_GET_BLOCKS_NO_LOCK)) - up_read((&EXT4_I(inode)->i_data_sem)); + up_read((&EXT4_I(inode)->i_data_sem)); found: if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { @@ -626,13 +637,29 @@ found: } /* + * We have to zeroout blocks before inserting them into extent + * status tree. Otherwise someone could look them up there and + * use them before they are really zeroed.
+ */ + if (flags & EXT4_GET_BLOCKS_ZERO && + map->m_flags & EXT4_MAP_MAPPED && + map->m_flags & EXT4_MAP_NEW) { + ret = ext4_issue_zeroout(inode, map->m_lblk, + map->m_pblk, map->m_len); + if (ret) { + retval = ret; + goto out_sem; + } + } + + /* * If the extent has been zeroed out, we don't need to update * extent status tree. */ if ((flags & EXT4_GET_BLOCKS_PRE_IO) && ext4_es_lookup_extent(inode, map->m_lblk, &es)) { if (ext4_es_is_written(&es)) - goto has_zeroout; + goto out_sem; } status = map->m_flags & EXT4_MAP_UNWRITTEN ? EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; @@ -643,11 +670,13 @@ found: status |= EXTENT_STATUS_DELAYED; ret = ext4_es_insert_extent(inode, map->m_lblk, map->m_len, map->m_pblk, status); - if (ret < 0) + if (ret < 0) { retval = ret; + goto out_sem; + } } -has_zeroout: +out_sem: up_write((&EXT4_I(inode)->i_data_sem)); if (retval > 0 && map->m_flags & EXT4_MAP_MAPPED) { ret = check_block_validity(inode, map); @@ -674,7 +703,7 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, map.m_lblk = iblock; map.m_len = bh->b_size >> inode->i_blkbits; - if (flags && !(flags & EXT4_GET_BLOCKS_NO_LOCK) && !handle) { + if (flags && !handle) { /* Direct IO write... */ if (map.m_len > DIO_MAX_BLOCKS) map.m_len = DIO_MAX_BLOCKS; @@ -694,16 +723,6 @@ static int _ext4_get_block(struct inode *inode, sector_t iblock, map_bh(bh, inode->i_sb, map.m_pblk); bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | map.m_flags; - if (IS_DAX(inode) && buffer_unwritten(bh)) { - /* - * dgc: I suspect unwritten conversion on ext4+DAX is - * fundamentally broken here when there are concurrent - * read/write in progress on this inode. - */ - WARN_ON_ONCE(io_end); - bh->b_assoc_map = inode->i_mapping; - bh->b_private = (void *)(unsigned long)iblock; - } if (io_end && io_end->flag & EXT4_IO_END_UNWRITTEN) set_buffer_defer_completion(bh); bh->b_size = inode->i_sb->s_blocksize * map.m_len; @@ -879,9 +898,6 @@ int do_journal_get_write_access(handle_t *handle, return ret; } -static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create); - #ifdef CONFIG_EXT4_FS_ENCRYPTION static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, get_block_t *get_block) @@ -3054,25 +3070,96 @@ int ext4_get_block_write(struct inode *inode, sector_t iblock, EXT4_GET_BLOCKS_IO_CREATE_EXT); } -static int ext4_get_block_write_nolock(struct inode *inode, sector_t iblock, +static int ext4_get_block_overwrite(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) { - ext4_debug("ext4_get_block_write_nolock: inode %lu, create flag %d\n", + int ret; + + ext4_debug("ext4_get_block_overwrite: inode %lu, create flag %d\n", inode->i_ino, create); - return _ext4_get_block(inode, iblock, bh_result, - EXT4_GET_BLOCKS_NO_LOCK); + ret = _ext4_get_block(inode, iblock, bh_result, 0); + /* + * Blocks should have been preallocated! ext4_file_write_iter() checks + * that. 
+ */ + WARN_ON_ONCE(!buffer_mapped(bh_result)); + + return ret; } -int ext4_get_block_dax(struct inode *inode, sector_t iblock, - struct buffer_head *bh_result, int create) +#ifdef CONFIG_FS_DAX +int ext4_dax_mmap_get_block(struct inode *inode, sector_t iblock, + struct buffer_head *bh_result, int create) { - int flags = EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_UNWRIT_EXT; - if (create) - flags |= EXT4_GET_BLOCKS_CREATE; - ext4_debug("ext4_get_block_dax: inode %lu, create flag %d\n", + int ret, err; + int credits; + struct ext4_map_blocks map; + handle_t *handle = NULL; + int flags = 0; + + ext4_debug("ext4_dax_mmap_get_block: inode %lu, create flag %d\n", inode->i_ino, create); - return _ext4_get_block(inode, iblock, bh_result, flags); + map.m_lblk = iblock; + map.m_len = bh_result->b_size >> inode->i_blkbits; + credits = ext4_chunk_trans_blocks(inode, map.m_len); + if (create) { + flags |= EXT4_GET_BLOCKS_PRE_IO | EXT4_GET_BLOCKS_CREATE_ZERO; + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + return ret; + } + } + + ret = ext4_map_blocks(handle, inode, &map, flags); + if (create) { + err = ext4_journal_stop(handle); + if (ret >= 0 && err < 0) + ret = err; + } + if (ret <= 0) + goto out; + if (map.m_flags & EXT4_MAP_UNWRITTEN) { + int err2; + + /* + * We are protected by i_mmap_sem so we know block cannot go + * away from under us even though we dropped i_data_sem. + * Convert extent to written and write zeros there. + * + * Note: We may get here even when create == 0. + */ + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + goto out; + } + + err = ext4_map_blocks(handle, inode, &map, + EXT4_GET_BLOCKS_CONVERT | EXT4_GET_BLOCKS_CREATE_ZERO); + if (err < 0) + ret = err; + err2 = ext4_journal_stop(handle); + if (err2 < 0 && ret > 0) + ret = err2; + } +out: + WARN_ON_ONCE(ret == 0 && create); + if (ret > 0) { + map_bh(bh_result, inode->i_sb, map.m_pblk); + bh_result->b_state = (bh_result->b_state & ~EXT4_MAP_FLAGS) | + map.m_flags; + /* + * At least for now we have to clear BH_New so that DAX code + * doesn't attempt to zero blocks again in a racy way. + */ + bh_result->b_state &= ~(1 << BH_New); + bh_result->b_size = map.m_len << inode->i_blkbits; + ret = 0; + } + return ret; } +#endif static void ext4_end_io_dio(struct kiocb *iocb, loff_t offset, ssize_t size, void *private) @@ -3143,10 +3230,8 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, /* If we do an overwrite dio, i_mutex locking can be released */ overwrite = *((int *)iocb->private); - if (overwrite) { - down_read(&EXT4_I(inode)->i_data_sem); - mutex_unlock(&inode->i_mutex); - } + if (overwrite) + inode_unlock(inode); /* * We could direct write to holes and fallocate.
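ext4_dax_mmap_get_block() above is written against the generic get_block_t contract that __dax_fault() and __dax_pmd_fault() consume; for reference, the typedef from include/linux/fs.h:

typedef int (*get_block_t)(struct inode *inode, sector_t iblock,
			   struct buffer_head *bh_result, int create);

Because the helper now hands back only written, pre-zeroed blocks, the DAX callers can pass NULL where the old ext4_end_io_unwritten completion callback used to go.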
@@ -3189,7 +3274,7 @@ static ssize_t ext4_ext_direct_IO(struct kiocb *iocb, struct iov_iter *iter, } if (overwrite) { - get_block_func = ext4_get_block_write_nolock; + get_block_func = ext4_get_block_overwrite; } else { get_block_func = ext4_get_block_write; dio_flags = DIO_LOCKING; @@ -3245,10 +3330,8 @@ retake_lock: if (iov_iter_rw(iter) == WRITE) inode_dio_end(inode); /* take i_mutex locking again if we do an overwrite dio */ - if (overwrite) { - up_read(&EXT4_I(inode)->i_data_sem); - mutex_lock(&inode->i_mutex); - } + if (overwrite) + inode_lock(inode); return ret; } @@ -3559,6 +3642,35 @@ int ext4_can_truncate(struct inode *inode) } /* + * We have to make sure i_disksize gets properly updated before we truncate + * page cache due to hole punching or zero range. Otherwise i_disksize update + * can get lost as it may have been postponed to submission of writeback but + * that will never happen after we truncate page cache. + */ +int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, + loff_t len) +{ + handle_t *handle; + loff_t size = i_size_read(inode); + + WARN_ON(!inode_is_locked(inode)); + if (offset > size || offset + len < size) + return 0; + + if (EXT4_I(inode)->i_disksize >= size) + return 0; + + handle = ext4_journal_start(inode, EXT4_HT_MISC, 1); + if (IS_ERR(handle)) + return PTR_ERR(handle); + ext4_update_i_disksize(inode, size); + ext4_mark_inode_dirty(handle, inode); + ext4_journal_stop(handle); + + return 0; +} + +/* * ext4_punch_hole: punches a hole in a file by releasing the blocks * associated with the given offset and length * @@ -3595,7 +3707,7 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) return ret; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* No need to punch hole beyond i_size */ if (offset >= inode->i_size) @@ -3623,17 +3735,26 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) } + /* Wait all existing dio workers, newcomers will block on i_mutex */ + ext4_inode_block_unlocked_dio(inode); + inode_dio_wait(inode); + + /* + * Prevent page faults from reinstantiating pages we have released from + * page cache.
+ */ + down_write(&EXT4_I(inode)->i_mmap_sem); first_block_offset = round_up(offset, sb->s_blocksize); last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; /* Now release the pages and zero block aligned part of pages*/ - if (last_block_offset > first_block_offset) + if (last_block_offset > first_block_offset) { + ret = ext4_update_disksize_before_punch(inode, offset, length); + if (ret) + goto out_dio; truncate_pagecache_range(inode, first_block_offset, last_block_offset); - - /* Wait all existing dio workers, newcomers will block on i_mutex */ - ext4_inode_block_unlocked_dio(inode); - inode_dio_wait(inode); + } if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_writepage_trans_blocks(inode); @@ -3680,19 +3801,15 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length) if (IS_SYNC(inode)) ext4_handle_sync(handle); - /* Now release the pages again to reduce race window */ - if (last_block_offset > first_block_offset) - truncate_pagecache_range(inode, first_block_offset, - last_block_offset); - inode->i_mtime = inode->i_ctime = ext4_current_time(inode); ext4_mark_inode_dirty(handle, inode); out_stop: ext4_journal_stop(handle); out_dio: + up_write(&EXT4_I(inode)->i_mmap_sem); ext4_inode_resume_unlocked_dio(inode); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -3762,7 +3879,7 @@ void ext4_truncate(struct inode *inode) * have i_mutex locked because it's not necessary. */ if (!(inode->i_state & (I_NEW|I_FREEING))) - WARN_ON(!mutex_is_locked(&inode->i_mutex)); + WARN_ON(!inode_is_locked(inode)); trace_ext4_truncate_enter(inode); if (!ext4_can_truncate(inode)) @@ -4076,6 +4193,14 @@ static inline void ext4_iget_extra_inode(struct inode *inode, EXT4_I(inode)->i_inline_off = 0; } +int ext4_get_projid(struct inode *inode, kprojid_t *projid) +{ + if (!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, EXT4_FEATURE_RO_COMPAT_PROJECT)) + return -EOPNOTSUPP; + *projid = EXT4_I(inode)->i_projid; + return 0; +} + struct inode *ext4_iget(struct super_block *sb, unsigned long ino) { struct ext4_iloc iloc; @@ -4087,6 +4212,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) int block; uid_t i_uid; gid_t i_gid; + projid_t i_projid; inode = iget_locked(sb, ino); if (!inode) @@ -4136,12 +4262,20 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino) inode->i_mode = le16_to_cpu(raw_inode->i_mode); i_uid = (uid_t)le16_to_cpu(raw_inode->i_uid_low); i_gid = (gid_t)le16_to_cpu(raw_inode->i_gid_low); + if (EXT4_HAS_RO_COMPAT_FEATURE(sb, EXT4_FEATURE_RO_COMPAT_PROJECT) && + EXT4_INODE_SIZE(sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) + i_projid = (projid_t)le32_to_cpu(raw_inode->i_projid); + else + i_projid = EXT4_DEF_PROJID; + if (!(test_opt(inode->i_sb, NO_UID32))) { i_uid |= le16_to_cpu(raw_inode->i_uid_high) << 16; i_gid |= le16_to_cpu(raw_inode->i_gid_high) << 16; } i_uid_write(inode, i_uid); i_gid_write(inode, i_gid); + ei->i_projid = make_kprojid(&init_user_ns, i_projid); set_nlink(inode, le16_to_cpu(raw_inode->i_links_count)); ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ @@ -4440,6 +4574,7 @@ static int ext4_do_update_inode(handle_t *handle, int need_datasync = 0, set_large_file = 0; uid_t i_uid; gid_t i_gid; + projid_t i_projid; spin_lock(&ei->i_raw_lock); @@ -4452,6 +4587,7 @@ static int ext4_do_update_inode(handle_t *handle, raw_inode->i_mode = cpu_to_le16(inode->i_mode); i_uid = i_uid_read(inode); i_gid = i_gid_read(inode); + i_projid = 
from_kprojid(&init_user_ns, ei->i_projid); if (!(test_opt(inode->i_sb, NO_UID32))) { raw_inode->i_uid_low = cpu_to_le16(low_16_bits(i_uid)); raw_inode->i_gid_low = cpu_to_le16(low_16_bits(i_gid)); @@ -4529,6 +4665,15 @@ static int ext4_do_update_inode(handle_t *handle, cpu_to_le16(ei->i_extra_isize); } } + + BUG_ON(!EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_PROJECT) && + i_projid != EXT4_DEF_PROJID); + + if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE && + EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) + raw_inode->i_projid = cpu_to_le32(i_projid); + ext4_inode_csum_set(inode, raw_inode, ei); spin_unlock(&ei->i_raw_lock); if (inode->i_sb->s_flags & MS_LAZYTIME) @@ -4824,6 +4969,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) } else ext4_wait_for_tail_page_commit(inode); } + down_write(&EXT4_I(inode)->i_mmap_sem); /* * Truncate pagecache after we've waited for commit * in data=journal mode to make pages freeable. @@ -4831,6 +4977,7 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr) truncate_pagecache(inode, inode->i_size); if (shrink) ext4_truncate(inode); + up_write(&EXT4_I(inode)->i_mmap_sem); } if (!rc) { @@ -5279,6 +5426,8 @@ int ext4_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); + + down_read(&EXT4_I(inode)->i_mmap_sem); /* Delalloc case is easy... */ if (test_opt(inode->i_sb, DELALLOC) && !ext4_should_journal_data(inode) && @@ -5348,6 +5497,19 @@ retry_alloc: out_ret: ret = block_page_mkwrite_return(ret); out: + up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(inode->i_sb); return ret; } + +int ext4_filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf) +{ + struct inode *inode = file_inode(vma->vm_file); + int err; + + down_read(&EXT4_I(inode)->i_mmap_sem); + err = filemap_fault(vma, vmf); + up_read(&EXT4_I(inode)->i_mmap_sem); + + return err; +} diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 5e872fd40e5e..0f6c36922c24 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -14,6 +14,7 @@ #include <linux/mount.h> #include <linux/file.h> #include <linux/random.h> +#include <linux/quotaops.h> #include <asm/uaccess.h> #include "ext4_jbd2.h" #include "ext4.h" @@ -202,6 +203,238 @@ static int uuid_is_zero(__u8 u[16]) return 1; } +static int ext4_ioctl_setflags(struct inode *inode, + unsigned int flags) +{ + struct ext4_inode_info *ei = EXT4_I(inode); + handle_t *handle = NULL; + int err = EPERM, migrate = 0; + struct ext4_iloc iloc; + unsigned int oldflags, mask, i; + unsigned int jflag; + + /* Is it quota file? Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) + goto flags_out; + + oldflags = ei->i_flags; + + /* The JOURNAL_DATA flag is modifiable only by root */ + jflag = flags & EXT4_JOURNAL_DATA_FL; + + /* + * The IMMUTABLE and APPEND_ONLY flags can only be changed by + * the relevant capability. + * + * This test looks nicer. Thanks to Pauline Middelink + */ + if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) { + if (!capable(CAP_LINUX_IMMUTABLE)) + goto flags_out; + } + + /* + * The JOURNAL_DATA flag can only be changed by + * the relevant capability. 
+ */ + if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { + if (!capable(CAP_SYS_RESOURCE)) + goto flags_out; + } + if ((flags ^ oldflags) & EXT4_EXTENTS_FL) + migrate = 1; + + if (flags & EXT4_EOFBLOCKS_FL) { + /* we don't support adding EOFBLOCKS flag */ + if (!(oldflags & EXT4_EOFBLOCKS_FL)) { + err = -EOPNOTSUPP; + goto flags_out; + } + } else if (oldflags & EXT4_EOFBLOCKS_FL) + ext4_truncate(inode); + + handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto flags_out; + } + if (IS_SYNC(inode)) + ext4_handle_sync(handle); + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto flags_err; + + for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { + if (!(mask & EXT4_FL_USER_MODIFIABLE)) + continue; + if (mask & flags) + ext4_set_inode_flag(inode, i); + else + ext4_clear_inode_flag(inode, i); + } + + ext4_set_inode_flags(inode); + inode->i_ctime = ext4_current_time(inode); + + err = ext4_mark_iloc_dirty(handle, inode, &iloc); +flags_err: + ext4_journal_stop(handle); + if (err) + goto flags_out; + + if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) + err = ext4_change_inode_journal_flag(inode, jflag); + if (err) + goto flags_out; + if (migrate) { + if (flags & EXT4_EXTENTS_FL) + err = ext4_ext_migrate(inode); + else + err = ext4_ind_migrate(inode); + } + +flags_out: + return err; +} + +#ifdef CONFIG_QUOTA +static int ext4_ioctl_setproject(struct file *filp, __u32 projid) +{ + struct inode *inode = file_inode(filp); + struct super_block *sb = inode->i_sb; + struct ext4_inode_info *ei = EXT4_I(inode); + int err, rc; + handle_t *handle; + kprojid_t kprojid; + struct ext4_iloc iloc; + struct ext4_inode *raw_inode; + + if (!EXT4_HAS_RO_COMPAT_FEATURE(sb, + EXT4_FEATURE_RO_COMPAT_PROJECT)) { + if (projid != EXT4_DEF_PROJID) + return -EOPNOTSUPP; + else + return 0; + } + + if (EXT4_INODE_SIZE(sb) <= EXT4_GOOD_OLD_INODE_SIZE) + return -EOPNOTSUPP; + + kprojid = make_kprojid(&init_user_ns, (projid_t)projid); + + if (projid_eq(kprojid, EXT4_I(inode)->i_projid)) + return 0; + + err = mnt_want_write_file(filp); + if (err) + return err; + + err = -EPERM; + inode_lock(inode); + /* Is it quota file? 
Do not allow user to mess with it */ + if (IS_NOQUOTA(inode)) + goto out_unlock; + + err = ext4_get_inode_loc(inode, &iloc); + if (err) + goto out_unlock; + + raw_inode = ext4_raw_inode(&iloc); + if (!EXT4_FITS_IN_INODE(raw_inode, ei, i_projid)) { + err = -EOVERFLOW; + brelse(iloc.bh); + goto out_unlock; + } + brelse(iloc.bh); + + dquot_initialize(inode); + + handle = ext4_journal_start(inode, EXT4_HT_QUOTA, + EXT4_QUOTA_INIT_BLOCKS(sb) + + EXT4_QUOTA_DEL_BLOCKS(sb) + 3); + if (IS_ERR(handle)) { + err = PTR_ERR(handle); + goto out_unlock; + } + + err = ext4_reserve_inode_write(handle, inode, &iloc); + if (err) + goto out_stop; + + if (sb_has_quota_limits_enabled(sb, PRJQUOTA)) { + struct dquot *transfer_to[MAXQUOTAS] = { }; + + transfer_to[PRJQUOTA] = dqget(sb, make_kqid_projid(kprojid)); + if (transfer_to[PRJQUOTA]) { + err = __dquot_transfer(inode, transfer_to); + dqput(transfer_to[PRJQUOTA]); + if (err) + goto out_dirty; + } + } + EXT4_I(inode)->i_projid = kprojid; + inode->i_ctime = ext4_current_time(inode); +out_dirty: + rc = ext4_mark_iloc_dirty(handle, inode, &iloc); + if (!err) + err = rc; +out_stop: + ext4_journal_stop(handle); +out_unlock: + inode_unlock(inode); + mnt_drop_write_file(filp); + return err; +} +#else +static int ext4_ioctl_setproject(struct file *filp, __u32 projid) +{ + if (projid != EXT4_DEF_PROJID) + return -EOPNOTSUPP; + return 0; +} +#endif + +/* Transfer internal flags to xflags */ +static inline __u32 ext4_iflags_to_xflags(unsigned long iflags) +{ + __u32 xflags = 0; + + if (iflags & EXT4_SYNC_FL) + xflags |= FS_XFLAG_SYNC; + if (iflags & EXT4_IMMUTABLE_FL) + xflags |= FS_XFLAG_IMMUTABLE; + if (iflags & EXT4_APPEND_FL) + xflags |= FS_XFLAG_APPEND; + if (iflags & EXT4_NODUMP_FL) + xflags |= FS_XFLAG_NODUMP; + if (iflags & EXT4_NOATIME_FL) + xflags |= FS_XFLAG_NOATIME; + if (iflags & EXT4_PROJINHERIT_FL) + xflags |= FS_XFLAG_PROJINHERIT; + return xflags; +} + +/* Transfer xflags flags to internal */ +static inline unsigned long ext4_xflags_to_iflags(__u32 xflags) +{ + unsigned long iflags = 0; + + if (xflags & FS_XFLAG_SYNC) + iflags |= EXT4_SYNC_FL; + if (xflags & FS_XFLAG_IMMUTABLE) + iflags |= EXT4_IMMUTABLE_FL; + if (xflags & FS_XFLAG_APPEND) + iflags |= EXT4_APPEND_FL; + if (xflags & FS_XFLAG_NODUMP) + iflags |= EXT4_NODUMP_FL; + if (xflags & FS_XFLAG_NOATIME) + iflags |= EXT4_NOATIME_FL; + if (xflags & FS_XFLAG_PROJINHERIT) + iflags |= EXT4_PROJINHERIT_FL; + + return iflags; +} + long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -217,11 +450,7 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) flags = ei->i_flags & EXT4_FL_USER_VISIBLE; return put_user(flags, (int __user *) arg); case EXT4_IOC_SETFLAGS: { - handle_t *handle = NULL; - int err, migrate = 0; - struct ext4_iloc iloc; - unsigned int oldflags, mask, i; - unsigned int jflag; + int err; if (!inode_owner_or_capable(inode)) return -EACCES; @@ -235,90 +464,9 @@ long ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) flags = ext4_mask_flags(inode->i_mode, flags); - err = -EPERM; - mutex_lock(&inode->i_mutex); - /* Is it quota file? Do not allow user to mess with it */ - if (IS_NOQUOTA(inode)) - goto flags_out; - - oldflags = ei->i_flags; - - /* The JOURNAL_DATA flag is modifiable only by root */ - jflag = flags & EXT4_JOURNAL_DATA_FL; - - /* - * The IMMUTABLE and APPEND_ONLY flags can only be changed by - * the relevant capability. - * - * This test looks nicer. 
Thanks to Pauline Middelink - */ - if ((flags ^ oldflags) & (EXT4_APPEND_FL | EXT4_IMMUTABLE_FL)) { - if (!capable(CAP_LINUX_IMMUTABLE)) - goto flags_out; - } - - /* - * The JOURNAL_DATA flag can only be changed by - * the relevant capability. - */ - if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) { - if (!capable(CAP_SYS_RESOURCE)) - goto flags_out; - } - if ((flags ^ oldflags) & EXT4_EXTENTS_FL) - migrate = 1; - - if (flags & EXT4_EOFBLOCKS_FL) { - /* we don't support adding EOFBLOCKS flag */ - if (!(oldflags & EXT4_EOFBLOCKS_FL)) { - err = -EOPNOTSUPP; - goto flags_out; - } - } else if (oldflags & EXT4_EOFBLOCKS_FL) - ext4_truncate(inode); - - handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); - if (IS_ERR(handle)) { - err = PTR_ERR(handle); - goto flags_out; - } - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - err = ext4_reserve_inode_write(handle, inode, &iloc); - if (err) - goto flags_err; - - for (i = 0, mask = 1; i < 32; i++, mask <<= 1) { - if (!(mask & EXT4_FL_USER_MODIFIABLE)) - continue; - if (mask & flags) - ext4_set_inode_flag(inode, i); - else - ext4_clear_inode_flag(inode, i); - } - - ext4_set_inode_flags(inode); - inode->i_ctime = ext4_current_time(inode); - - err = ext4_mark_iloc_dirty(handle, inode, &iloc); -flags_err: - ext4_journal_stop(handle); - if (err) - goto flags_out; - - if ((jflag ^ oldflags) & (EXT4_JOURNAL_DATA_FL)) - err = ext4_change_inode_journal_flag(inode, jflag); - if (err) - goto flags_out; - if (migrate) { - if (flags & EXT4_EXTENTS_FL) - err = ext4_ext_migrate(inode); - else - err = ext4_ind_migrate(inode); - } - -flags_out: - mutex_unlock(&inode->i_mutex); + inode_lock(inode); + err = ext4_ioctl_setflags(inode, flags); + inode_unlock(inode); mnt_drop_write_file(filp); return err; } @@ -349,7 +497,7 @@ flags_out: goto setversion_out; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { err = PTR_ERR(handle); @@ -364,7 +512,7 @@ flags_out: ext4_journal_stop(handle); unlock_out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); setversion_out: mnt_drop_write_file(filp); return err; @@ -510,9 +658,9 @@ group_add_out: * ext4_ext_swap_inode_data before we switch the * inode format to prevent read. 
*/ - mutex_lock(&(inode->i_mutex)); + inode_lock((inode)); err = ext4_ext_migrate(inode); - mutex_unlock(&(inode->i_mutex)); + inode_unlock((inode)); mnt_drop_write_file(filp); return err; } @@ -689,6 +837,60 @@ encryption_policy_out: return -EOPNOTSUPP; #endif } + case EXT4_IOC_FSGETXATTR: + { + struct fsxattr fa; + + memset(&fa, 0, sizeof(struct fsxattr)); + ext4_get_inode_flags(ei); + fa.fsx_xflags = ext4_iflags_to_xflags(ei->i_flags & EXT4_FL_USER_VISIBLE); + + if (EXT4_HAS_RO_COMPAT_FEATURE(inode->i_sb, + EXT4_FEATURE_RO_COMPAT_PROJECT)) { + fa.fsx_projid = (__u32)from_kprojid(&init_user_ns, + EXT4_I(inode)->i_projid); + } + + if (copy_to_user((struct fsxattr __user *)arg, + &fa, sizeof(fa))) + return -EFAULT; + return 0; + } + case EXT4_IOC_FSSETXATTR: + { + struct fsxattr fa; + int err; + + if (copy_from_user(&fa, (struct fsxattr __user *)arg, + sizeof(fa))) + return -EFAULT; + + /* Make sure caller has proper permission */ + if (!inode_owner_or_capable(inode)) + return -EACCES; + + err = mnt_want_write_file(filp); + if (err) + return err; + + flags = ext4_xflags_to_iflags(fa.fsx_xflags); + flags = ext4_mask_flags(inode->i_mode, flags); + + inode_lock(inode); + flags = (ei->i_flags & ~EXT4_FL_XFLAG_VISIBLE) | + (flags & EXT4_FL_XFLAG_VISIBLE); + err = ext4_ioctl_setflags(inode, flags); + inode_unlock(inode); + mnt_drop_write_file(filp); + if (err) + return err; + + err = ext4_ioctl_setproject(filp, fa.fsx_projid); + if (err) + return err; + + return 0; + } default: return -ENOTTY; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index f27e0c2598c5..06574dd77614 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -273,7 +273,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir); static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, - struct dentry *dentry, struct inode *inode); + struct inode *dir, struct inode *inode); /* checksumming functions */ void initialize_dirent_tail(struct ext4_dir_entry_tail *t, @@ -1928,10 +1928,9 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, * directory, and adds the dentry to the indexed directory. 
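As a usage note for the EXT4_IOC_FSSETXATTR handler above: the expected userspace pattern is read-modify-write, so unrelated xflags and the extsize fields survive the round trip. A sketch, reusing the struct and GET definition from the earlier fsxattr example, with the SET number and flag value taken from the hunks above (the project ID 42 is purely illustrative):

#define FS_IOC_FSSETXATTR _IOW('X', 32, struct fsxattr)
#define FS_XFLAG_PROJINHERIT 0x00000200	/* create with parents projid */

	struct fsxattr fa;

	if (ioctl(fd, FS_IOC_FSGETXATTR, &fa) == 0) {
		fa.fsx_projid = 42;			/* illustrative project ID */
		fa.fsx_xflags |= FS_XFLAG_PROJINHERIT;	/* new children inherit it */
		if (ioctl(fd, FS_IOC_FSSETXATTR, &fa) != 0)
			perror("FS_IOC_FSSETXATTR");
	}

Note that the kernel side applies the flag change and the project ID change as two separate steps (ext4_ioctl_setflags() followed by ext4_ioctl_setproject()), so a failure in the second step can leave the first already applied.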
*/ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, - struct dentry *dentry, + struct inode *dir, struct inode *inode, struct buffer_head *bh) { - struct inode *dir = d_inode(dentry->d_parent); struct buffer_head *bh2; struct dx_root *root; struct dx_frame frames[2], *frame; @@ -2086,8 +2085,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, return retval; if (ext4_has_inline_data(dir)) { - retval = ext4_try_add_inline_entry(handle, &fname, - dentry, inode); + retval = ext4_try_add_inline_entry(handle, &fname, dir, inode); if (retval < 0) goto out; if (retval == 1) { @@ -2097,7 +2095,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, } if (is_dx(dir)) { - retval = ext4_dx_add_entry(handle, &fname, dentry, inode); + retval = ext4_dx_add_entry(handle, &fname, dir, inode); if (!retval || (retval != ERR_BAD_DX_DIR)) goto out; ext4_clear_inode_flag(dir, EXT4_INODE_INDEX); @@ -2119,7 +2117,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, if (blocks == 1 && !dx_fallback && ext4_has_feature_dir_index(sb)) { - retval = make_indexed_dir(handle, &fname, dentry, + retval = make_indexed_dir(handle, &fname, dir, inode, bh); bh = NULL; /* make_indexed_dir releases bh */ goto out; @@ -2154,12 +2152,11 @@ out: * Returns 0 for success, or a negative error value */ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, - struct dentry *dentry, struct inode *inode) + struct inode *dir, struct inode *inode) { struct dx_frame frames[2], *frame; struct dx_entry *entries, *at; struct buffer_head *bh; - struct inode *dir = d_inode(dentry->d_parent); struct super_block *sb = dir->i_sb; struct ext4_dir_entry_2 *de; int err; @@ -2756,7 +2753,7 @@ int ext4_orphan_add(handle_t *handle, struct inode *inode) return 0; WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && - !mutex_is_locked(&inode->i_mutex)); + !inode_is_locked(inode)); /* * Exit early if inode already is on orphan list. This is a big speedup * since we don't have to contend on the global s_orphan_lock. @@ -2838,7 +2835,7 @@ int ext4_orphan_del(handle_t *handle, struct inode *inode) return 0; WARN_ON_ONCE(!(inode->i_state & (I_NEW | I_FREEING)) && - !mutex_is_locked(&inode->i_mutex)); + !inode_is_locked(inode)); /* Do this quick check before taking global s_orphan_lock. 
*/ if (list_empty(&ei->i_orphan)) return 0; @@ -3212,6 +3209,12 @@ static int ext4_link(struct dentry *old_dentry, if (ext4_encrypted_inode(dir) && !ext4_is_child_context_consistent_with_parent(dir, inode)) return -EPERM; + + if ((ext4_test_inode_flag(dir, EXT4_INODE_PROJINHERIT)) && + (!projid_eq(EXT4_I(dir)->i_projid, + EXT4_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + err = dquot_initialize(dir); if (err) return err; @@ -3492,6 +3495,11 @@ static int ext4_rename(struct inode *old_dir, struct dentry *old_dentry, int credits; u8 old_file_type; + if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT)) && + (!projid_eq(EXT4_I(new_dir)->i_projid, + EXT4_I(old_dentry->d_inode)->i_projid))) + return -EXDEV; + retval = dquot_initialize(old.dir); if (retval) return retval; @@ -3701,6 +3709,14 @@ static int ext4_cross_rename(struct inode *old_dir, struct dentry *old_dentry, new.inode))) return -EPERM; + if ((ext4_test_inode_flag(new_dir, EXT4_INODE_PROJINHERIT) && + !projid_eq(EXT4_I(new_dir)->i_projid, + EXT4_I(old_dentry->d_inode)->i_projid)) || + (ext4_test_inode_flag(old_dir, EXT4_INODE_PROJINHERIT) && + !projid_eq(EXT4_I(old_dir)->i_projid, + EXT4_I(new_dentry->d_inode)->i_projid))) + return -EXDEV; + retval = dquot_initialize(old.dir); if (retval) return retval; diff --git a/fs/ext4/super.c b/fs/ext4/super.c index f1b56ff01208..3ed01ec011d7 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -80,6 +80,36 @@ static void ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); +/* + * Lock ordering + * + * Note the difference between i_mmap_sem (EXT4_I(inode)->i_mmap_sem) and + * i_mmap_rwsem (inode->i_mmap_rwsem)! + * + * page fault path: + * mmap_sem -> sb_start_pagefault -> i_mmap_sem (r) -> transaction start -> + * page lock -> i_data_sem (rw) + * + * buffered write path: + * sb_start_write -> i_mutex -> mmap_sem + * sb_start_write -> i_mutex -> transaction start -> page lock -> + * i_data_sem (rw) + * + * truncate: + * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (w) -> i_mmap_sem (w) -> + * i_mmap_rwsem (w) -> page lock + * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (w) -> i_mmap_sem (w) -> + * transaction start -> i_data_sem (rw) + * + * direct IO: + * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (r) -> mmap_sem + * sb_start_write -> i_mutex -> EXT4_STATE_DIOREAD_LOCK (r) -> + * transaction start -> i_data_sem (rw) + * + * writepages: + * transaction start -> page lock(s) -> i_data_sem (rw) + */ + #if !defined(CONFIG_EXT2_FS) && !defined(CONFIG_EXT2_FS_MODULE) && defined(CONFIG_EXT4_USE_FOR_EXT2) static struct file_system_type ext2_fs_type = { .owner = THIS_MODULE, @@ -958,6 +988,7 @@ static void init_once(void *foo) INIT_LIST_HEAD(&ei->i_orphan); init_rwsem(&ei->xattr_sem); init_rwsem(&ei->i_data_sem); + init_rwsem(&ei->i_mmap_sem); inode_init_once(&ei->vfs_inode); } @@ -1066,8 +1097,8 @@ static int bdev_try_to_free_page(struct super_block *sb, struct page *page, } #ifdef CONFIG_QUOTA -#define QTYPE2NAME(t) ((t) == USRQUOTA ? 
"user" : "group") -#define QTYPE2MOPT(on, t) ((t) == USRQUOTA?((on)##USRJQUOTA):((on)##GRPJQUOTA)) +static char *quotatypes[] = INITQFNAMES; +#define QTYPE2NAME(t) (quotatypes[t]) static int ext4_write_dquot(struct dquot *dquot); static int ext4_acquire_dquot(struct dquot *dquot); @@ -1100,6 +1131,7 @@ static const struct dquot_operations ext4_quota_operations = { .write_info = ext4_write_info, .alloc_dquot = dquot_alloc, .destroy_dquot = dquot_destroy, + .get_projid = ext4_get_projid, }; static const struct quotactl_ops ext4_qctl_operations = { @@ -2254,10 +2286,10 @@ static void ext4_orphan_cleanup(struct super_block *sb, __func__, inode->i_ino, inode->i_size); jbd_debug(2, "truncating inode %lu to %lld bytes\n", inode->i_ino, inode->i_size); - mutex_lock(&inode->i_mutex); + inode_lock(inode); truncate_inode_pages(inode->i_mapping, inode->i_size); ext4_truncate(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); nr_truncates++; } else { if (test_opt(sb, DEBUG)) @@ -2526,6 +2558,12 @@ static int ext4_feature_set_ok(struct super_block *sb, int readonly) "without CONFIG_QUOTA"); return 0; } + if (ext4_has_feature_project(sb) && !readonly) { + ext4_msg(sb, KERN_ERR, + "Filesystem with project quota feature cannot be mounted RDWR " + "without CONFIG_QUOTA"); + return 0; + } #endif /* CONFIG_QUOTA */ return 1; } @@ -3654,7 +3692,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) sb->s_qcop = &dquot_quotactl_sysfile_ops; else sb->s_qcop = &ext4_qctl_operations; - sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP; + sb->s_quota_types = QTYPE_MASK_USR | QTYPE_MASK_GRP | QTYPE_MASK_PRJ; #endif memcpy(sb->s_uuid, es->s_uuid, sizeof(es->s_uuid)); @@ -4790,6 +4828,48 @@ restore_opts: return err; } +#ifdef CONFIG_QUOTA +static int ext4_statfs_project(struct super_block *sb, + kprojid_t projid, struct kstatfs *buf) +{ + struct kqid qid; + struct dquot *dquot; + u64 limit; + u64 curblock; + + qid = make_kqid_projid(projid); + dquot = dqget(sb, qid); + if (IS_ERR(dquot)) + return PTR_ERR(dquot); + spin_lock(&dq_data_lock); + + limit = (dquot->dq_dqb.dqb_bsoftlimit ? + dquot->dq_dqb.dqb_bsoftlimit : + dquot->dq_dqb.dqb_bhardlimit) >> sb->s_blocksize_bits; + if (limit && buf->f_blocks > limit) { + curblock = dquot->dq_dqb.dqb_curspace >> sb->s_blocksize_bits; + buf->f_blocks = limit; + buf->f_bfree = buf->f_bavail = + (buf->f_blocks > curblock) ? + (buf->f_blocks - curblock) : 0; + } + + limit = dquot->dq_dqb.dqb_isoftlimit ? + dquot->dq_dqb.dqb_isoftlimit : + dquot->dq_dqb.dqb_ihardlimit; + if (limit && buf->f_files > limit) { + buf->f_files = limit; + buf->f_ffree = + (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? 
+ (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; + } + + spin_unlock(&dq_data_lock); + dqput(dquot); + return 0; +} +#endif + static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; @@ -4822,6 +4902,11 @@ static int ext4_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid.val[0] = fsid & 0xFFFFFFFFUL; buf->f_fsid.val[1] = (fsid >> 32) & 0xFFFFFFFFUL; +#ifdef CONFIG_QUOTA + if (ext4_test_inode_flag(dentry->d_inode, EXT4_INODE_PROJINHERIT) && + sb_has_quota_limits_enabled(sb, PRJQUOTA)) + ext4_statfs_project(sb, EXT4_I(dentry->d_inode)->i_projid, buf); +#endif return 0; } @@ -4986,7 +5071,8 @@ static int ext4_quota_enable(struct super_block *sb, int type, int format_id, struct inode *qf_inode; unsigned long qf_inums[EXT4_MAXQUOTAS] = { le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), - le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) + le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), + le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) }; BUG_ON(!ext4_has_feature_quota(sb)); @@ -5014,7 +5100,8 @@ static int ext4_enable_quotas(struct super_block *sb) int type, err = 0; unsigned long qf_inums[EXT4_MAXQUOTAS] = { le32_to_cpu(EXT4_SB(sb)->s_es->s_usr_quota_inum), - le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum) + le32_to_cpu(EXT4_SB(sb)->s_es->s_grp_quota_inum), + le32_to_cpu(EXT4_SB(sb)->s_es->s_prj_quota_inum) }; sb_dqopt(sb)->flags |= DQUOT_QUOTA_SYS_FILE; diff --git a/fs/ext4/truncate.h b/fs/ext4/truncate.h index 011ba6670d99..c70d06a383e2 100644 --- a/fs/ext4/truncate.h +++ b/fs/ext4/truncate.h @@ -10,8 +10,10 @@ */ static inline void ext4_truncate_failed_write(struct inode *inode) { + down_write(&EXT4_I(inode)->i_mmap_sem); truncate_inode_pages(inode->i_mapping, inode->i_size); ext4_truncate(inode); + up_write(&EXT4_I(inode)->i_mmap_sem); } /* diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index ac9e7c6aac74..5c06db17e41f 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -794,7 +794,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, return ret; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); if (start >= isize) @@ -860,7 +860,7 @@ out: if (ret == 1) ret = 0; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 18ddb1e5182a..ea272be62677 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -333,7 +333,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) loff_t isize; int err = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); if (offset >= isize) @@ -388,10 +388,10 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) found: if (whence == SEEK_HOLE && data_ofs > isize) data_ofs = isize; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return vfs_setpos(file, data_ofs, maxbytes); fail: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -ENXIO; } @@ -1219,7 +1219,7 @@ static long f2fs_fallocate(struct file *file, int mode, FALLOC_FL_INSERT_RANGE)) return -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (mode & FALLOC_FL_PUNCH_HOLE) { if (offset >= inode->i_size) @@ -1243,7 +1243,7 @@ static long f2fs_fallocate(struct file *file, int mode, } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); trace_f2fs_fallocate(inode, mode, offset, len, ret); return ret; @@ -1307,13 +1307,13 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) flags = 
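
To make ext4_statfs_project() above easier to follow: for a PROJINHERIT directory with project quota limits enabled, statfs() results are clamped so that df run inside the subtree reports the quota limit as the filesystem size. With 4 KiB blocks, a block soft limit of 262144 blocks (1 GiB), and 300000 blocks already charged, the function yields f_blocks = 262144 and f_bfree = f_bavail = 0, because usage already exceeds the limit. The clamping rule, reduced to one illustrative helper (not in the kernel) that covers both the block and the inode case:

	/* Clamp a (total, free) pair to a quota limit, mirroring the
	 * logic in ext4_statfs_project() above. 'limit' is the soft
	 * limit if set, else the hard limit; 0 means unlimited. */
	static void clamp_to_quota(u64 *total, u64 *free, u64 limit, u64 used)
	{
		if (limit && *total > limit) {
			*total = limit;
			*free = (*total > used) ? *total - used : 0;
		}
	}
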
f2fs_mask_flags(inode->i_mode, flags); - mutex_lock(&inode->i_mutex); + inode_lock(inode); oldflags = fi->i_flags; if ((flags ^ oldflags) & (FS_APPEND_FL | FS_IMMUTABLE_FL)) { if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ret = -EPERM; goto out; } @@ -1322,7 +1322,7 @@ static int f2fs_ioc_setflags(struct file *filp, unsigned long arg) flags = flags & FS_FL_USER_MODIFIABLE; flags |= oldflags & ~FS_FL_USER_MODIFIABLE; fi->i_flags = flags; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); f2fs_set_inode_flags(inode); inode->i_ctime = CURRENT_TIME; @@ -1667,7 +1667,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, f2fs_balance_fs(sbi, true); - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* writeback all dirty pages in the range */ err = filemap_write_and_wait_range(inode->i_mapping, range->start, @@ -1778,7 +1778,7 @@ do_map: clear_out: clear_inode_flag(F2FS_I(inode), FI_DO_DEFRAG); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!err) range->len = (u64)total << PAGE_CACHE_SHIFT; return err; diff --git a/fs/fat/dir.c b/fs/fat/dir.c index 7def96caec5f..d0b95c95079b 100644 --- a/fs/fat/dir.c +++ b/fs/fat/dir.c @@ -769,7 +769,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *file, buf.dirent = dirent; buf.result = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); buf.ctx.pos = file->f_pos; ret = -ENOENT; if (!IS_DEADDIR(inode)) { @@ -777,7 +777,7 @@ static int fat_ioctl_readdir(struct inode *inode, struct file *file, short_only, both ? &buf : NULL); file->f_pos = buf.ctx.pos; } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret >= 0) ret = buf.result; return ret; diff --git a/fs/fat/file.c b/fs/fat/file.c index 43d3475da83a..f70185668832 100644 --- a/fs/fat/file.c +++ b/fs/fat/file.c @@ -24,9 +24,9 @@ static int fat_ioctl_get_attributes(struct inode *inode, u32 __user *user_attr) { u32 attr; - mutex_lock(&inode->i_mutex); + inode_lock(inode); attr = fat_make_attrs(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return put_user(attr, user_attr); } @@ -47,7 +47,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) err = mnt_want_write_file(file); if (err) goto out; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * ATTR_VOLUME and ATTR_DIR cannot be changed; this also @@ -109,7 +109,7 @@ static int fat_ioctl_set_attributes(struct file *file, u32 __user *user_attr) fat_save_attrs(inode, attr); mark_inode_dirty(inode); out_unlock_inode: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mnt_drop_write_file(file); out: return err; @@ -246,7 +246,7 @@ static long fat_fallocate(struct file *file, int mode, if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (mode & FALLOC_FL_KEEP_SIZE) { ondisksize = inode->i_blocks << 9; if ((offset + len) <= ondisksize) @@ -272,7 +272,7 @@ static long fat_fallocate(struct file *file, int mode, } error: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } diff --git a/fs/filesystems.c b/fs/filesystems.c index 5797d45a78cb..c5618db110be 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -46,9 +46,9 @@ void put_filesystem(struct file_system_type *fs) static struct file_system_type **find_filesystem(const char *name, unsigned len) { struct file_system_type **p; - for (p=&file_systems; *p; p=&(*p)->next) - if (strlen((*p)->name) == len && - strncmp((*p)->name, name, len) == 0) + for (p = &file_systems; *p; p = 
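
Most of the churn in this section -- here in f2fs and fs/fat, and in nearly every filesystem below -- is the mechanical conversion of mutex_lock(&inode->i_mutex) to inode_lock(inode) and friends. The wrappers themselves are trivial; as of the 4.5-era include/linux/fs.h they are essentially the following, the point being that callers stop naming i_mutex directly so the lock's type can later change in one place:

	static inline void inode_lock(struct inode *inode)
	{
		mutex_lock(&inode->i_mutex);
	}

	static inline void inode_unlock(struct inode *inode)
	{
		mutex_unlock(&inode->i_mutex);
	}

	static inline int inode_trylock(struct inode *inode)
	{
		return mutex_trylock(&inode->i_mutex);
	}

	static inline int inode_is_locked(struct inode *inode)
	{
		return mutex_is_locked(&inode->i_mutex);
	}

	static inline void inode_lock_nested(struct inode *inode,
					     unsigned subclass)
	{
		mutex_lock_nested(&inode->i_mutex, subclass);
	}
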
&(*p)->next) + if (strncmp((*p)->name, name, len) == 0 && + !(*p)->name[len]) break; return p; } diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 712601f299b8..4b855b65d457 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -944,7 +944,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, if (!parent) return -ENOENT; - mutex_lock(&parent->i_mutex); + inode_lock(parent); if (!S_ISDIR(parent->i_mode)) goto unlock; @@ -962,7 +962,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, fuse_invalidate_entry(entry); if (child_nodeid != 0 && d_really_is_positive(entry)) { - mutex_lock(&d_inode(entry)->i_mutex); + inode_lock(d_inode(entry)); if (get_node_id(d_inode(entry)) != child_nodeid) { err = -ENOENT; goto badentry; @@ -983,7 +983,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, clear_nlink(d_inode(entry)); err = 0; badentry: - mutex_unlock(&d_inode(entry)->i_mutex); + inode_unlock(d_inode(entry)); if (!err) d_delete(entry); } else { @@ -992,7 +992,7 @@ int fuse_reverse_inval_entry(struct super_block *sb, u64 parent_nodeid, dput(entry); unlock: - mutex_unlock(&parent->i_mutex); + inode_unlock(parent); iput(parent); return err; } @@ -1504,7 +1504,7 @@ void fuse_set_nowrite(struct inode *inode) struct fuse_conn *fc = get_fuse_conn(inode); struct fuse_inode *fi = get_fuse_inode(inode); - BUG_ON(!mutex_is_locked(&inode->i_mutex)); + BUG_ON(!inode_is_locked(inode)); spin_lock(&fc->lock); BUG_ON(fi->writectr < 0); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index aa03aab6a24f..b03d253ece15 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -207,7 +207,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) return err; if (lock_inode) - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = fuse_do_open(fc, get_node_id(inode), file, isdir); @@ -215,7 +215,7 @@ int fuse_open_common(struct inode *inode, struct file *file, bool isdir) fuse_finish_open(inode, file); if (lock_inode) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } @@ -413,9 +413,9 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); fuse_sync_writes(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); req = fuse_get_req_nofail_nopages(fc, file); memset(&inarg, 0, sizeof(inarg)); @@ -450,7 +450,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, if (is_bad_inode(inode)) return -EIO; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * Start writeback against all dirty pages of the inode, then @@ -486,7 +486,7 @@ int fuse_fsync_common(struct file *file, loff_t start, loff_t end, err = 0; } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } @@ -1160,7 +1160,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) return generic_file_write_iter(iocb, from); } - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* We can write back this queue in page reclaim */ current->backing_dev_info = inode_to_bdi(inode); @@ -1210,7 +1210,7 @@ static ssize_t fuse_file_write_iter(struct kiocb *iocb, struct iov_iter *from) } out: current->backing_dev_info = NULL; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return written ? 
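
The find_filesystem() rewrite just above drops the strlen() call: instead of comparing lengths and then prefixes, it prefix-compares len bytes and then checks that the registered name ends exactly there. The two forms accept the same names, since a registered name shorter than len makes strncmp() stop at its NUL and return non-zero. A standalone rendering of the new predicate (helper name ours):

	/* True iff 'candidate' (NUL-terminated) equals exactly the first
	 * 'len' bytes of 'name' -- the test used by find_filesystem(). */
	static bool fs_name_matches(const char *candidate,
				    const char *name, unsigned len)
	{
		return strncmp(candidate, name, len) == 0 &&
		       candidate[len] == '\0';
	}
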
written : err; } @@ -1322,10 +1322,10 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { if (!write) - mutex_lock(&inode->i_mutex); + inode_lock(inode); fuse_sync_writes(inode); if (!write) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } while (count) { @@ -1413,14 +1413,14 @@ static ssize_t fuse_direct_write_iter(struct kiocb *iocb, struct iov_iter *from) return -EIO; /* Don't allow parallel writes to the same file */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); res = generic_write_checks(iocb, from); if (res > 0) res = fuse_direct_io(&io, from, &iocb->ki_pos, FUSE_DIO_WRITE); fuse_invalidate_attr(inode); if (res > 0) fuse_write_update_size(inode, iocb->ki_pos); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return res; } @@ -2287,17 +2287,17 @@ static loff_t fuse_file_llseek(struct file *file, loff_t offset, int whence) retval = generic_file_llseek(file, offset, whence); break; case SEEK_END: - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = fuse_update_attributes(inode, NULL, file, NULL); if (!retval) retval = generic_file_llseek(file, offset, whence); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); break; case SEEK_HOLE: case SEEK_DATA: - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = fuse_lseek(file, offset, whence); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); break; default: retval = -EINVAL; @@ -2944,7 +2944,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, return -EOPNOTSUPP; if (lock_inode) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (mode & FALLOC_FL_PUNCH_HOLE) { loff_t endbyte = offset + length - 1; err = filemap_write_and_wait_range(inode->i_mapping, @@ -2990,7 +2990,7 @@ out: clear_bit(FUSE_I_SIZE_UNSTABLE, &fi->state); if (lock_inode) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index 7412863cda1e..c9384f932975 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -914,7 +914,7 @@ static long gfs2_fallocate(struct file *file, int mode, loff_t offset, loff_t le if ((mode & ~FALLOC_FL_KEEP_SIZE) || gfs2_is_jdata(ip)) return -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); + inode_lock(inode); gfs2_holder_init(ip->i_gl, LM_ST_EXCLUSIVE, 0, &gh); ret = gfs2_glock_nq(&gh); @@ -946,7 +946,7 @@ out_unlock: gfs2_glock_dq(&gh); out_uninit: gfs2_holder_uninit(&gh); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 3e94400d587c..352f958769e1 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -2067,7 +2067,7 @@ static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_SHARED, 0, &gh); if (ret) @@ -2094,7 +2094,7 @@ static int gfs2_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, gfs2_glock_dq_uninit(&gh); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c index be6d9c450b22..a39891344259 100644 --- a/fs/gfs2/quota.c +++ b/fs/gfs2/quota.c @@ -888,7 +888,7 @@ static int do_sync(unsigned int num_qd, struct gfs2_quota_data **qda) return -ENOMEM; sort(qda, num_qd, sizeof(struct gfs2_quota_data *), sort_qd, NULL); - mutex_lock(&ip->i_inode.i_mutex); + inode_lock(&ip->i_inode); for (qx = 0; qx < num_qd; qx++) { error = 
gfs2_glock_nq_init(qda[qx]->qd_gl, LM_ST_EXCLUSIVE, GL_NOCACHE, &ghs[qx]); @@ -953,7 +953,7 @@ out_alloc: out: while (qx--) gfs2_glock_dq_uninit(&ghs[qx]); - mutex_unlock(&ip->i_inode.i_mutex); + inode_unlock(&ip->i_inode); kfree(ghs); gfs2_log_flush(ip->i_gl->gl_name.ln_sbd, ip->i_gl, NORMAL_FLUSH); return error; @@ -1674,7 +1674,7 @@ static int gfs2_set_dqblk(struct super_block *sb, struct kqid qid, if (error) goto out_put; - mutex_lock(&ip->i_inode.i_mutex); + inode_lock(&ip->i_inode); error = gfs2_glock_nq_init(qd->qd_gl, LM_ST_EXCLUSIVE, 0, &q_gh); if (error) goto out_unlockput; @@ -1739,7 +1739,7 @@ out_i: out_q: gfs2_glock_dq_uninit(&q_gh); out_unlockput: - mutex_unlock(&ip->i_inode.i_mutex); + inode_unlock(&ip->i_inode); out_put: qd_put(qd); return error; diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index 70788e03820a..e9f2b855f831 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -173,9 +173,9 @@ static int hfs_dir_release(struct inode *inode, struct file *file) { struct hfs_readdir_data *rd = file->private_data; if (rd) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); list_del(&rd->list); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); kfree(rd); } return 0; diff --git a/fs/hfs/inode.c b/fs/hfs/inode.c index b99ebddb10cb..6686bf39a5b5 100644 --- a/fs/hfs/inode.c +++ b/fs/hfs/inode.c @@ -570,13 +570,13 @@ static int hfs_file_release(struct inode *inode, struct file *file) if (HFS_IS_RSRC(inode)) inode = HFS_I(inode)->rsrc_inode; if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); hfs_file_truncate(inode); //if (inode->i_flags & S_DEAD) { // hfs_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL); // hfs_delete_inode(inode); //} - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; } @@ -656,7 +656,7 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end, ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* sync the inode to buffers */ ret = write_inode_now(inode, 0); @@ -668,7 +668,7 @@ static int hfs_file_fsync(struct file *filp, loff_t start, loff_t end, err = sync_blockdev(sb->s_bdev); if (!ret) ret = err; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index d0f39dcbb58e..a4e867e08947 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -284,9 +284,9 @@ static int hfsplus_dir_release(struct inode *inode, struct file *file) { struct hfsplus_readdir_data *rd = file->private_data; if (rd) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); list_del(&rd->list); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); kfree(rd); } return 0; diff --git a/fs/hfsplus/inode.c b/fs/hfsplus/inode.c index 19b33f8151f1..1a6394cdb54e 100644 --- a/fs/hfsplus/inode.c +++ b/fs/hfsplus/inode.c @@ -229,14 +229,14 @@ static int hfsplus_file_release(struct inode *inode, struct file *file) if (HFSPLUS_IS_RSRC(inode)) inode = HFSPLUS_I(inode)->rsrc_inode; if (atomic_dec_and_test(&HFSPLUS_I(inode)->opencnt)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); hfsplus_file_truncate(inode); if (inode->i_flags & S_DEAD) { hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb)->hidden_dir, NULL); hfsplus_delete_inode(inode); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; } @@ -286,7 +286,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, error = filemap_write_and_wait_range(inode->i_mapping, start, end); if 
(error) return error; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * Sync inode metadata into the catalog and extent trees. @@ -327,7 +327,7 @@ int hfsplus_file_fsync(struct file *file, loff_t start, loff_t end, if (!test_bit(HFSPLUS_SB_NOBARRIER, &sbi->flags)) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } diff --git a/fs/hfsplus/ioctl.c b/fs/hfsplus/ioctl.c index 0624ce4e0702..32a49e292b6a 100644 --- a/fs/hfsplus/ioctl.c +++ b/fs/hfsplus/ioctl.c @@ -93,7 +93,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) goto out_drop_write; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); if ((flags & (FS_IMMUTABLE_FL|FS_APPEND_FL)) || inode->i_flags & (S_IMMUTABLE|S_APPEND)) { @@ -126,7 +126,7 @@ static int hfsplus_ioctl_setflags(struct file *file, int __user *user_flags) mark_inode_dirty(inode); out_unlock_inode: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out_drop_write: mnt_drop_write_file(file); out: diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index cfaa18c7a337..d1abbee281d1 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -378,9 +378,9 @@ static int hostfs_fsync(struct file *file, loff_t start, loff_t end, if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = fsync_file(HOSTFS_I(inode)->fd, datasync); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/hpfs/dir.c b/fs/hpfs/dir.c index dc540bfcee1d..e57a53c13d86 100644 --- a/fs/hpfs/dir.c +++ b/fs/hpfs/dir.c @@ -33,7 +33,7 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence) if (whence == SEEK_DATA || whence == SEEK_HOLE) return -EINVAL; - mutex_lock(&i->i_mutex); + inode_lock(i); hpfs_lock(s); /*pr_info("dir lseek\n");*/ @@ -48,12 +48,12 @@ static loff_t hpfs_dir_lseek(struct file *filp, loff_t off, int whence) ok: filp->f_pos = new_off; hpfs_unlock(s); - mutex_unlock(&i->i_mutex); + inode_unlock(i); return new_off; fail: /*pr_warn("illegal lseek: %016llx\n", new_off);*/ hpfs_unlock(s); - mutex_unlock(&i->i_mutex); + inode_unlock(i); return -ESPIPE; } diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 8bbf7f3e2a27..e1f465a389d5 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -141,7 +141,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vma_len = (loff_t)(vma->vm_end - vma->vm_start); - mutex_lock(&inode->i_mutex); + inode_lock(inode); file_accessed(file); ret = -ENOMEM; @@ -157,7 +157,7 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) if (vma->vm_flags & VM_WRITE && inode->i_size < len) inode->i_size = len; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -530,7 +530,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (hole_end > hole_start) { struct address_space *mapping = inode->i_mapping; - mutex_lock(&inode->i_mutex); + inode_lock(inode); i_mmap_lock_write(mapping); if (!RB_EMPTY_ROOT(&mapping->i_mmap)) hugetlb_vmdelete_list(&mapping->i_mmap, @@ -538,7 +538,7 @@ static long hugetlbfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) hole_end >> PAGE_SHIFT); i_mmap_unlock_write(mapping); remove_inode_hugepages(inode, hole_start, hole_end); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; @@ -572,7 +572,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, start = offset 
>> hpage_shift; end = (offset + len + hpage_size - 1) >> hpage_shift; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* We need to check rlimit even when FALLOC_FL_KEEP_SIZE */ error = inode_newsize_ok(inode, offset + len); @@ -659,7 +659,7 @@ static long hugetlbfs_fallocate(struct file *file, int mode, loff_t offset, i_size_write(inode, offset + len); inode->i_ctime = CURRENT_TIME; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } diff --git a/fs/inode.c b/fs/inode.c index e491e54d2430..9f62db3bcc3e 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -495,7 +495,7 @@ void clear_inode(struct inode *inode) */ spin_lock_irq(&inode->i_data.tree_lock); BUG_ON(inode->i_data.nrpages); - BUG_ON(inode->i_data.nrshadows); + BUG_ON(inode->i_data.nrexceptional); spin_unlock_irq(&inode->i_data.tree_lock); BUG_ON(!list_empty(&inode->i_data.private_list)); BUG_ON(!(inode->i_state & I_FREEING)); @@ -966,9 +966,9 @@ void lock_two_nondirectories(struct inode *inode1, struct inode *inode2) swap(inode1, inode2); if (inode1 && !S_ISDIR(inode1->i_mode)) - mutex_lock(&inode1->i_mutex); + inode_lock(inode1); if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) - mutex_lock_nested(&inode2->i_mutex, I_MUTEX_NONDIR2); + inode_lock_nested(inode2, I_MUTEX_NONDIR2); } EXPORT_SYMBOL(lock_two_nondirectories); @@ -980,9 +980,9 @@ EXPORT_SYMBOL(lock_two_nondirectories); void unlock_two_nondirectories(struct inode *inode1, struct inode *inode2) { if (inode1 && !S_ISDIR(inode1->i_mode)) - mutex_unlock(&inode1->i_mutex); + inode_unlock(inode1); if (inode2 && !S_ISDIR(inode2->i_mode) && inode2 != inode1) - mutex_unlock(&inode2->i_mutex); + inode_unlock(inode2); } EXPORT_SYMBOL(unlock_two_nondirectories); diff --git a/fs/ioctl.c b/fs/ioctl.c index 29466c380958..116a333e9c77 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -434,9 +434,9 @@ int generic_block_fiemap(struct inode *inode, u64 len, get_block_t *get_block) { int ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = __generic_block_fiemap(inode, fieinfo, start, len, get_block); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } EXPORT_SYMBOL(generic_block_fiemap); diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c index a3750f902adc..0ae91ad6df2d 100644 --- a/fs/jffs2/build.c +++ b/fs/jffs2/build.c @@ -17,6 +17,7 @@ #include <linux/slab.h> #include <linux/vmalloc.h> #include <linux/mtd/mtd.h> +#include <linux/mm.h> /* kvfree() */ #include "nodelist.h" static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *, @@ -383,12 +384,7 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c) return 0; out_free: -#ifndef __ECOS - if (jffs2_blocks_use_vmalloc(c)) - vfree(c->blocks); - else -#endif - kfree(c->blocks); + kvfree(c->blocks); return ret; } diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index f509f62e12f6..c5ac5944bc1b 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -39,10 +39,10 @@ int jffs2_fsync(struct file *filp, loff_t start, loff_t end, int datasync) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Trigger GC to flush any pending writes for this inode */ jffs2_flush_wbuf_gc(c, inode->i_ino); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return 0; } diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 2caf1682036d..bead25ae8fe4 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -596,10 +596,7 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) out_root: jffs2_free_ino_caches(c); jffs2_free_raw_node_refs(c); - if (jffs2_blocks_use_vmalloc(c)) - 
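
Worth noting in the fs/inode.c hunk above: lock_two_nondirectories() makes two-inode locking deadlock-free by ordering on the inodes' addresses -- the swap() locks the lower-addressed inode first -- and by tagging the second acquisition I_MUTEX_NONDIR2 so lockdep accepts the nesting. Stripped of the S_ISDIR checks, the pattern is (a sketch, not kernel code):

	/* Lock two regular-file inodes in a globally consistent order;
	 * 'a' and 'b' may arrive in either order or be the same inode. */
	static void lock_pair(struct inode *a, struct inode *b)
	{
		if (a > b)
			swap(a, b);		/* lower address first */
		inode_lock(a);
		if (b != a)
			inode_lock_nested(b, I_MUTEX_NONDIR2);
	}
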
vfree(c->blocks); - else - kfree(c->blocks); + kvfree(c->blocks); out_inohash: jffs2_clear_xattr_subsystem(c); kfree(c->inocache_list); diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index bb080c272149..0a9a114bb9d1 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -331,10 +331,7 @@ static void jffs2_put_super (struct super_block *sb) jffs2_free_ino_caches(c); jffs2_free_raw_node_refs(c); - if (jffs2_blocks_use_vmalloc(c)) - vfree(c->blocks); - else - kfree(c->blocks); + kvfree(c->blocks); jffs2_flash_cleanup(c); kfree(c->inocache_list); jffs2_clear_xattr_subsystem(c); diff --git a/fs/jfs/file.c b/fs/jfs/file.c index 0e026a7bdcd4..4ce7735dd042 100644 --- a/fs/jfs/file.c +++ b/fs/jfs/file.c @@ -38,17 +38,17 @@ int jfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (rc) return rc; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (!(inode->i_state & I_DIRTY_ALL) || (datasync && !(inode->i_state & I_DIRTY_DATASYNC))) { /* Make sure committed changes hit the disk */ jfs_flush_journal(JFS_SBI(inode->i_sb)->log, 1); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return rc; } rc |= jfs_commit_inode(inode, 1); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return rc ? -EIO : 0; } diff --git a/fs/jfs/ioctl.c b/fs/jfs/ioctl.c index 8db8b7d61e40..8653cac7e12e 100644 --- a/fs/jfs/ioctl.c +++ b/fs/jfs/ioctl.c @@ -96,7 +96,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) } /* Lock against other parallel changes of flags */ - mutex_lock(&inode->i_mutex); + inode_lock(inode); jfs_get_inode_flags(jfs_inode); oldflags = jfs_inode->mode2; @@ -109,7 +109,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) ((flags ^ oldflags) & (JFS_APPEND_FL | JFS_IMMUTABLE_FL))) { if (!capable(CAP_LINUX_IMMUTABLE)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); err = -EPERM; goto setflags_out; } @@ -120,7 +120,7 @@ long jfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) jfs_inode->mode2 = flags; jfs_set_inode_flags(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); setflags_out: diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 900925b5eb8c..4f5d85ba8e23 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -792,7 +792,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type, struct buffer_head tmp_bh; struct buffer_head *bh; - mutex_lock(&inode->i_mutex); + inode_lock(inode); while (towrite > 0) { tocopy = sb->s_blocksize - offset < towrite ? 
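
The jffs2 hunks above (build.c, fs.c and super.c) are one cleanup repeated three times: c->blocks may come from either vmalloc() or kmalloc() depending on the flash size, and kvfree() frees either kind, so the jffs2_blocks_use_vmalloc() branch at every free site disappears. kvfree() itself (mm/util.c) is essentially:

	void kvfree(const void *addr)
	{
		if (is_vmalloc_addr(addr))
			vfree(addr);
		else
			kfree(addr);
	}
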
sb->s_blocksize - offset : towrite; @@ -824,7 +824,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type, } out: if (len == towrite) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } if (inode->i_size < off+len-towrite) @@ -832,7 +832,7 @@ out: inode->i_version++; inode->i_mtime = inode->i_ctime = CURRENT_TIME; mark_inode_dirty(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return len - towrite; } diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 821973853340..996b7742c90b 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -1511,9 +1511,9 @@ static loff_t kernfs_dir_fop_llseek(struct file *file, loff_t offset, struct inode *inode = file_inode(file); loff_t ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = generic_file_llseek(file, offset, whence); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/libfs.c b/fs/libfs.c index 01491299f348..0ca80b2af420 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -89,7 +89,7 @@ EXPORT_SYMBOL(dcache_dir_close); loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) { struct dentry *dentry = file->f_path.dentry; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); switch (whence) { case 1: offset += file->f_pos; @@ -97,7 +97,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) if (offset >= 0) break; default: - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return -EINVAL; } if (offset != file->f_pos) { @@ -124,7 +124,7 @@ loff_t dcache_dir_lseek(struct file *file, loff_t offset, int whence) spin_unlock(&dentry->d_lock); } } - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return offset; } EXPORT_SYMBOL(dcache_dir_lseek); @@ -941,7 +941,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end, if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = sync_mapping_buffers(inode->i_mapping); if (!(inode->i_state & I_DIRTY_ALL)) goto out; @@ -953,7 +953,7 @@ int __generic_file_fsync(struct file *file, loff_t start, loff_t end, ret = err; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } EXPORT_SYMBOL(__generic_file_fsync); diff --git a/fs/locks.c b/fs/locks.c index af1ed74a657f..7c5f91be9b65 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1650,12 +1650,12 @@ generic_add_lease(struct file *filp, long arg, struct file_lock **flp, void **pr * bother, maybe that's a sign this just isn't a good file to * hand out a delegation on. 
*/ - if (is_deleg && !mutex_trylock(&inode->i_mutex)) + if (is_deleg && !inode_trylock(inode)) return -EAGAIN; if (is_deleg && arg == F_WRLCK) { /* Write delegations are not currently supported: */ - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); WARN_ON_ONCE(1); return -EINVAL; } @@ -1732,7 +1732,7 @@ out: spin_unlock(&ctx->flc_lock); locks_dispose_list(&dispose); if (is_deleg) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!error && !my_fl) *flp = NULL; return error; diff --git a/fs/logfs/file.c b/fs/logfs/file.c index 1a6f0167b16a..61eaeb1b6cac 100644 --- a/fs/logfs/file.c +++ b/fs/logfs/file.c @@ -204,12 +204,12 @@ long logfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg) if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); oldflags = li->li_flags; flags &= LOGFS_FL_USER_MODIFIABLE; flags |= oldflags & ~LOGFS_FL_USER_MODIFIABLE; li->li_flags = flags; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); inode->i_ctime = CURRENT_TIME; mark_inode_dirty_sync(inode); @@ -230,11 +230,11 @@ int logfs_fsync(struct file *file, loff_t start, loff_t end, int datasync) if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); logfs_get_wblocks(sb, NULL, WF_LOCK); logfs_write_anchor(sb); logfs_put_wblocks(sb, NULL, WF_LOCK); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return 0; } diff --git a/fs/namei.c b/fs/namei.c index bceefd5588a2..f624d132e01e 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1629,9 +1629,9 @@ static int lookup_slow(struct nameidata *nd, struct path *path) parent = nd->path.dentry; BUG_ON(nd->inode != parent->d_inode); - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); dentry = __lookup_hash(&nd->last, parent, nd->flags); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); if (IS_ERR(dentry)) return PTR_ERR(dentry); path->mnt = nd->path.mnt; @@ -2229,10 +2229,10 @@ struct dentry *kern_path_locked(const char *name, struct path *path) putname(filename); return ERR_PTR(-EINVAL); } - mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); d = __lookup_hash(&last, path->dentry, 0); if (IS_ERR(d)) { - mutex_unlock(&path->dentry->d_inode->i_mutex); + inode_unlock(path->dentry->d_inode); path_put(path); } putname(filename); @@ -2282,7 +2282,7 @@ struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) unsigned int c; int err; - WARN_ON_ONCE(!mutex_is_locked(&base->d_inode->i_mutex)); + WARN_ON_ONCE(!inode_is_locked(base->d_inode)); this.name = name; this.len = len; @@ -2380,9 +2380,9 @@ struct dentry *lookup_one_len_unlocked(const char *name, if (ret) return ret; - mutex_lock(&base->d_inode->i_mutex); + inode_lock(base->d_inode); ret = __lookup_hash(&this, base, 0); - mutex_unlock(&base->d_inode->i_mutex); + inode_unlock(base->d_inode); return ret; } EXPORT_SYMBOL(lookup_one_len_unlocked); @@ -2463,7 +2463,7 @@ mountpoint_last(struct nameidata *nd, struct path *path) goto done; } - mutex_lock(&dir->d_inode->i_mutex); + inode_lock(dir->d_inode); dentry = d_lookup(dir, &nd->last); if (!dentry) { /* @@ -2473,16 +2473,16 @@ mountpoint_last(struct nameidata *nd, struct path *path) */ dentry = d_alloc(dir, &nd->last); if (!dentry) { - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); return -ENOMEM; } dentry = lookup_real(dir->d_inode, dentry, nd->flags); if (IS_ERR(dentry)) { - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); return 
PTR_ERR(dentry); } } - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); done: if (d_is_negative(dentry)) { @@ -2672,7 +2672,7 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) struct dentry *p; if (p1 == p2) { - mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); return NULL; } @@ -2680,29 +2680,29 @@ struct dentry *lock_rename(struct dentry *p1, struct dentry *p2) p = d_ancestor(p2, p1); if (p) { - mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(p2->d_inode, I_MUTEX_PARENT); + inode_lock_nested(p1->d_inode, I_MUTEX_CHILD); return p; } p = d_ancestor(p1, p2); if (p) { - mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); + inode_lock_nested(p2->d_inode, I_MUTEX_CHILD); return p; } - mutex_lock_nested(&p1->d_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&p2->d_inode->i_mutex, I_MUTEX_PARENT2); + inode_lock_nested(p1->d_inode, I_MUTEX_PARENT); + inode_lock_nested(p2->d_inode, I_MUTEX_PARENT2); return NULL; } EXPORT_SYMBOL(lock_rename); void unlock_rename(struct dentry *p1, struct dentry *p2) { - mutex_unlock(&p1->d_inode->i_mutex); + inode_unlock(p1->d_inode); if (p1 != p2) { - mutex_unlock(&p2->d_inode->i_mutex); + inode_unlock(p2->d_inode); mutex_unlock(&p1->d_inode->i_sb->s_vfs_rename_mutex); } } @@ -3141,9 +3141,9 @@ retry_lookup: * dropping this one anyway. */ } - mutex_lock(&dir->d_inode->i_mutex); + inode_lock(dir->d_inode); error = lookup_open(nd, &path, file, op, got_write, opened); - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); if (error <= 0) { if (error) @@ -3489,7 +3489,7 @@ static struct dentry *filename_create(int dfd, struct filename *name, * Do the final lookup. 
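
The lock_rename() conversion above preserves the helper's ordering contract: if one parent directory is an ancestor of the other, the ancestor is locked first (I_MUTEX_PARENT) and the descendant second (I_MUTEX_CHILD); unrelated parents are both locked as parents under s_vfs_rename_mutex. The dentry it returns is the ancestor, which callers use as a trap: a rename whose source or target resolves to the trap would move a directory into itself. A sketch of the caller pattern (not a verbatim kernel excerpt):

	struct dentry *trap;

	trap = lock_rename(old_parent, new_parent);
	/* ... look up old_dentry and new_dentry under the locks ... */
	if (old_dentry == trap || new_dentry == trap)
		error = -EINVAL;	/* directory into itself */
	else
		error = vfs_rename(d_inode(old_parent), old_dentry,
				   d_inode(new_parent), new_dentry,
				   NULL, 0);
	unlock_rename(old_parent, new_parent);
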
*/ lookup_flags |= LOOKUP_CREATE | LOOKUP_EXCL; - mutex_lock_nested(&path->dentry->d_inode->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); dentry = __lookup_hash(&last, path->dentry, lookup_flags); if (IS_ERR(dentry)) goto unlock; @@ -3518,7 +3518,7 @@ fail: dput(dentry); dentry = ERR_PTR(error); unlock: - mutex_unlock(&path->dentry->d_inode->i_mutex); + inode_unlock(path->dentry->d_inode); if (!err2) mnt_drop_write(path->mnt); out: @@ -3538,7 +3538,7 @@ EXPORT_SYMBOL(kern_path_create); void done_path_create(struct path *path, struct dentry *dentry) { dput(dentry); - mutex_unlock(&path->dentry->d_inode->i_mutex); + inode_unlock(path->dentry->d_inode); mnt_drop_write(path->mnt); path_put(path); } @@ -3735,7 +3735,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) return -EPERM; dget(dentry); - mutex_lock(&dentry->d_inode->i_mutex); + inode_lock(dentry->d_inode); error = -EBUSY; if (is_local_mountpoint(dentry)) @@ -3755,7 +3755,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) detach_mounts(dentry); out: - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); dput(dentry); if (!error) d_delete(dentry); @@ -3794,7 +3794,7 @@ retry: if (error) goto exit1; - mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); dentry = __lookup_hash(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (IS_ERR(dentry)) @@ -3810,7 +3810,7 @@ retry: exit3: dput(dentry); exit2: - mutex_unlock(&path.dentry->d_inode->i_mutex); + inode_unlock(path.dentry->d_inode); mnt_drop_write(path.mnt); exit1: path_put(&path); @@ -3856,7 +3856,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate if (!dir->i_op->unlink) return -EPERM; - mutex_lock(&target->i_mutex); + inode_lock(target); if (is_local_mountpoint(dentry)) error = -EBUSY; else { @@ -3873,7 +3873,7 @@ int vfs_unlink(struct inode *dir, struct dentry *dentry, struct inode **delegate } } out: - mutex_unlock(&target->i_mutex); + inode_unlock(target); /* We don't d_delete() NFS sillyrenamed files--they still exist. 
*/ if (!error && !(dentry->d_flags & DCACHE_NFSFS_RENAMED)) { @@ -3916,7 +3916,7 @@ retry: if (error) goto exit1; retry_deleg: - mutex_lock_nested(&path.dentry->d_inode->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(path.dentry->d_inode, I_MUTEX_PARENT); dentry = __lookup_hash(&last, path.dentry, lookup_flags); error = PTR_ERR(dentry); if (!IS_ERR(dentry)) { @@ -3934,7 +3934,7 @@ retry_deleg: exit2: dput(dentry); } - mutex_unlock(&path.dentry->d_inode->i_mutex); + inode_unlock(path.dentry->d_inode); if (inode) iput(inode); /* truncate the inode here */ inode = NULL; @@ -4086,7 +4086,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de if (error) return error; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Make sure we don't allow creating hardlink to an unlinked file */ if (inode->i_nlink == 0 && !(inode->i_state & I_LINKABLE)) error = -ENOENT; @@ -4103,7 +4103,7 @@ int vfs_link(struct dentry *old_dentry, struct inode *dir, struct dentry *new_de inode->i_state &= ~I_LINKABLE; spin_unlock(&inode->i_lock); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!error) fsnotify_link(dir, inode, new_dentry); return error; @@ -4303,7 +4303,7 @@ int vfs_rename(struct inode *old_dir, struct dentry *old_dentry, if (!is_dir || (flags & RENAME_EXCHANGE)) lock_two_nondirectories(source, target); else if (target) - mutex_lock(&target->i_mutex); + inode_lock(target); error = -EBUSY; if (is_local_mountpoint(old_dentry) || is_local_mountpoint(new_dentry)) @@ -4356,7 +4356,7 @@ out: if (!is_dir || (flags & RENAME_EXCHANGE)) unlock_two_nondirectories(source, target); else if (target) - mutex_unlock(&target->i_mutex); + inode_unlock(target); dput(new_dentry); if (!error) { fsnotify_move(old_dir, new_dir, old_name, is_dir, diff --git a/fs/namespace.c b/fs/namespace.c index a830e1463704..4fb1691b4355 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1961,9 +1961,9 @@ static struct mountpoint *lock_mount(struct path *path) struct vfsmount *mnt; struct dentry *dentry = path->dentry; retry: - mutex_lock(&dentry->d_inode->i_mutex); + inode_lock(dentry->d_inode); if (unlikely(cant_mount(dentry))) { - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); return ERR_PTR(-ENOENT); } namespace_lock(); @@ -1974,13 +1974,13 @@ retry: mp = new_mountpoint(dentry); if (IS_ERR(mp)) { namespace_unlock(); - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); return mp; } return mp; } namespace_unlock(); - mutex_unlock(&path->dentry->d_inode->i_mutex); + inode_unlock(path->dentry->d_inode); path_put(path); path->mnt = mnt; dentry = path->dentry = dget(mnt->mnt_root); @@ -1992,7 +1992,7 @@ static void unlock_mount(struct mountpoint *where) struct dentry *dentry = where->m_dentry; put_mountpoint(where); namespace_unlock(); - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); } static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) diff --git a/fs/ncpfs/dir.c b/fs/ncpfs/dir.c index f0e3e9e747dd..26c2de2de13f 100644 --- a/fs/ncpfs/dir.c +++ b/fs/ncpfs/dir.c @@ -369,7 +369,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags) if (!res) { struct inode *inode = d_inode(dentry); - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (finfo.i.dirEntNum == NCP_FINFO(inode)->dirEntNum) { ncp_new_dentry(dentry); val=1; @@ -377,7 +377,7 @@ ncp_lookup_validate(struct dentry *dentry, unsigned int flags) ncp_dbg(2, "found, but dirEntNum changed\n"); ncp_update_inode2(inode, &finfo); - 
mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } finished: @@ -639,9 +639,9 @@ ncp_fill_cache(struct file *file, struct dir_context *ctx, } else { struct inode *inode = d_inode(newdent); - mutex_lock_nested(&inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(inode, I_MUTEX_CHILD); ncp_update_inode2(inode, entry); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } if (ctl.idx >= NCP_DIRCACHE_SIZE) { diff --git a/fs/ncpfs/file.c b/fs/ncpfs/file.c index 011324ce9df2..dd38ca1f2ecb 100644 --- a/fs/ncpfs/file.c +++ b/fs/ncpfs/file.c @@ -224,10 +224,10 @@ ncp_file_write_iter(struct kiocb *iocb, struct iov_iter *from) iocb->ki_pos = pos; if (pos > i_size_read(inode)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (pos > i_size_read(inode)) i_size_write(inode, pos); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } ncp_dbg(1, "exit %pD2\n", file); outrel: diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index c82a21228a34..9cce67043f92 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -940,7 +940,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) dfprintk(FILE, "NFS: llseek dir(%pD2, %lld, %d)\n", filp, offset, whence); - mutex_lock(&inode->i_mutex); + inode_lock(inode); switch (whence) { case 1: offset += filp->f_pos; @@ -957,7 +957,7 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence) dir_ctx->duped = 0; } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return offset; } @@ -972,9 +972,9 @@ static int nfs_fsync_dir(struct file *filp, loff_t start, loff_t end, dfprintk(FILE, "NFS: fsync dir(%pD2) datasync %d\n", filp, datasync); - mutex_lock(&inode->i_mutex); + inode_lock(inode); nfs_inc_stats(inode, NFSIOS_VFSFSYNC); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return 0; } diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 7ab7ec9f4eed..7a0cfd3266e5 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -580,7 +580,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, if (!count) goto out; - mutex_lock(&inode->i_mutex); + inode_lock(inode); result = nfs_sync_mapping(mapping); if (result) goto out_unlock; @@ -608,7 +608,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, NFS_I(inode)->read_io += count; result = nfs_direct_read_schedule_iovec(dreq, iter, pos); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!result) { result = nfs_direct_wait(dreq); @@ -622,7 +622,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, out_release: nfs_direct_req_release(dreq); out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out: return result; } @@ -1005,7 +1005,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) pos = iocb->ki_pos; end = (pos + iov_iter_count(iter) - 1) >> PAGE_CACHE_SHIFT; - mutex_lock(&inode->i_mutex); + inode_lock(inode); result = nfs_sync_mapping(mapping); if (result) @@ -1045,7 +1045,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) pos >> PAGE_CACHE_SHIFT, end); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!result) { result = nfs_direct_wait(dreq); @@ -1066,7 +1066,7 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter) out_release: nfs_direct_req_release(dreq); out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return result; } diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 4ef8f5addcad..748bb813b8ec 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -278,9 +278,9 @@ 
nfs_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret != 0) break; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = nfs_file_fsync_commit(file, start, end, datasync); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * If nfs_file_fsync_commit detected a server reboot, then * resend all dirty pages that might have been covered by diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index bb1f4e7a3270..3384dc8e6683 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -971,7 +971,7 @@ filelayout_mark_request_commit(struct nfs_page *req, u32 i, j; if (fl->commit_through_mds) { - nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo); + nfs_request_add_commit_list(req, cinfo); } else { /* Note that we are calling nfs4_fl_calc_j_index on each page * that ends up being committed to a data server. An attractive diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 6594e9f903a0..5bcd92d50e82 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1948,11 +1948,9 @@ ff_layout_encode_layoutreturn(struct pnfs_layout_hdr *lo, start = xdr_reserve_space(xdr, 4); BUG_ON(!start); - if (ff_layout_encode_ioerr(flo, xdr, args)) - goto out; - + ff_layout_encode_ioerr(flo, xdr, args); ff_layout_encode_iostats(flo, xdr, args); -out: + *start = cpu_to_be32((xdr->p - start - 1) * 4); dprintk("%s: Return\n", __func__); } diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index bd0327541366..29898a9550fa 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -218,63 +218,55 @@ static void extend_ds_error(struct nfs4_ff_layout_ds_err *err, err->length = end - err->offset; } -static bool ds_error_can_merge(struct nfs4_ff_layout_ds_err *err, u64 offset, - u64 length, int status, enum nfs_opnum4 opnum, - nfs4_stateid *stateid, - struct nfs4_deviceid *deviceid) +static int +ff_ds_error_match(const struct nfs4_ff_layout_ds_err *e1, + const struct nfs4_ff_layout_ds_err *e2) { - return err->status == status && err->opnum == opnum && - nfs4_stateid_match(&err->stateid, stateid) && - !memcmp(&err->deviceid, deviceid, sizeof(*deviceid)) && - end_offset(err->offset, err->length) >= offset && - err->offset <= end_offset(offset, length); -} - -static bool merge_ds_error(struct nfs4_ff_layout_ds_err *old, - struct nfs4_ff_layout_ds_err *new) -{ - if (!ds_error_can_merge(old, new->offset, new->length, new->status, - new->opnum, &new->stateid, &new->deviceid)) - return false; - - extend_ds_error(old, new->offset, new->length); - return true; + int ret; + + if (e1->opnum != e2->opnum) + return e1->opnum < e2->opnum ? -1 : 1; + if (e1->status != e2->status) + return e1->status < e2->status ? 
-1 : 1; + ret = memcmp(&e1->stateid, &e2->stateid, sizeof(e1->stateid)); + if (ret != 0) + return ret; + ret = memcmp(&e1->deviceid, &e2->deviceid, sizeof(e1->deviceid)); + if (ret != 0) + return ret; + if (end_offset(e1->offset, e1->length) < e2->offset) + return -1; + if (e1->offset > end_offset(e2->offset, e2->length)) + return 1; + /* If ranges overlap or are contiguous, they are the same */ + return 0; } -static bool +static void ff_layout_add_ds_error_locked(struct nfs4_flexfile_layout *flo, struct nfs4_ff_layout_ds_err *dserr) { - struct nfs4_ff_layout_ds_err *err; - - list_for_each_entry(err, &flo->error_list, list) { - if (merge_ds_error(err, dserr)) { - return true; - } - } - - list_add(&dserr->list, &flo->error_list); - return false; -} - -static bool -ff_layout_update_ds_error(struct nfs4_flexfile_layout *flo, u64 offset, - u64 length, int status, enum nfs_opnum4 opnum, - nfs4_stateid *stateid, struct nfs4_deviceid *deviceid) -{ - bool found = false; - struct nfs4_ff_layout_ds_err *err; - - list_for_each_entry(err, &flo->error_list, list) { - if (ds_error_can_merge(err, offset, length, status, opnum, - stateid, deviceid)) { - found = true; - extend_ds_error(err, offset, length); + struct nfs4_ff_layout_ds_err *err, *tmp; + struct list_head *head = &flo->error_list; + int match; + + /* Do insertion sort w/ merges */ + list_for_each_entry_safe(err, tmp, &flo->error_list, list) { + match = ff_ds_error_match(err, dserr); + if (match < 0) + continue; + if (match > 0) { + /* Add entry "dserr" _before_ entry "err" */ + head = &err->list; break; } + /* Entries match, so merge "err" into "dserr" */ + extend_ds_error(dserr, err->offset, err->length); + list_del(&err->list); + kfree(err); } - return found; + list_add_tail(&dserr->list, head); } int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, @@ -283,7 +275,6 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, gfp_t gfp_flags) { struct nfs4_ff_layout_ds_err *dserr; - bool needfree; if (status == 0) return 0; @@ -291,14 +282,6 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, if (mirror->mirror_ds == NULL) return -EINVAL; - spin_lock(&flo->generic_hdr.plh_inode->i_lock); - if (ff_layout_update_ds_error(flo, offset, length, status, opnum, - &mirror->stateid, - &mirror->mirror_ds->id_node.deviceid)) { - spin_unlock(&flo->generic_hdr.plh_inode->i_lock); - return 0; - } - spin_unlock(&flo->generic_hdr.plh_inode->i_lock); dserr = kmalloc(sizeof(*dserr), gfp_flags); if (!dserr) return -ENOMEM; @@ -313,10 +296,8 @@ int ff_layout_track_ds_error(struct nfs4_flexfile_layout *flo, NFS4_DEVICEID4_SIZE); spin_lock(&flo->generic_hdr.plh_inode->i_lock); - needfree = ff_layout_add_ds_error_locked(flo, dserr); + ff_layout_add_ds_error_locked(flo, dserr); spin_unlock(&flo->generic_hdr.plh_inode->i_lock); - if (needfree) - kfree(dserr); return 0; } diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 8e24d886d2c5..86faecf8f328 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -661,9 +661,9 @@ int nfs_getattr(struct vfsmount *mnt, struct dentry *dentry, struct kstat *stat) trace_nfs_getattr_enter(inode); /* Flush out writes to the server in order to update c/mtime. 
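
The flexfilelayoutdev rewrite above replaces repeated merge scans with an insertion sort: ff_ds_error_match() defines a total order on (opnum, status, stateid, deviceid) and treats overlapping or contiguous byte ranges as equal, so ff_layout_add_ds_error_locked() walks the sorted list once, absorbs every "equal" entry into the new one, and inserts before the first strictly greater entry. A simplified sketch of that list discipline (types reduced to the range fields; kernel list API assumed):

	struct range_err {
		struct list_head list;
		u64 offset, length;	/* plus the keys the comparator orders on */
	};

	static void add_sorted_merged(struct list_head *sorted,
				      struct range_err *dserr,
				      int (*cmp)(const struct range_err *,
						 const struct range_err *))
	{
		struct range_err *err, *tmp;
		struct list_head *pos = sorted;	/* default: append at tail */
		u64 start, end;

		list_for_each_entry_safe(err, tmp, sorted, list) {
			int match = cmp(err, dserr);

			if (match < 0)
				continue;
			if (match > 0) {
				pos = &err->list; /* insert before 'err' */
				break;
			}
			/* "Equal" means mergeable: extend dserr, drop err. */
			start = min(dserr->offset, err->offset);
			end = max(dserr->offset + dserr->length,
				  err->offset + err->length);
			dserr->offset = start;
			dserr->length = end - start;
			list_del(&err->list);
			kfree(err);
		}
		list_add_tail(&dserr->list, pos);
	}
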
*/ if (S_ISREG(inode->i_mode)) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = nfs_sync_inode(inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (err) goto out; } @@ -1178,9 +1178,9 @@ static int __nfs_revalidate_mapping(struct inode *inode, spin_unlock(&inode->i_lock); trace_nfs_invalidate_mapping_enter(inode); if (may_lock) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = nfs_invalidate_mapping(inode, mapping); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } else ret = nfs_invalidate_mapping(inode, mapping); trace_nfs_invalidate_mapping_exit(inode, ret); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 4e8cc942336c..9a547aa3ec8e 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -484,7 +484,7 @@ void nfs_retry_commit(struct list_head *page_list, struct nfs_commit_info *cinfo, u32 ds_commit_idx); void nfs_commitdata_release(struct nfs_commit_data *data); -void nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, +void nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo); void nfs_request_add_commit_list_locked(struct nfs_page *req, struct list_head *dst, diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 6e8174930a48..bd25dc7077f7 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -101,13 +101,13 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) if (!nfs_server_capable(inode, NFS_CAP_ALLOCATE)) return -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = nfs42_proc_fallocate(&msg, filep, offset, len); if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } @@ -123,7 +123,7 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) return -EOPNOTSUPP; nfs_wb_all(inode); - mutex_lock(&inode->i_mutex); + inode_lock(inode); err = nfs42_proc_fallocate(&msg, filep, offset, len); if (err == 0) @@ -131,7 +131,7 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) if (err == -EOPNOTSUPP) NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 26f9a23e2b25..57ca1c8039c1 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -141,11 +141,11 @@ nfs4_file_fsync(struct file *file, loff_t start, loff_t end, int datasync) ret = filemap_write_and_wait_range(inode->i_mapping, start, end); if (ret != 0) break; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = nfs_file_fsync_commit(file, start, end, datasync); if (!ret) ret = pnfs_sync_inode(inode, !!datasync); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * If nfs_file_fsync_commit detected a server reboot, then * resend all dirty pages that might have been covered by @@ -219,13 +219,13 @@ static int nfs42_clone_file_range(struct file *src_file, loff_t src_off, /* XXX: do we lock at all? what if server needs CB_RECALL_LAYOUT? 
*/ if (same_inode) { - mutex_lock(&src_inode->i_mutex); + inode_lock(src_inode); } else if (dst_inode < src_inode) { - mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(dst_inode, I_MUTEX_PARENT); + inode_lock_nested(src_inode, I_MUTEX_CHILD); } else { - mutex_lock_nested(&src_inode->i_mutex, I_MUTEX_PARENT); - mutex_lock_nested(&dst_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(src_inode, I_MUTEX_PARENT); + inode_lock_nested(dst_inode, I_MUTEX_CHILD); } /* flush all pending writes on both src and dst so that server @@ -246,13 +246,13 @@ static int nfs42_clone_file_range(struct file *src_file, loff_t src_off, out_unlock: if (same_inode) { - mutex_unlock(&src_inode->i_mutex); + inode_unlock(src_inode); } else if (dst_inode < src_inode) { - mutex_unlock(&src_inode->i_mutex); - mutex_unlock(&dst_inode->i_mutex); + inode_unlock(src_inode); + inode_unlock(dst_inode); } else { - mutex_unlock(&dst_inode->i_mutex); - mutex_unlock(&src_inode->i_mutex); + inode_unlock(dst_inode); + inode_unlock(src_inode); } out: return ret; diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ce43cd6d88c6..5754835a2886 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -830,11 +830,10 @@ EXPORT_SYMBOL_GPL(nfs_request_add_commit_list_locked); * holding the nfs_page lock. */ void -nfs_request_add_commit_list(struct nfs_page *req, struct list_head *dst, - struct nfs_commit_info *cinfo) +nfs_request_add_commit_list(struct nfs_page *req, struct nfs_commit_info *cinfo) { spin_lock(cinfo->lock); - nfs_request_add_commit_list_locked(req, dst, cinfo); + nfs_request_add_commit_list_locked(req, &cinfo->mds->list, cinfo); spin_unlock(cinfo->lock); nfs_mark_page_unstable(req->wb_page, cinfo); } @@ -892,7 +891,7 @@ nfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg, { if (pnfs_mark_request_commit(req, lseg, cinfo, ds_commit_idx)) return; - nfs_request_add_commit_list(req, &cinfo->mds->list, cinfo); + nfs_request_add_commit_list(req, cinfo); } static void diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index 819ad812c71b..4cba7865f496 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -55,10 +55,10 @@ nfsd4_security_inode_setsecctx(struct svc_fh *resfh, struct xdr_netobj *label, u struct inode *inode = d_inode(resfh->fh_dentry); int status; - mutex_lock(&inode->i_mutex); + inode_lock(inode); status = security_inode_setsecctx(resfh->fh_dentry, label->data, label->len); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (status) /* diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 79f0307a5ec8..dc8ebecf5618 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -192,7 +192,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) dir = nn->rec_file->f_path.dentry; /* lock the parent */ - mutex_lock(&d_inode(dir)->i_mutex); + inode_lock(d_inode(dir)); dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1); if (IS_ERR(dentry)) { @@ -213,7 +213,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) out_put: dput(dentry); out_unlock: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); if (status == 0) { if (nn->in_grace) { crp = nfs4_client_to_reclaim(dname, nn); @@ -286,7 +286,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) } status = iterate_dir(nn->rec_file, &ctx.ctx); - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); list_for_each_entry_safe(entry, tmp, &ctx.names, list) { if (!status) { @@ 
-302,7 +302,7 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) list_del(&entry->list); kfree(entry); } - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); nfs4_reset_creds(original_cred); list_for_each_entry_safe(entry, tmp, &ctx.names, list) { @@ -322,7 +322,7 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn) dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name); dir = nn->rec_file->f_path.dentry; - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); dentry = lookup_one_len(name, dir, namlen); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); @@ -335,7 +335,7 @@ nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn) out: dput(dentry); out_unlock: - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); return status; } diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 0770bcb543c8..f84fe6bf9aee 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -288,7 +288,7 @@ fh_lock_nested(struct svc_fh *fhp, unsigned int subclass) } inode = d_inode(dentry); - mutex_lock_nested(&inode->i_mutex, subclass); + inode_lock_nested(inode, subclass); fill_pre_wcc(fhp); fhp->fh_locked = true; } @@ -307,7 +307,7 @@ fh_unlock(struct svc_fh *fhp) { if (fhp->fh_locked) { fill_post_wcc(fhp); - mutex_unlock(&d_inode(fhp->fh_dentry)->i_mutex); + inode_unlock(d_inode(fhp->fh_dentry)); fhp->fh_locked = false; } } diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 6739077f17fe..5d2a57e4c03a 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -493,9 +493,9 @@ __be32 nfsd4_set_nfs4_label(struct svc_rqst *rqstp, struct svc_fh *fhp, dentry = fhp->fh_dentry; - mutex_lock(&d_inode(dentry)->i_mutex); + inode_lock(d_inode(dentry)); host_error = security_inode_setsecctx(dentry, label->data, label->len); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); return nfserrno(host_error); } #else diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c index 10b22527a617..21a1e2e0d92f 100644 --- a/fs/nilfs2/inode.c +++ b/fs/nilfs2/inode.c @@ -1003,7 +1003,7 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret) return ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); isize = i_size_read(inode); @@ -1113,6 +1113,6 @@ int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, if (ret == 1) ret = 0; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } diff --git a/fs/nilfs2/ioctl.c b/fs/nilfs2/ioctl.c index aba43811d6ef..e8fe24882b5b 100644 --- a/fs/nilfs2/ioctl.c +++ b/fs/nilfs2/ioctl.c @@ -158,7 +158,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp, flags = nilfs_mask_flags(inode->i_mode, flags); - mutex_lock(&inode->i_mutex); + inode_lock(inode); oldflags = NILFS_I(inode)->i_flags; @@ -186,7 +186,7 @@ static int nilfs_ioctl_setflags(struct inode *inode, struct file *filp, nilfs_mark_inode_dirty(inode); ret = nilfs_transaction_commit(inode->i_sb); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); mnt_drop_write_file(filp); return ret; } diff --git a/fs/ntfs/dir.c b/fs/ntfs/dir.c index 9e38dafa3bc7..b2eff5816adc 100644 --- a/fs/ntfs/dir.c +++ b/fs/ntfs/dir.c @@ -1509,7 +1509,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end, err = filemap_write_and_wait_range(vi->i_mapping, start, end); if (err) return err; - mutex_lock(&vi->i_mutex); + inode_lock(vi); BUG_ON(!S_ISDIR(vi->i_mode)); /* If the bitmap attribute inode is in memory sync it, too. 
*/ @@ -1532,7 +1532,7 @@ static int ntfs_dir_fsync(struct file *filp, loff_t start, loff_t end, else ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " "%u.", datasync ? "data" : "", vi->i_ino, -ret); - mutex_unlock(&vi->i_mutex); + inode_unlock(vi); return ret; } diff --git a/fs/ntfs/file.c b/fs/ntfs/file.c index 9d383e5eff0e..bed4d427dfae 100644 --- a/fs/ntfs/file.c +++ b/fs/ntfs/file.c @@ -1944,14 +1944,14 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ssize_t written = 0; ssize_t err; - mutex_lock(&vi->i_mutex); + inode_lock(vi); /* We can write back this queue in page reclaim. */ current->backing_dev_info = inode_to_bdi(vi); err = ntfs_prepare_file_for_write(iocb, from); if (iov_iter_count(from) && !err) written = ntfs_perform_write(file, from, iocb->ki_pos); current->backing_dev_info = NULL; - mutex_unlock(&vi->i_mutex); + inode_unlock(vi); if (likely(written > 0)) { err = generic_write_sync(file, iocb->ki_pos, written); if (err < 0) @@ -1996,7 +1996,7 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end, err = filemap_write_and_wait_range(vi->i_mapping, start, end); if (err) return err; - mutex_lock(&vi->i_mutex); + inode_lock(vi); BUG_ON(S_ISDIR(vi->i_mode)); if (!datasync || !NInoNonResident(NTFS_I(vi))) @@ -2015,7 +2015,7 @@ static int ntfs_file_fsync(struct file *filp, loff_t start, loff_t end, else ntfs_warning(vi->i_sb, "Failed to f%ssync inode 0x%lx. Error " "%u.", datasync ? "data" : "", vi->i_ino, -ret); - mutex_unlock(&vi->i_mutex); + inode_unlock(vi); return ret; } diff --git a/fs/ntfs/quota.c b/fs/ntfs/quota.c index d80e3315cab0..9793e68ba1dd 100644 --- a/fs/ntfs/quota.c +++ b/fs/ntfs/quota.c @@ -48,7 +48,7 @@ bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol) ntfs_error(vol->sb, "Quota inodes are not open."); return false; } - mutex_lock(&vol->quota_q_ino->i_mutex); + inode_lock(vol->quota_q_ino); ictx = ntfs_index_ctx_get(NTFS_I(vol->quota_q_ino)); if (!ictx) { ntfs_error(vol->sb, "Failed to get index context."); @@ -98,7 +98,7 @@ bool ntfs_mark_quotas_out_of_date(ntfs_volume *vol) ntfs_index_entry_mark_dirty(ictx); set_done: ntfs_index_ctx_put(ictx); - mutex_unlock(&vol->quota_q_ino->i_mutex); + inode_unlock(vol->quota_q_ino); /* * We set the flag so we do not try to mark the quotas out of date * again on remount. @@ -110,7 +110,7 @@ done: err_out: if (ictx) ntfs_index_ctx_put(ictx); - mutex_unlock(&vol->quota_q_ino->i_mutex); + inode_unlock(vol->quota_q_ino); return false; } diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c index 2f77f8dfb861..1b38abdaa3ed 100644 --- a/fs/ntfs/super.c +++ b/fs/ntfs/super.c @@ -1284,10 +1284,10 @@ static int check_windows_hibernation_status(ntfs_volume *vol) * Find the inode number for the hibernation file by looking up the * filename hiberfil.sys in the root directory. */ - mutex_lock(&vol->root_ino->i_mutex); + inode_lock(vol->root_ino); mref = ntfs_lookup_inode_by_name(NTFS_I(vol->root_ino), hiberfil, 12, &name); - mutex_unlock(&vol->root_ino->i_mutex); + inode_unlock(vol->root_ino); if (IS_ERR_MREF(mref)) { ret = MREF_ERR(mref); /* If the file does not exist, Windows is not hibernated. */ @@ -1377,10 +1377,10 @@ static bool load_and_init_quota(ntfs_volume *vol) * Find the inode number for the quota file by looking up the filename * $Quota in the extended system files directory $Extend. 
*/ - mutex_lock(&vol->extend_ino->i_mutex); + inode_lock(vol->extend_ino); mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), Quota, 6, &name); - mutex_unlock(&vol->extend_ino->i_mutex); + inode_unlock(vol->extend_ino); if (IS_ERR_MREF(mref)) { /* * If the file does not exist, quotas are disabled and have @@ -1460,10 +1460,10 @@ static bool load_and_init_usnjrnl(ntfs_volume *vol) * Find the inode number for the transaction log file by looking up the * filename $UsnJrnl in the extended system files directory $Extend. */ - mutex_lock(&vol->extend_ino->i_mutex); + inode_lock(vol->extend_ino); mref = ntfs_lookup_inode_by_name(NTFS_I(vol->extend_ino), UsnJrnl, 8, &name); - mutex_unlock(&vol->extend_ino->i_mutex); + inode_unlock(vol->extend_ino); if (IS_ERR_MREF(mref)) { /* * If the file does not exist, transaction logging is disabled, diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index a3ded88718c9..d002579c6f2b 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -5719,7 +5719,7 @@ int ocfs2_remove_btree_range(struct inode *inode, goto bail; } - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); if (ocfs2_truncate_log_needs_flush(osb)) { ret = __ocfs2_flush_truncate_log(osb); @@ -5776,7 +5776,7 @@ int ocfs2_remove_btree_range(struct inode *inode, out_commit: ocfs2_commit_trans(osb, handle); out: - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); bail: if (meta_ac) ocfs2_free_alloc_context(meta_ac); @@ -5832,7 +5832,7 @@ int ocfs2_truncate_log_append(struct ocfs2_super *osb, struct ocfs2_dinode *di; struct ocfs2_truncate_log *tl; - BUG_ON(mutex_trylock(&tl_inode->i_mutex)); + BUG_ON(inode_trylock(tl_inode)); start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); @@ -5980,7 +5980,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) struct ocfs2_dinode *di; struct ocfs2_truncate_log *tl; - BUG_ON(mutex_trylock(&tl_inode->i_mutex)); + BUG_ON(inode_trylock(tl_inode)); di = (struct ocfs2_dinode *) tl_bh->b_data; @@ -6008,7 +6008,7 @@ int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) goto out; } - mutex_lock(&data_alloc_inode->i_mutex); + inode_lock(data_alloc_inode); status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1); if (status < 0) { @@ -6035,7 +6035,7 @@ out_unlock: ocfs2_inode_unlock(data_alloc_inode, 1); out_mutex: - mutex_unlock(&data_alloc_inode->i_mutex); + inode_unlock(data_alloc_inode); iput(data_alloc_inode); out: @@ -6047,9 +6047,9 @@ int ocfs2_flush_truncate_log(struct ocfs2_super *osb) int status; struct inode *tl_inode = osb->osb_tl_inode; - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); status = __ocfs2_flush_truncate_log(osb); - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); return status; } @@ -6208,7 +6208,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, (unsigned long long)le64_to_cpu(tl_copy->i_blkno), num_recs); - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); for(i = 0; i < num_recs; i++) { if (ocfs2_truncate_log_needs_flush(osb)) { status = __ocfs2_flush_truncate_log(osb); @@ -6239,7 +6239,7 @@ int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, } bail_up: - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); return status; } @@ -6346,7 +6346,7 @@ static int ocfs2_free_cached_blocks(struct ocfs2_super *osb, goto out; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = ocfs2_inode_lock(inode, &di_bh, 1); if (ret) { @@ -6395,7 +6395,7 @@ out_unlock: ocfs2_inode_unlock(inode, 1); brelse(di_bh); out_mutex: - 
mutex_unlock(&inode->i_mutex); + inode_unlock(inode); iput(inode); out: while(head) { @@ -6439,7 +6439,7 @@ static int ocfs2_free_cached_clusters(struct ocfs2_super *osb, handle_t *handle; int ret = 0; - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); while (head) { if (ocfs2_truncate_log_needs_flush(osb)) { @@ -6471,7 +6471,7 @@ static int ocfs2_free_cached_clusters(struct ocfs2_super *osb, } } - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); while (head) { /* Premature exit may have left some dangling items. */ @@ -7355,7 +7355,7 @@ int ocfs2_trim_fs(struct super_block *sb, struct fstrim_range *range) goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 0); if (ret < 0) { @@ -7422,7 +7422,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 0); brelse(main_bm_bh); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); out: return ret; diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 7f604727f487..794fd1587f34 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -2046,9 +2046,9 @@ static int ocfs2_try_to_free_truncate_log(struct ocfs2_super *osb, int ret = 0; unsigned int truncated_clusters; - mutex_lock(&osb->osb_tl_inode->i_mutex); + inode_lock(osb->osb_tl_inode); truncated_clusters = osb->truncated_clusters; - mutex_unlock(&osb->osb_tl_inode->i_mutex); + inode_unlock(osb->osb_tl_inode); /* * Check whether we can succeed in allocating if we free diff --git a/fs/ocfs2/dir.c b/fs/ocfs2/dir.c index ffecf89c8c1c..e1adf285fc31 100644 --- a/fs/ocfs2/dir.c +++ b/fs/ocfs2/dir.c @@ -4361,7 +4361,7 @@ static int ocfs2_dx_dir_remove_index(struct inode *dir, mlog_errno(ret); goto out; } - mutex_lock(&dx_alloc_inode->i_mutex); + inode_lock(dx_alloc_inode); ret = ocfs2_inode_lock(dx_alloc_inode, &dx_alloc_bh, 1); if (ret) { @@ -4410,7 +4410,7 @@ out_unlock: ocfs2_inode_unlock(dx_alloc_inode, 1); out_mutex: - mutex_unlock(&dx_alloc_inode->i_mutex); + inode_unlock(dx_alloc_inode); brelse(dx_alloc_bh); out: iput(dx_alloc_inode); diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index f92612e4b9d6..474e57f834e6 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -1390,6 +1390,7 @@ static int __ocfs2_cluster_lock(struct ocfs2_super *osb, unsigned int gen; int noqueue_attempted = 0; int dlm_locked = 0; + int kick_dc = 0; if (!(lockres->l_flags & OCFS2_LOCK_INITIALIZED)) { mlog_errno(-EINVAL); @@ -1524,7 +1525,12 @@ update_holders: unlock: lockres_clear_flags(lockres, OCFS2_LOCK_UPCONVERT_FINISHING); + /* ocfs2_unblock_lock requeues on seeing OCFS2_LOCK_UPCONVERT_FINISHING */ + kick_dc = (lockres->l_flags & OCFS2_LOCK_BLOCKED); + spin_unlock_irqrestore(&lockres->l_lock, flags); + if (kick_dc) + ocfs2_wake_downconvert_thread(osb); out: /* * This is helping work around a lock inversion between the page lock diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index d63127932509..7cb38fdca229 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -1872,7 +1872,7 @@ static int __ocfs2_change_file_space(struct file *file, struct inode *inode, if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) return -EROFS; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * This prevents concurrent writes on other nodes @@ -1991,7 +1991,7 @@ out_rw_unlock: ocfs2_rw_unlock(inode, 1); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return ret; } @@ -2299,7 +2299,7 @@ static ssize_t ocfs2_file_write_iter(struct kiocb *iocb, appending = iocb->ki_flags
& IOCB_APPEND ? 1 : 0; direct_io = iocb->ki_flags & IOCB_DIRECT ? 1 : 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); relock: /* @@ -2435,7 +2435,7 @@ out: ocfs2_rw_unlock(inode, rw_level); out_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (written) ret = written; @@ -2547,7 +2547,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file->f_mapping->host; int ret = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); switch (whence) { case SEEK_SET: @@ -2585,7 +2585,7 @@ static loff_t ocfs2_file_llseek(struct file *file, loff_t offset, int whence) offset = vfs_setpos(file, offset, inode->i_sb->s_maxbytes); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret) return ret; return offset; diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 97a563bab9a8..36294446d960 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -630,10 +630,10 @@ static int ocfs2_remove_inode(struct inode *inode, goto bail; } - mutex_lock(&inode_alloc_inode->i_mutex); + inode_lock(inode_alloc_inode); status = ocfs2_inode_lock(inode_alloc_inode, &inode_alloc_bh, 1); if (status < 0) { - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); mlog_errno(status); goto bail; @@ -680,7 +680,7 @@ bail_commit: ocfs2_commit_trans(osb, handle); bail_unlock: ocfs2_inode_unlock(inode_alloc_inode, 1); - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); brelse(inode_alloc_bh); bail: iput(inode_alloc_inode); @@ -751,10 +751,10 @@ static int ocfs2_wipe_inode(struct inode *inode, /* Lock the orphan dir. The lock will be held for the entire * delete_inode operation. We do this now to avoid races with * recovery completion on other nodes. */ - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (status < 0) { - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); mlog_errno(status); goto bail; @@ -803,7 +803,7 @@ bail_unlock_dir: return status; ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); brelse(orphan_dir_bh); bail: iput(orphan_dir_inode); diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index 16b0bb482ea7..4506ec5ec2ea 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -86,7 +86,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, unsigned oldflags; int status; - mutex_lock(&inode->i_mutex); + inode_lock(inode); status = ocfs2_inode_lock(inode, &bh, 1); if (status < 0) { @@ -135,7 +135,7 @@ static int ocfs2_set_inode_attr(struct inode *inode, unsigned flags, bail_unlock: ocfs2_inode_unlock(inode, 1); bail: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); brelse(bh); @@ -287,7 +287,7 @@ static int ocfs2_info_scan_inode_alloc(struct ocfs2_super *osb, struct ocfs2_dinode *dinode_alloc = NULL; if (inode_alloc) - mutex_lock(&inode_alloc->i_mutex); + inode_lock(inode_alloc); if (o2info_coherent(&fi->ifi_req)) { status = ocfs2_inode_lock(inode_alloc, &bh, 0); @@ -317,7 +317,7 @@ bail: ocfs2_inode_unlock(inode_alloc, 0); if (inode_alloc) - mutex_unlock(&inode_alloc->i_mutex); + inode_unlock(inode_alloc); brelse(bh); @@ -547,7 +547,7 @@ static int ocfs2_info_freefrag_scan_bitmap(struct ocfs2_super *osb, struct ocfs2_dinode *gb_dinode = NULL; if (gb_inode) - mutex_lock(&gb_inode->i_mutex); + inode_lock(gb_inode); if (o2info_coherent(&ffg->iff_req)) { status = 
ocfs2_inode_lock(gb_inode, &bh, 0); @@ -604,7 +604,7 @@ bail: ocfs2_inode_unlock(gb_inode, 0); if (gb_inode) - mutex_unlock(&gb_inode->i_mutex); + inode_unlock(gb_inode); iput(gb_inode); brelse(bh); diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index 3772a2dbb980..61b833b721d8 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -2088,7 +2088,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, return status; } - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); status = ocfs2_inode_lock(orphan_dir_inode, NULL, 0); if (status < 0) { mlog_errno(status); @@ -2106,7 +2106,7 @@ static int ocfs2_queue_orphans(struct ocfs2_super *osb, out_cluster: ocfs2_inode_unlock(orphan_dir_inode, 0); out: - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); return status; } @@ -2196,7 +2196,7 @@ static int ocfs2_recover_orphans(struct ocfs2_super *osb, oi->ip_next_orphan = NULL; if (oi->ip_flags & OCFS2_INODE_DIO_ORPHAN_ENTRY) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = ocfs2_rw_lock(inode, 1); if (ret < 0) { mlog_errno(ret); @@ -2235,7 +2235,7 @@ unlock_inode: unlock_rw: ocfs2_rw_unlock(inode, 1); unlock_mutex: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* clear dio flag in ocfs2_inode_info */ oi->ip_flags &= ~OCFS2_INODE_DIO_ORPHAN_ENTRY; diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index e9c99e35f5ea..7d62c43a2c3e 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -414,7 +414,7 @@ void ocfs2_shutdown_local_alloc(struct ocfs2_super *osb) goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (status < 0) { @@ -468,7 +468,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); out: @@ -506,7 +506,7 @@ int ocfs2_begin_local_alloc_recovery(struct ocfs2_super *osb, goto bail; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); status = ocfs2_read_inode_block_full(inode, &alloc_bh, OCFS2_BH_IGNORE_CACHE); @@ -539,7 +539,7 @@ bail: brelse(alloc_bh); if (inode) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); iput(inode); } @@ -571,7 +571,7 @@ int ocfs2_complete_local_alloc_recovery(struct ocfs2_super *osb, goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); status = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (status < 0) { @@ -601,7 +601,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); brelse(main_bm_bh); @@ -643,7 +643,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, goto bail; } - mutex_lock(&local_alloc_inode->i_mutex); + inode_lock(local_alloc_inode); /* * We must double check state and allocator bits because @@ -709,7 +709,7 @@ int ocfs2_reserve_local_alloc_bits(struct ocfs2_super *osb, status = 0; bail: if (status < 0 && local_alloc_inode) { - mutex_unlock(&local_alloc_inode->i_mutex); + inode_unlock(local_alloc_inode); iput(local_alloc_inode); } diff --git a/fs/ocfs2/move_extents.c b/fs/ocfs2/move_extents.c index 124471d26a73..e3d05d9901a3 100644 --- a/fs/ocfs2/move_extents.c +++ b/fs/ocfs2/move_extents.c @@ -276,7 +276,7 @@ static int ocfs2_defrag_extent(struct ocfs2_move_extents_context *context, * context->data_ac->ac_resv = &OCFS2_I(inode)->ip_la_data_resv; */ - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); 
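Every hunk in this series performs the same mechanical swap: open-coded operations on inode->i_mutex become calls into the new accessor family. For reference, the wrappers themselves are trivial; the following is a minimal sketch of how they are defined over the existing i_mutex field in include/linux/fs.h (paraphrased for illustration, not quoted verbatim from the tree):

	/* Direct accessors for inode->i_mutex, so that filesystems stop
	 * touching the field itself. */
	static inline void inode_lock(struct inode *inode)
	{
		mutex_lock(&inode->i_mutex);
	}

	static inline void inode_unlock(struct inode *inode)
	{
		mutex_unlock(&inode->i_mutex);
	}

	/* Nested variant preserves the lockdep subclass callers passed before. */
	static inline void inode_lock_nested(struct inode *inode, unsigned subclass)
	{
		mutex_lock_nested(&inode->i_mutex, subclass);
	}

	static inline int inode_trylock(struct inode *inode)
	{
		return mutex_trylock(&inode->i_mutex);
	}

	static inline int inode_is_locked(struct inode *inode)
	{
		return mutex_is_locked(&inode->i_mutex);
	}

With callers funneled through these accessors, the backing primitive can later be changed (for example, to a read-write semaphore) without another tree-wide sweep like the one above.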
if (ocfs2_truncate_log_needs_flush(osb)) { ret = __ocfs2_flush_truncate_log(osb); @@ -338,7 +338,7 @@ out_commit: ocfs2_commit_trans(osb, handle); out_unlock_mutex: - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); if (context->data_ac) { ocfs2_free_alloc_context(context->data_ac); @@ -632,7 +632,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, goto out; } - mutex_lock(&gb_inode->i_mutex); + inode_lock(gb_inode); ret = ocfs2_inode_lock(gb_inode, &gb_bh, 1); if (ret) { @@ -640,7 +640,7 @@ static int ocfs2_move_extent(struct ocfs2_move_extents_context *context, goto out_unlock_gb_mutex; } - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); handle = ocfs2_start_trans(osb, credits); if (IS_ERR(handle)) { @@ -708,11 +708,11 @@ out_commit: brelse(gd_bh); out_unlock_tl_inode: - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); ocfs2_inode_unlock(gb_inode, 1); out_unlock_gb_mutex: - mutex_unlock(&gb_inode->i_mutex); + inode_unlock(gb_inode); brelse(gb_bh); iput(gb_inode); @@ -905,7 +905,7 @@ static int ocfs2_move_extents(struct ocfs2_move_extents_context *context) if (ocfs2_is_hard_readonly(osb) || ocfs2_is_soft_readonly(osb)) return -EROFS; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* * This prevents concurrent writes from other nodes @@ -969,7 +969,7 @@ out_inode_unlock: out_rw_unlock: ocfs2_rw_unlock(inode, 1); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return status; } diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index ab42c38031b1..6b3e87189a64 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -1045,7 +1045,7 @@ leave: if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); iput(orphan_dir); } @@ -1664,7 +1664,7 @@ bail: if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); iput(orphan_dir); } @@ -2121,11 +2121,11 @@ static int ocfs2_lookup_lock_orphan_dir(struct ocfs2_super *osb, return ret; } - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); ret = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (ret < 0) { - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); mlog_errno(ret); @@ -2226,7 +2226,7 @@ out: if (ret) { ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); } @@ -2495,7 +2495,7 @@ out: ocfs2_free_alloc_context(inode_ac); /* Unroll orphan dir locking */ - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); ocfs2_inode_unlock(orphan_dir, 1); iput(orphan_dir); } @@ -2602,7 +2602,7 @@ leave: if (orphan_dir) { /* This was locked for us in ocfs2_prepare_orphan_dir() */ ocfs2_inode_unlock(orphan_dir, 1); - mutex_unlock(&orphan_dir->i_mutex); + inode_unlock(orphan_dir); iput(orphan_dir); } @@ -2689,7 +2689,7 @@ int ocfs2_add_inode_to_orphan(struct ocfs2_super *osb, bail_unlock_orphan: ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); ocfs2_free_dir_lookup_result(&orphan_insert); @@ -2721,10 +2721,10 @@ int ocfs2_del_inode_from_orphan(struct ocfs2_super *osb, goto bail; } - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); status = 
ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (status < 0) { - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); mlog_errno(status); goto bail; @@ -2770,7 +2770,7 @@ bail_commit: bail_unlock_orphan: ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); brelse(orphan_dir_bh); iput(orphan_dir_inode); @@ -2834,12 +2834,12 @@ int ocfs2_mv_orphaned_inode_to_new(struct inode *dir, goto leave; } - mutex_lock(&orphan_dir_inode->i_mutex); + inode_lock(orphan_dir_inode); status = ocfs2_inode_lock(orphan_dir_inode, &orphan_dir_bh, 1); if (status < 0) { mlog_errno(status); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); goto leave; } @@ -2901,7 +2901,7 @@ out_commit: ocfs2_commit_trans(osb, handle); orphan_unlock: ocfs2_inode_unlock(orphan_dir_inode, 1); - mutex_unlock(&orphan_dir_inode->i_mutex); + inode_unlock(orphan_dir_inode); iput(orphan_dir_inode); leave: diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index fde9ef18cff3..9c9dd30bc945 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -308,7 +308,7 @@ int ocfs2_lock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) WARN_ON(bh != oinfo->dqi_gqi_bh); spin_unlock(&dq_data_lock); if (ex) { - mutex_lock(&oinfo->dqi_gqinode->i_mutex); + inode_lock(oinfo->dqi_gqinode); down_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); } else { down_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); @@ -320,7 +320,7 @@ void ocfs2_unlock_global_qf(struct ocfs2_mem_dqinfo *oinfo, int ex) { if (ex) { up_write(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); - mutex_unlock(&oinfo->dqi_gqinode->i_mutex); + inode_unlock(oinfo->dqi_gqinode); } else { up_read(&OCFS2_I(oinfo->dqi_gqinode)->ip_alloc_sem); } diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 252119860e6c..3eff031aaf26 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -807,7 +807,7 @@ int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh) mlog_errno(ret); goto out; } - mutex_lock(&alloc_inode->i_mutex); + inode_lock(alloc_inode); ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1); if (ret) { @@ -867,7 +867,7 @@ out_unlock: } out_mutex: if (alloc_inode) { - mutex_unlock(&alloc_inode->i_mutex); + inode_unlock(alloc_inode); iput(alloc_inode); } out: @@ -4197,7 +4197,7 @@ static int __ocfs2_reflink(struct dentry *old_dentry, goto out; } - mutex_lock_nested(&new_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(new_inode, I_MUTEX_CHILD); ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1, OI_LS_REFLINK_TARGET); if (ret) { @@ -4231,7 +4231,7 @@ inode_unlock: ocfs2_inode_unlock(new_inode, 1); brelse(new_bh); out_unlock: - mutex_unlock(&new_inode->i_mutex); + inode_unlock(new_inode); out: if (!ret) { ret = filemap_fdatawait(inode->i_mapping); @@ -4402,11 +4402,11 @@ static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir, return error; } - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = dquot_initialize(dir); if (!error) error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (!error) fsnotify_create(dir, new_dentry); return error; diff --git a/fs/ocfs2/resize.c b/fs/ocfs2/resize.c index 79b8021302b3..576b9a04873f 100644 --- a/fs/ocfs2/resize.c +++ b/fs/ocfs2/resize.c @@ -301,7 +301,7 @@ int ocfs2_group_extend(struct inode * inode, int new_clusters) goto 
out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (ret < 0) { @@ -375,7 +375,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); out: @@ -486,7 +486,7 @@ int ocfs2_group_add(struct inode *inode, struct ocfs2_new_group_input *input) goto out; } - mutex_lock(&main_bm_inode->i_mutex); + inode_lock(main_bm_inode); ret = ocfs2_inode_lock(main_bm_inode, &main_bm_bh, 1); if (ret < 0) { @@ -590,7 +590,7 @@ out_unlock: ocfs2_inode_unlock(main_bm_inode, 1); out_mutex: - mutex_unlock(&main_bm_inode->i_mutex); + inode_unlock(main_bm_inode); iput(main_bm_inode); out: diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index fc6d25f6d444..2f19aeec5482 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -141,7 +141,7 @@ void ocfs2_free_ac_resource(struct ocfs2_alloc_context *ac) if (ac->ac_which != OCFS2_AC_USE_LOCAL) ocfs2_inode_unlock(inode, 1); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); iput(inode); ac->ac_inode = NULL; @@ -797,11 +797,11 @@ static int ocfs2_reserve_suballoc_bits(struct ocfs2_super *osb, return -EINVAL; } - mutex_lock(&alloc_inode->i_mutex); + inode_lock(alloc_inode); status = ocfs2_inode_lock(alloc_inode, &bh, 1); if (status < 0) { - mutex_unlock(&alloc_inode->i_mutex); + inode_unlock(alloc_inode); iput(alloc_inode); mlog_errno(status); @@ -2875,10 +2875,10 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) goto bail; } - mutex_lock(&inode_alloc_inode->i_mutex); + inode_lock(inode_alloc_inode); status = ocfs2_inode_lock(inode_alloc_inode, &alloc_bh, 0); if (status < 0) { - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); iput(inode_alloc_inode); mlog(ML_ERROR, "lock on alloc inode on slot %u failed %d\n", (u32)suballoc_slot, status); @@ -2891,7 +2891,7 @@ int ocfs2_test_inode_bit(struct ocfs2_super *osb, u64 blkno, int *res) mlog(ML_ERROR, "test suballoc bit failed %d\n", status); ocfs2_inode_unlock(inode_alloc_inode, 0); - mutex_unlock(&inode_alloc_inode->i_mutex); + inode_unlock(inode_alloc_inode); iput(inode_alloc_inode); brelse(alloc_bh); diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index f0e241ffd94f..7d3d979f57d9 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -2524,7 +2524,7 @@ static int ocfs2_xattr_free_block(struct inode *inode, mlog_errno(ret); goto out; } - mutex_lock(&xb_alloc_inode->i_mutex); + inode_lock(xb_alloc_inode); ret = ocfs2_inode_lock(xb_alloc_inode, &xb_alloc_bh, 1); if (ret < 0) { @@ -2549,7 +2549,7 @@ out_unlock: ocfs2_inode_unlock(xb_alloc_inode, 1); brelse(xb_alloc_bh); out_mutex: - mutex_unlock(&xb_alloc_inode->i_mutex); + inode_unlock(xb_alloc_inode); iput(xb_alloc_inode); out: brelse(blk_bh); @@ -3619,17 +3619,17 @@ int ocfs2_xattr_set(struct inode *inode, } } - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); if (ocfs2_truncate_log_needs_flush(osb)) { ret = __ocfs2_flush_truncate_log(osb); if (ret < 0) { - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); mlog_errno(ret); goto cleanup; } } - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); ret = ocfs2_init_xattr_set_ctxt(inode, di, &xi, &xis, &xbs, &ctxt, ref_meta, &credits); @@ -5460,7 +5460,7 @@ static int ocfs2_rm_xattr_cluster(struct inode *inode, return ret; } - mutex_lock(&tl_inode->i_mutex); + inode_lock(tl_inode); if (ocfs2_truncate_log_needs_flush(osb)) { ret = 
__ocfs2_flush_truncate_log(osb); @@ -5504,7 +5504,7 @@ out_commit: out: ocfs2_schedule_truncate_log_flush(osb, 1); - mutex_unlock(&tl_inode->i_mutex); + inode_unlock(tl_inode); if (meta_ac) ocfs2_free_alloc_context(meta_ac); diff --git a/fs/open.c b/fs/open.c index b25b1542c530..55bdc75e2172 100644 --- a/fs/open.c +++ b/fs/open.c @@ -58,10 +58,10 @@ int do_truncate(struct dentry *dentry, loff_t length, unsigned int time_attrs, if (ret) newattrs.ia_valid |= ret | ATTR_FORCE; - mutex_lock(&dentry->d_inode->i_mutex); + inode_lock(dentry->d_inode); /* Note any delegations or leases have already been broken: */ ret = notify_change(dentry, &newattrs, NULL); - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(dentry->d_inode); return ret; } @@ -510,7 +510,7 @@ static int chmod_common(struct path *path, umode_t mode) if (error) return error; retry_deleg: - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = security_path_chmod(path, mode); if (error) goto out_unlock; @@ -518,7 +518,7 @@ retry_deleg: newattrs.ia_valid = ATTR_MODE | ATTR_CTIME; error = notify_change(path->dentry, &newattrs, &delegated_inode); out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) @@ -593,11 +593,11 @@ retry_deleg: if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_SGID | ATTR_KILL_PRIV; - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = security_path_chown(path, uid, gid); if (!error) error = notify_change(path->dentry, &newattrs, &delegated_inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index eff6319d5037..d894e7cd9a86 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -248,9 +248,9 @@ static int ovl_copy_up_locked(struct dentry *workdir, struct dentry *upperdir, if (err) goto out_cleanup; - mutex_lock(&newdentry->d_inode->i_mutex); + inode_lock(newdentry->d_inode); err = ovl_set_attr(newdentry, stat); - mutex_unlock(&newdentry->d_inode->i_mutex); + inode_unlock(newdentry->d_inode); if (err) goto out_cleanup; diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index 692ceda3bc21..ed95272d57a6 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -167,7 +167,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, struct dentry *newdentry; int err; - mutex_lock_nested(&udir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(udir, I_MUTEX_PARENT); newdentry = lookup_one_len(dentry->d_name.name, upperdir, dentry->d_name.len); err = PTR_ERR(newdentry); @@ -185,7 +185,7 @@ static int ovl_create_upper(struct dentry *dentry, struct inode *inode, out_dput: dput(newdentry); out_unlock: - mutex_unlock(&udir->i_mutex); + inode_unlock(udir); return err; } @@ -258,9 +258,9 @@ static struct dentry *ovl_clear_empty(struct dentry *dentry, if (err) goto out_cleanup; - mutex_lock(&opaquedir->d_inode->i_mutex); + inode_lock(opaquedir->d_inode); err = ovl_set_attr(opaquedir, &stat); - mutex_unlock(&opaquedir->d_inode->i_mutex); + inode_unlock(opaquedir->d_inode); if (err) goto out_cleanup; @@ -599,7 +599,7 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir) struct dentry *upper = ovl_dentry_upper(dentry); int err; - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); err = -ESTALE; if (upper->d_parent == upperdir) { /* Don't let d_delete() think it 
can reset d_inode */ @@ -619,7 +619,7 @@ static int ovl_remove_upper(struct dentry *dentry, bool is_dir) * now. */ d_drop(dentry); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return err; } diff --git a/fs/overlayfs/inode.c b/fs/overlayfs/inode.c index bf996e574f3d..49e204560655 100644 --- a/fs/overlayfs/inode.c +++ b/fs/overlayfs/inode.c @@ -63,9 +63,9 @@ int ovl_setattr(struct dentry *dentry, struct iattr *attr) if (!err) { upperdentry = ovl_dentry_upper(dentry); - mutex_lock(&upperdentry->d_inode->i_mutex); + inode_lock(upperdentry->d_inode); err = notify_change(upperdentry, attr, NULL); - mutex_unlock(&upperdentry->d_inode->i_mutex); + inode_unlock(upperdentry->d_inode); } ovl_drop_write(dentry); out: diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index adcb1398c481..fdaf28f75e12 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -228,7 +228,7 @@ static int ovl_check_whiteouts(struct dentry *dir, struct ovl_readdir_data *rdd) dput(dentry); } } - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); } revert_creds(old_cred); put_cred(override_cred); @@ -399,7 +399,7 @@ static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin) loff_t res; struct ovl_dir_file *od = file->private_data; - mutex_lock(&file_inode(file)->i_mutex); + inode_lock(file_inode(file)); if (!file->f_pos) ovl_dir_reset(file); @@ -429,7 +429,7 @@ static loff_t ovl_dir_llseek(struct file *file, loff_t offset, int origin) res = offset; } out_unlock: - mutex_unlock(&file_inode(file)->i_mutex); + inode_unlock(file_inode(file)); return res; } @@ -454,10 +454,10 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, ovl_path_upper(dentry, &upperpath); realfile = ovl_path_open(&upperpath, O_RDONLY); smp_mb__before_spinlock(); - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (!od->upperfile) { if (IS_ERR(realfile)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return PTR_ERR(realfile); } od->upperfile = realfile; @@ -467,7 +467,7 @@ static int ovl_dir_fsync(struct file *file, loff_t start, loff_t end, fput(realfile); realfile = od->upperfile; } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } } @@ -479,9 +479,9 @@ static int ovl_dir_release(struct inode *inode, struct file *file) struct ovl_dir_file *od = file->private_data; if (od->cache) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); ovl_cache_put(od, file->f_path.dentry); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } fput(od->realfile); if (od->upperfile) @@ -557,7 +557,7 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) { struct ovl_cache_entry *p; - mutex_lock_nested(&upper->d_inode->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(upper->d_inode, I_MUTEX_CHILD); list_for_each_entry(p, list, l_node) { struct dentry *dentry; @@ -575,5 +575,5 @@ void ovl_cleanup_whiteouts(struct dentry *upper, struct list_head *list) ovl_cleanup(upper->d_inode, dentry); dput(dentry); } - mutex_unlock(&upper->d_inode->i_mutex); + inode_unlock(upper->d_inode); } diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index d250604f985a..8d826bd56b26 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -229,7 +229,7 @@ void ovl_dentry_update(struct dentry *dentry, struct dentry *upperdentry) { struct ovl_entry *oe = dentry->d_fsdata; - WARN_ON(!mutex_is_locked(&upperdentry->d_parent->d_inode->i_mutex)); + WARN_ON(!inode_is_locked(upperdentry->d_parent->d_inode)); WARN_ON(oe->__upperdentry); BUG_ON(!upperdentry->d_inode); /* @@ 
-244,7 +244,7 @@ void ovl_dentry_version_inc(struct dentry *dentry) { struct ovl_entry *oe = dentry->d_fsdata; - WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); + WARN_ON(!inode_is_locked(dentry->d_inode)); oe->version++; } @@ -252,7 +252,7 @@ u64 ovl_dentry_version_get(struct dentry *dentry) { struct ovl_entry *oe = dentry->d_fsdata; - WARN_ON(!mutex_is_locked(&dentry->d_inode->i_mutex)); + WARN_ON(!inode_is_locked(dentry->d_inode)); return oe->version; } @@ -375,9 +375,9 @@ static inline struct dentry *ovl_lookup_real(struct dentry *dir, { struct dentry *dentry; - mutex_lock(&dir->d_inode->i_mutex); + inode_lock(dir->d_inode); dentry = lookup_one_len(name->name, dir, name->len); - mutex_unlock(&dir->d_inode->i_mutex); + inode_unlock(dir->d_inode); if (IS_ERR(dentry)) { if (PTR_ERR(dentry) == -ENOENT) @@ -744,7 +744,7 @@ static struct dentry *ovl_workdir_create(struct vfsmount *mnt, if (err) return ERR_PTR(err); - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); retry: work = lookup_one_len(OVL_WORKDIR_NAME, dentry, strlen(OVL_WORKDIR_NAME)); @@ -770,7 +770,7 @@ retry: goto out_dput; } out_unlock: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); mnt_drop_write(mnt); return work; diff --git a/fs/pipe.c b/fs/pipe.c index 42cf8ddf0e55..ab8dad3ccb6a 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -38,6 +38,12 @@ unsigned int pipe_max_size = 1048576; */ unsigned int pipe_min_size = PAGE_SIZE; +/* Maximum allocatable pages per user. Hard limit is unset by default, soft + * matches default values. + */ +unsigned long pipe_user_pages_hard; +unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; + /* * We use a start+len construction, which provides full use of the * allocated memory. @@ -583,20 +589,49 @@ pipe_fasync(int fd, struct file *filp, int on) return retval; } +static void account_pipe_buffers(struct pipe_inode_info *pipe, + unsigned long old, unsigned long new) +{ + atomic_long_add(new - old, &pipe->user->pipe_bufs); +} + +static bool too_many_pipe_buffers_soft(struct user_struct *user) +{ + return pipe_user_pages_soft && + atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_soft; +} + +static bool too_many_pipe_buffers_hard(struct user_struct *user) +{ + return pipe_user_pages_hard && + atomic_long_read(&user->pipe_bufs) >= pipe_user_pages_hard; +} + struct pipe_inode_info *alloc_pipe_info(void) { struct pipe_inode_info *pipe; pipe = kzalloc(sizeof(struct pipe_inode_info), GFP_KERNEL); if (pipe) { - pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * PIPE_DEF_BUFFERS, GFP_KERNEL); + unsigned long pipe_bufs = PIPE_DEF_BUFFERS; + struct user_struct *user = get_current_user(); + + if (!too_many_pipe_buffers_hard(user)) { + if (too_many_pipe_buffers_soft(user)) + pipe_bufs = 1; + pipe->bufs = kzalloc(sizeof(struct pipe_buffer) * pipe_bufs, GFP_KERNEL); + } + if (pipe->bufs) { init_waitqueue_head(&pipe->wait); pipe->r_counter = pipe->w_counter = 1; - pipe->buffers = PIPE_DEF_BUFFERS; + pipe->buffers = pipe_bufs; + pipe->user = user; + account_pipe_buffers(pipe, 0, pipe_bufs); mutex_init(&pipe->mutex); return pipe; } + free_uid(user); kfree(pipe); } @@ -607,6 +642,8 @@ void free_pipe_info(struct pipe_inode_info *pipe) { int i; + account_pipe_buffers(pipe, pipe->buffers, 0); + free_uid(pipe->user); for (i = 0; i < pipe->buffers; i++) { struct pipe_buffer *buf = pipe->bufs + i; if (buf->ops) @@ -998,6 +1035,7 @@ static long pipe_set_size(struct pipe_inode_info *pipe, unsigned long nr_pages) memcpy(bufs + head, pipe->bufs, tail * 
sizeof(struct pipe_buffer)); } + account_pipe_buffers(pipe, pipe->buffers, nr_pages); pipe->curbuf = 0; kfree(pipe->bufs); pipe->bufs = bufs; @@ -1069,6 +1107,11 @@ long pipe_fcntl(struct file *file, unsigned int cmd, unsigned long arg) if (!capable(CAP_SYS_RESOURCE) && size > pipe_max_size) { ret = -EPERM; goto out; + } else if ((too_many_pipe_buffers_hard(pipe->user) || + too_many_pipe_buffers_soft(pipe->user)) && + !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) { + ret = -EPERM; + goto out; } ret = pipe_set_size(pipe, nr_pages); break; diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 92e6726f6e37..a939f5ed7f89 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -552,9 +552,9 @@ static int open_kcore(struct inode *inode, struct file *filp) if (kcore_need_update) kcore_update_ram(); if (i_size_read(inode) != proc_root_kcore->size) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); i_size_write(inode, proc_root_kcore->size); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; } diff --git a/fs/proc/self.c b/fs/proc/self.c index 67e8db442cf0..b6a8d3529fea 100644 --- a/fs/proc/self.c +++ b/fs/proc/self.c @@ -50,7 +50,7 @@ int proc_setup_self(struct super_block *s) struct pid_namespace *ns = s->s_fs_info; struct dentry *self; - mutex_lock(&root_inode->i_mutex); + inode_lock(root_inode); self = d_alloc_name(s->s_root, "self"); if (self) { struct inode *inode = new_inode_pseudo(s); @@ -69,7 +69,7 @@ int proc_setup_self(struct super_block *s) } else { self = ERR_PTR(-ENOMEM); } - mutex_unlock(&root_inode->i_mutex); + inode_unlock(root_inode); if (IS_ERR(self)) { pr_err("proc_fill_super: can't allocate /proc/self\n"); return PTR_ERR(self); diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 71ffc91060f6..85d16c67c33e 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -602,7 +602,8 @@ static int smaps_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge_lock(pmd, vma, &ptl)) { + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { smaps_pmd_entry(pmd, addr, walk); spin_unlock(ptl); return 0; @@ -913,7 +914,8 @@ static int clear_refs_pte_range(pmd_t *pmd, unsigned long addr, spinlock_t *ptl; struct page *page; - if (pmd_trans_huge_lock(pmd, vma, &ptl)) { + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { if (cp->type == CLEAR_REFS_SOFT_DIRTY) { clear_soft_dirty_pmd(vma, addr, pmd); goto out; @@ -1187,7 +1189,8 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, int err = 0; #ifdef CONFIG_TRANSPARENT_HUGEPAGE - if (pmd_trans_huge_lock(pmdp, vma, &ptl)) { + ptl = pmd_trans_huge_lock(pmdp, vma); + if (ptl) { u64 flags = 0, frame = 0; pmd_t pmd = *pmdp; @@ -1519,7 +1522,8 @@ static int gather_pte_stats(pmd_t *pmd, unsigned long addr, pte_t *orig_pte; pte_t *pte; - if (pmd_trans_huge_lock(pmd, vma, &ptl)) { + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { pte_t huge_pte = *(pte_t *)pmd; struct page *page; diff --git a/fs/proc/thread_self.c b/fs/proc/thread_self.c index 9eacd59e0360..e58a31e8fb2a 100644 --- a/fs/proc/thread_self.c +++ b/fs/proc/thread_self.c @@ -52,7 +52,7 @@ int proc_setup_thread_self(struct super_block *s) struct pid_namespace *ns = s->s_fs_info; struct dentry *thread_self; - mutex_lock(&root_inode->i_mutex); + inode_lock(root_inode); thread_self = d_alloc_name(s->s_root, "thread-self"); if (thread_self) { struct inode *inode = new_inode_pseudo(s); @@ -71,7 +71,7 @@ int proc_setup_thread_self(struct super_block *s) } else { thread_self = 
ERR_PTR(-ENOMEM); } - mutex_unlock(&root_inode->i_mutex); + inode_unlock(root_inode); if (IS_ERR(thread_self)) { pr_err("proc_fill_super: can't allocate /proc/thread_self\n"); return PTR_ERR(thread_self); diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index d8c439d813ce..dc645b66cd79 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -377,7 +377,7 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, break; } - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); dentry = d_alloc_name(root, name); if (!dentry) @@ -397,12 +397,12 @@ int pstore_mkfile(enum pstore_type_id type, char *psname, u64 id, int count, list_add(&private->list, &allpstore); spin_unlock_irqrestore(&allpstore_lock, flags); - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); return 0; fail_lockedalloc: - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); kfree(private); fail_alloc: iput(inode); diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index fbd70af98820..3c3b81bb6dfe 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -682,9 +682,9 @@ int dquot_quota_sync(struct super_block *sb, int type) continue; if (!sb_has_quota_active(sb, cnt)) continue; - mutex_lock(&dqopt->files[cnt]->i_mutex); + inode_lock(dqopt->files[cnt]); truncate_inode_pages(&dqopt->files[cnt]->i_data, 0); - mutex_unlock(&dqopt->files[cnt]->i_mutex); + inode_unlock(dqopt->files[cnt]); } mutex_unlock(&dqopt->dqonoff_mutex); @@ -2162,12 +2162,12 @@ int dquot_disable(struct super_block *sb, int type, unsigned int flags) /* If quota was reenabled in the meantime, we have * nothing to do */ if (!sb_has_quota_loaded(sb, cnt)) { - mutex_lock(&toputinode[cnt]->i_mutex); + inode_lock(toputinode[cnt]); toputinode[cnt]->i_flags &= ~(S_IMMUTABLE | S_NOATIME | S_NOQUOTA); truncate_inode_pages(&toputinode[cnt]->i_data, 0); - mutex_unlock(&toputinode[cnt]->i_mutex); + inode_unlock(toputinode[cnt]); mark_inode_dirty_sync(toputinode[cnt]); } mutex_unlock(&dqopt->dqonoff_mutex); @@ -2258,11 +2258,11 @@ static int vfs_load_quota_inode(struct inode *inode, int type, int format_id, /* We don't want quota and atime on quota files (deadlocks * possible) Also nobody should write to the file - we use * special IO operations which ignore the immutable bit. 
*/ - mutex_lock(&inode->i_mutex); + inode_lock(inode); oldflags = inode->i_flags & (S_NOATIME | S_IMMUTABLE | S_NOQUOTA); inode->i_flags |= S_NOQUOTA | S_NOATIME | S_IMMUTABLE; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); /* * When S_NOQUOTA is set, remove dquot references as no more * references can be added @@ -2305,12 +2305,12 @@ out_file_init: iput(inode); out_lock: if (oldflags != -1) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Set the flags back (in the case of accidental quotaon() * on a wrong file we don't want to mess up the flags) */ inode->i_flags &= ~(S_NOATIME | S_NOQUOTA | S_IMMUTABLE); inode->i_flags |= oldflags; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } mutex_unlock(&dqopt->dqonoff_mutex); out_fmt: @@ -2430,9 +2430,9 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name, struct dentry *dentry; int error; - mutex_lock(&d_inode(sb->s_root)->i_mutex); + inode_lock(d_inode(sb->s_root)); dentry = lookup_one_len(qf_name, sb->s_root, strlen(qf_name)); - mutex_unlock(&d_inode(sb->s_root)->i_mutex); + inode_unlock(d_inode(sb->s_root)); if (IS_ERR(dentry)) return PTR_ERR(dentry); diff --git a/fs/read_write.c b/fs/read_write.c index 06b07d5a08fe..324ec271cc4e 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -238,7 +238,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file_inode(file); loff_t retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); switch (whence) { case SEEK_END: offset += i_size_read(inode); @@ -283,7 +283,7 @@ loff_t default_llseek(struct file *file, loff_t offset, int whence) retval = offset; } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return retval; } EXPORT_SYMBOL(default_llseek); @@ -1656,6 +1656,9 @@ next_file: mnt_drop_write_file(dst_file); next_loop: fdput(dst_fd); + + if (fatal_signal_pending(current)) + goto out; } out: diff --git a/fs/readdir.c b/fs/readdir.c index ced679179cac..e69ef3b79787 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -44,7 +44,7 @@ int iterate_dir(struct file *file, struct dir_context *ctx) fsnotify_access(file); file_accessed(file); } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out: return res; } diff --git a/fs/reiserfs/dir.c b/fs/reiserfs/dir.c index 4a024e2ceb9f..3abd4004184b 100644 --- a/fs/reiserfs/dir.c +++ b/fs/reiserfs/dir.c @@ -38,11 +38,11 @@ static int reiserfs_dir_fsync(struct file *filp, loff_t start, loff_t end, if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); reiserfs_write_lock(inode->i_sb); err = reiserfs_commit_for_inode(inode); reiserfs_write_unlock(inode->i_sb); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (err < 0) return err; return 0; diff --git a/fs/reiserfs/file.c b/fs/reiserfs/file.c index 96a1bcf33db4..9424a4ba93a9 100644 --- a/fs/reiserfs/file.c +++ b/fs/reiserfs/file.c @@ -158,7 +158,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end, if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); BUG_ON(!S_ISREG(inode->i_mode)); err = sync_mapping_buffers(inode->i_mapping); reiserfs_write_lock(inode->i_sb); @@ -166,7 +166,7 @@ static int reiserfs_sync_file(struct file *filp, loff_t start, loff_t end, reiserfs_write_unlock(inode->i_sb); if (barrier_done != 1 && reiserfs_barrier_flush(inode->i_sb)) blkdev_issue_flush(inode->i_sb->s_bdev, GFP_KERNEL, NULL); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (barrier_done < 0) return barrier_done; return (err < 0) ? 
-EIO : 0; diff --git a/fs/reiserfs/ioctl.c b/fs/reiserfs/ioctl.c index 6ec8a30a0911..036a1fc0a8c3 100644 --- a/fs/reiserfs/ioctl.c +++ b/fs/reiserfs/ioctl.c @@ -224,7 +224,7 @@ out_unlock: page_cache_release(page); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); reiserfs_write_unlock(inode->i_sb); return retval; } diff --git a/fs/reiserfs/super.c b/fs/reiserfs/super.c index 05db7473bcb5..c0306ec8ed7b 100644 --- a/fs/reiserfs/super.c +++ b/fs/reiserfs/super.c @@ -288,7 +288,7 @@ static int finish_unfinished(struct super_block *s) pathrelse(&path); inode = reiserfs_iget(s, &obj_key); - if (!inode) { + if (IS_ERR_OR_NULL(inode)) { /* * the unlink almost completed, it just did not * manage to remove "save" link and release objectid diff --git a/fs/reiserfs/xattr.c b/fs/reiserfs/xattr.c index e5ddb4e5ea94..57e0b2310532 100644 --- a/fs/reiserfs/xattr.c +++ b/fs/reiserfs/xattr.c @@ -64,14 +64,14 @@ #ifdef CONFIG_REISERFS_FS_XATTR static int xattr_create(struct inode *dir, struct dentry *dentry, int mode) { - BUG_ON(!mutex_is_locked(&dir->i_mutex)); + BUG_ON(!inode_is_locked(dir)); return dir->i_op->create(dir, dentry, mode, true); } #endif static int xattr_mkdir(struct inode *dir, struct dentry *dentry, umode_t mode) { - BUG_ON(!mutex_is_locked(&dir->i_mutex)); + BUG_ON(!inode_is_locked(dir)); return dir->i_op->mkdir(dir, dentry, mode); } @@ -85,11 +85,11 @@ static int xattr_unlink(struct inode *dir, struct dentry *dentry) { int error; - BUG_ON(!mutex_is_locked(&dir->i_mutex)); + BUG_ON(!inode_is_locked(dir)); - mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); error = dir->i_op->unlink(dir, dentry); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); if (!error) d_delete(dentry); @@ -100,13 +100,13 @@ static int xattr_rmdir(struct inode *dir, struct dentry *dentry) { int error; - BUG_ON(!mutex_is_locked(&dir->i_mutex)); + BUG_ON(!inode_is_locked(dir)); - mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(d_inode(dentry), I_MUTEX_CHILD); error = dir->i_op->rmdir(dir, dentry); if (!error) d_inode(dentry)->i_flags |= S_DEAD; - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); if (!error) d_delete(dentry); @@ -123,7 +123,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags) if (d_really_is_negative(privroot)) return ERR_PTR(-ENODATA); - mutex_lock_nested(&d_inode(privroot)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(privroot), I_MUTEX_XATTR); xaroot = dget(REISERFS_SB(sb)->xattr_root); if (!xaroot) @@ -139,7 +139,7 @@ static struct dentry *open_xa_root(struct super_block *sb, int flags) } } - mutex_unlock(&d_inode(privroot)->i_mutex); + inode_unlock(d_inode(privroot)); return xaroot; } @@ -156,7 +156,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags) le32_to_cpu(INODE_PKEY(inode)->k_objectid), inode->i_generation); - mutex_lock_nested(&d_inode(xaroot)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(xaroot), I_MUTEX_XATTR); xadir = lookup_one_len(namebuf, xaroot, strlen(namebuf)); if (!IS_ERR(xadir) && d_really_is_negative(xadir)) { @@ -170,7 +170,7 @@ static struct dentry *open_xa_dir(const struct inode *inode, int flags) } } - mutex_unlock(&d_inode(xaroot)->i_mutex); + inode_unlock(d_inode(xaroot)); dput(xaroot); return xadir; } @@ -195,7 +195,7 @@ fill_with_dentries(struct dir_context *ctx, const char *name, int namelen, container_of(ctx, struct reiserfs_dentry_buf, ctx); struct 
dentry *dentry; - WARN_ON_ONCE(!mutex_is_locked(&d_inode(dbuf->xadir)->i_mutex)); + WARN_ON_ONCE(!inode_is_locked(d_inode(dbuf->xadir))); if (dbuf->count == ARRAY_SIZE(dbuf->dentries)) return -ENOSPC; @@ -254,7 +254,7 @@ static int reiserfs_for_each_xattr(struct inode *inode, goto out_dir; } - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(dir), I_MUTEX_XATTR); buf.xadir = dir; while (1) { @@ -276,7 +276,7 @@ static int reiserfs_for_each_xattr(struct inode *inode, break; buf.count = 0; } - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); cleanup_dentry_buf(&buf); @@ -298,13 +298,13 @@ static int reiserfs_for_each_xattr(struct inode *inode, if (!err) { int jerror; - mutex_lock_nested(&d_inode(dir->d_parent)->i_mutex, + inode_lock_nested(d_inode(dir->d_parent), I_MUTEX_XATTR); err = action(dir, data); reiserfs_write_lock(inode->i_sb); jerror = journal_end(&th); reiserfs_write_unlock(inode->i_sb); - mutex_unlock(&d_inode(dir->d_parent)->i_mutex); + inode_unlock(d_inode(dir->d_parent)); err = jerror ?: err; } } @@ -384,7 +384,7 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name, if (IS_ERR(xadir)) return ERR_CAST(xadir); - mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR); xafile = lookup_one_len(name, xadir, strlen(name)); if (IS_ERR(xafile)) { err = PTR_ERR(xafile); @@ -404,7 +404,7 @@ static struct dentry *xattr_lookup(struct inode *inode, const char *name, if (err) dput(xafile); out: - mutex_unlock(&d_inode(xadir)->i_mutex); + inode_unlock(d_inode(xadir)); dput(xadir); if (err) return ERR_PTR(err); @@ -469,7 +469,7 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name) if (IS_ERR(xadir)) return PTR_ERR(xadir); - mutex_lock_nested(&d_inode(xadir)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(xadir), I_MUTEX_XATTR); dentry = lookup_one_len(name, xadir, strlen(name)); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); @@ -483,7 +483,7 @@ static int lookup_and_delete_xattr(struct inode *inode, const char *name) dput(dentry); out_dput: - mutex_unlock(&d_inode(xadir)->i_mutex); + inode_unlock(d_inode(xadir)); dput(xadir); return err; } @@ -580,11 +580,11 @@ reiserfs_xattr_set_handle(struct reiserfs_transaction_handle *th, .ia_valid = ATTR_SIZE | ATTR_CTIME, }; - mutex_lock_nested(&d_inode(dentry)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(dentry), I_MUTEX_XATTR); inode_dio_wait(d_inode(dentry)); err = reiserfs_setattr(dentry, &newattrs); - mutex_unlock(&d_inode(dentry)->i_mutex); + inode_unlock(d_inode(dentry)); } else update_ctime(inode); out_unlock: @@ -888,9 +888,9 @@ ssize_t reiserfs_listxattr(struct dentry * dentry, char *buffer, size_t size) goto out; } - mutex_lock_nested(&d_inode(dir)->i_mutex, I_MUTEX_XATTR); + inode_lock_nested(d_inode(dir), I_MUTEX_XATTR); err = reiserfs_readdir_inode(d_inode(dir), &buf.ctx); - mutex_unlock(&d_inode(dir)->i_mutex); + inode_unlock(d_inode(dir)); if (!err) err = buf.pos; @@ -905,7 +905,7 @@ static int create_privroot(struct dentry *dentry) int err; struct inode *inode = d_inode(dentry->d_parent); - WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); + WARN_ON_ONCE(!inode_is_locked(inode)); err = xattr_mkdir(inode, dentry, 0700); if (err || d_really_is_negative(dentry)) { @@ -995,7 +995,7 @@ int reiserfs_lookup_privroot(struct super_block *s) int err = 0; /* If we don't have the privroot located yet - go find it */ - mutex_lock(&d_inode(s->s_root)->i_mutex); + 
inode_lock(d_inode(s->s_root)); dentry = lookup_one_len(PRIVROOT_NAME, s->s_root, strlen(PRIVROOT_NAME)); if (!IS_ERR(dentry)) { @@ -1005,7 +1005,7 @@ int reiserfs_lookup_privroot(struct super_block *s) d_inode(dentry)->i_flags |= S_PRIVATE; } else err = PTR_ERR(dentry); - mutex_unlock(&d_inode(s->s_root)->i_mutex); + inode_unlock(d_inode(s->s_root)); return err; } @@ -1025,14 +1025,14 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) goto error; if (d_really_is_negative(privroot) && !(mount_flags & MS_RDONLY)) { - mutex_lock(&d_inode(s->s_root)->i_mutex); + inode_lock(d_inode(s->s_root)); err = create_privroot(REISERFS_SB(s)->priv_root); - mutex_unlock(&d_inode(s->s_root)->i_mutex); + inode_unlock(d_inode(s->s_root)); } if (d_really_is_positive(privroot)) { s->s_xattr = reiserfs_xattr_handlers; - mutex_lock(&d_inode(privroot)->i_mutex); + inode_lock(d_inode(privroot)); if (!REISERFS_SB(s)->xattr_root) { struct dentry *dentry; @@ -1043,7 +1043,7 @@ int reiserfs_xattr_init(struct super_block *s, int mount_flags) else err = PTR_ERR(dentry); } - mutex_unlock(&d_inode(privroot)->i_mutex); + inode_unlock(d_inode(privroot)); } error: diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index c66f2423e1f5..4a0e48f92104 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -84,9 +84,9 @@ static int tracefs_syscall_mkdir(struct inode *inode, struct dentry *dentry, umo * the files within the tracefs system. It is up to the individual * mkdir routine to handle races. */ - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); ret = tracefs_ops.mkdir(name); - mutex_lock(&inode->i_mutex); + inode_lock(inode); kfree(name); @@ -109,13 +109,13 @@ static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry) * This time we need to unlock not only the parent (inode) but * also the directory that is being deleted. 
*/ - mutex_unlock(&inode->i_mutex); - mutex_unlock(&dentry->d_inode->i_mutex); + inode_unlock(inode); + inode_unlock(dentry->d_inode); ret = tracefs_ops.rmdir(name); - mutex_lock_nested(&inode->i_mutex, I_MUTEX_PARENT); - mutex_lock(&dentry->d_inode->i_mutex); + inode_lock_nested(inode, I_MUTEX_PARENT); + inode_lock(dentry->d_inode); kfree(name); @@ -334,7 +334,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) if (!parent) parent = tracefs_mount->mnt_root; - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); dentry = lookup_one_len(name, parent, strlen(name)); if (!IS_ERR(dentry) && dentry->d_inode) { dput(dentry); @@ -342,7 +342,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) } if (IS_ERR(dentry)) { - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); simple_release_fs(&tracefs_mount, &tracefs_mount_count); } @@ -351,7 +351,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) static struct dentry *failed_creating(struct dentry *dentry) { - mutex_unlock(&dentry->d_parent->d_inode->i_mutex); + inode_unlock(dentry->d_parent->d_inode); dput(dentry); simple_release_fs(&tracefs_mount, &tracefs_mount_count); return NULL; @@ -359,7 +359,7 @@ static struct dentry *failed_creating(struct dentry *dentry) static struct dentry *end_creating(struct dentry *dentry) { - mutex_unlock(&dentry->d_parent->d_inode->i_mutex); + inode_unlock(dentry->d_parent->d_inode); return dentry; } @@ -544,9 +544,9 @@ void tracefs_remove(struct dentry *dentry) if (!parent || !parent->d_inode) return; - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); ret = __tracefs_remove(dentry, parent); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); if (!ret) simple_release_fs(&tracefs_mount, &tracefs_mount_count); } @@ -572,7 +572,7 @@ void tracefs_remove_recursive(struct dentry *dentry) parent = dentry; down: - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); loop: /* * The parent->d_subdirs is protected by the d_lock. 
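The tracefs_syscall_rmdir() hunk above is a good illustration of the lock-ordering annotations the new helpers carry. A minimal sketch of the drop-and-relock pattern, assuming a hypothetical instance_teardown() callback rather than the real tracefs_ops wiring:

static int example_rmdir(struct inode *parent, struct dentry *dentry)
{
	int ret;

	/* Both locks are held on entry; the callback may take inode
	 * locks itself, so drop them first. */
	inode_unlock(parent);
	inode_unlock(d_inode(dentry));

	ret = instance_teardown(dentry);	/* hypothetical */

	/* Re-take in parent-then-child order, telling lockdep that the
	 * parent is the outer lock. */
	inode_lock_nested(parent, I_MUTEX_PARENT);
	inode_lock(d_inode(dentry));
	return ret;
}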
Outside that @@ -587,7 +587,7 @@ void tracefs_remove_recursive(struct dentry *dentry) /* perhaps simple_empty(child) makes more sense */ if (!list_empty(&child->d_subdirs)) { spin_unlock(&parent->d_lock); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); parent = child; goto down; } @@ -608,10 +608,10 @@ void tracefs_remove_recursive(struct dentry *dentry) } spin_unlock(&parent->d_lock); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); child = parent; parent = parent->d_parent; - mutex_lock(&parent->d_inode->i_mutex); + inode_lock(parent->d_inode); if (child != dentry) /* go up */ @@ -619,7 +619,7 @@ void tracefs_remove_recursive(struct dentry *dentry) if (!__tracefs_remove(child, parent)) simple_release_fs(&tracefs_mount, &tracefs_mount_count); - mutex_unlock(&parent->d_inode->i_mutex); + inode_unlock(parent->d_inode); } /** diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index e49bd2808bf3..795992a8321e 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -515,8 +515,8 @@ static int ubifs_link(struct dentry *old_dentry, struct inode *dir, dbg_gen("dent '%pd' to ino %lu (nlink %d) in dir ino %lu", dentry, inode->i_ino, inode->i_nlink, dir->i_ino); - ubifs_assert(mutex_is_locked(&dir->i_mutex)); - ubifs_assert(mutex_is_locked(&inode->i_mutex)); + ubifs_assert(inode_is_locked(dir)); + ubifs_assert(inode_is_locked(inode)); err = dbg_check_synced_i_size(c, inode); if (err) @@ -572,8 +572,8 @@ static int ubifs_unlink(struct inode *dir, struct dentry *dentry) dbg_gen("dent '%pd' from ino %lu (nlink %d) in dir ino %lu", dentry, inode->i_ino, inode->i_nlink, dir->i_ino); - ubifs_assert(mutex_is_locked(&dir->i_mutex)); - ubifs_assert(mutex_is_locked(&inode->i_mutex)); + ubifs_assert(inode_is_locked(dir)); + ubifs_assert(inode_is_locked(inode)); err = dbg_check_synced_i_size(c, inode); if (err) return err; @@ -661,8 +661,8 @@ static int ubifs_rmdir(struct inode *dir, struct dentry *dentry) dbg_gen("directory '%pd', ino %lu in dir ino %lu", dentry, inode->i_ino, dir->i_ino); - ubifs_assert(mutex_is_locked(&dir->i_mutex)); - ubifs_assert(mutex_is_locked(&inode->i_mutex)); + ubifs_assert(inode_is_locked(dir)); + ubifs_assert(inode_is_locked(inode)); err = check_dir_empty(c, d_inode(dentry)); if (err) return err; @@ -996,10 +996,10 @@ static int ubifs_rename(struct inode *old_dir, struct dentry *old_dentry, dbg_gen("dent '%pd' ino %lu in dir ino %lu to dent '%pd' in dir ino %lu", old_dentry, old_inode->i_ino, old_dir->i_ino, new_dentry, new_dir->i_ino); - ubifs_assert(mutex_is_locked(&old_dir->i_mutex)); - ubifs_assert(mutex_is_locked(&new_dir->i_mutex)); + ubifs_assert(inode_is_locked(old_dir)); + ubifs_assert(inode_is_locked(new_dir)); if (unlink) - ubifs_assert(mutex_is_locked(&new_inode->i_mutex)); + ubifs_assert(inode_is_locked(new_inode)); if (unlink && is_dir) { diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index eff62801acbf..065c88f8e4b8 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -1317,7 +1317,7 @@ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) err = filemap_write_and_wait_range(inode->i_mapping, start, end); if (err) return err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* Synchronize the inode unless this is a 'datasync()' call. 
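As the ubifs assertions above show, inode_is_locked() is the read-only counterpart of inode_lock(), intended for sanity checks. A hedged sketch of the idiom in a hypothetical directory operation:

static int example_unlink(struct inode *dir, struct dentry *dentry)
{
	/* The VFS must hold both inode locks when calling ->unlink(). */
	WARN_ON_ONCE(!inode_is_locked(dir));
	WARN_ON_ONCE(!inode_is_locked(d_inode(dentry)));
	/* ... actual unlink work ... */
	return 0;
}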
*/ if (!datasync || (inode->i_state & I_DIRTY_DATASYNC)) { @@ -1332,7 +1332,7 @@ int ubifs_fsync(struct file *file, loff_t start, loff_t end, int datasync) */ err = ubifs_sync_wbufs_by_inode(c, inode); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err; } diff --git a/fs/ubifs/xattr.c b/fs/ubifs/xattr.c index e53292d0c21b..c7f4d434d098 100644 --- a/fs/ubifs/xattr.c +++ b/fs/ubifs/xattr.c @@ -313,7 +313,7 @@ static int setxattr(struct inode *host, const char *name, const void *value, union ubifs_key key; int err, type; - ubifs_assert(mutex_is_locked(&host->i_mutex)); + ubifs_assert(inode_is_locked(host)); if (size > UBIFS_MAX_INO_DATA) return -ERANGE; @@ -550,7 +550,7 @@ int ubifs_removexattr(struct dentry *dentry, const char *name) dbg_gen("xattr '%s', ino %lu ('%pd')", name, host->i_ino, dentry); - ubifs_assert(mutex_is_locked(&host->i_mutex)); + ubifs_assert(inode_is_locked(host)); err = check_namespace(&nm); if (err < 0) diff --git a/fs/udf/file.c b/fs/udf/file.c index bddf3d071dae..1af98963d860 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -122,7 +122,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct udf_inode_info *iinfo = UDF_I(inode); int err; - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = generic_write_checks(iocb, from); if (retval <= 0) @@ -136,7 +136,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) (udf_file_entry_alloc_offset(inode) + end)) { err = udf_expand_file_adinicb(inode); if (err) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); udf_debug("udf_expand_adinicb: err=%d\n", err); return err; } @@ -149,7 +149,7 @@ static ssize_t udf_file_write_iter(struct kiocb *iocb, struct iov_iter *from) retval = __generic_file_write_iter(iocb, from); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (retval > 0) { mark_inode_dirty(inode); @@ -223,12 +223,12 @@ static int udf_release_file(struct inode *inode, struct file *filp) * Grab i_mutex to avoid races with writes changing i_size * while we are running. 
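The ubifs_fsync() conversion completed just above follows the common shape of an ->fsync() implementation under the new helpers. A minimal sketch, assuming a hypothetical example_write_inode() and simplified error handling:

static int example_fsync(struct file *file, loff_t start, loff_t end,
			 int datasync)
{
	struct inode *inode = file->f_mapping->host;
	int err;

	err = filemap_write_and_wait_range(inode->i_mapping, start, end);
	if (err)
		return err;

	inode_lock(inode);
	if (!datasync || (inode->i_state & I_DIRTY_DATASYNC))
		err = example_write_inode(inode);	/* hypothetical */
	inode_unlock(inode);
	return err;
}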
*/ - mutex_lock(&inode->i_mutex); + inode_lock(inode); down_write(&UDF_I(inode)->i_data_sem); udf_discard_prealloc(inode); udf_truncate_tail_extent(inode); up_write(&UDF_I(inode)->i_data_sem); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } return 0; } diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 87dc16d15572..166d3ed32c39 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -262,7 +262,7 @@ int udf_expand_file_adinicb(struct inode *inode) .nr_to_write = 1, }; - WARN_ON_ONCE(!mutex_is_locked(&inode->i_mutex)); + WARN_ON_ONCE(!inode_is_locked(inode)); if (!iinfo->i_lenAlloc) { if (UDF_QUERY_FLAG(inode->i_sb, UDF_FLAG_USE_SHORT_AD)) iinfo->i_alloc_type = ICBTAG_FLAG_AD_SHORT; diff --git a/fs/udf/super.c b/fs/udf/super.c index 0fbb4c7c72e8..a522c15a0bfd 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -279,17 +279,12 @@ static void udf_sb_free_bitmap(struct udf_bitmap *bitmap) { int i; int nr_groups = bitmap->s_nr_groups; - int size = sizeof(struct udf_bitmap) + (sizeof(struct buffer_head *) * - nr_groups); for (i = 0; i < nr_groups; i++) if (bitmap->s_block_bitmap[i]) brelse(bitmap->s_block_bitmap[i]); - if (size <= PAGE_SIZE) - kfree(bitmap); - else - vfree(bitmap); + kvfree(bitmap); } static void udf_free_partition(struct udf_part_map *map) diff --git a/fs/utimes.c b/fs/utimes.c index aa138d64560a..85c40f4f373d 100644 --- a/fs/utimes.c +++ b/fs/utimes.c @@ -103,9 +103,9 @@ static int utimes_common(struct path *path, struct timespec *times) } } retry_deleg: - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = notify_change(path->dentry, &newattrs, &delegated_inode); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (delegated_inode) { error = break_deleg_wait(&delegated_inode); if (!error) diff --git a/fs/xattr.c b/fs/xattr.c index d5dd6c8b82a7..07d0e47f6a7f 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -129,7 +129,7 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value, if (error) return error; - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = security_inode_setxattr(dentry, name, value, size, flags); if (error) goto out; @@ -137,7 +137,7 @@ vfs_setxattr(struct dentry *dentry, const char *name, const void *value, error = __vfs_setxattr_noperm(dentry, name, value, size, flags); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } EXPORT_SYMBOL_GPL(vfs_setxattr); @@ -277,7 +277,7 @@ vfs_removexattr(struct dentry *dentry, const char *name) if (error) return error; - mutex_lock(&inode->i_mutex); + inode_lock(inode); error = security_inode_removexattr(dentry, name); if (error) goto out; @@ -290,7 +290,7 @@ vfs_removexattr(struct dentry *dentry, const char *name) } out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } EXPORT_SYMBOL_GPL(vfs_removexattr); diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index e2536bb1c760..dc97eb21af07 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -984,8 +984,6 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) /* * Values for di_flags - * There should be a one-to-one correspondence between these flags and the - * XFS_XFLAG_s. 
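The udf_sb_free_bitmap() change above works because kvfree() accepts memory from either allocator, so the free path no longer has to recompute how the buffer was obtained. A sketch of the pairing, assuming the usual kmalloc-with-vmalloc-fallback allocation:

	if (size <= PAGE_SIZE)
		bitmap = kmalloc(size, GFP_KERNEL);
	else
		bitmap = vmalloc(size);
	/* ... use bitmap ... */
	kvfree(bitmap);		/* correct for both allocators */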
*/ #define XFS_DIFLAG_REALTIME_BIT 0 /* file's blocks come from rt area */ #define XFS_DIFLAG_PREALLOC_BIT 1 /* file space has been preallocated */ @@ -1026,6 +1024,15 @@ static inline void xfs_dinode_put_rdev(struct xfs_dinode *dip, xfs_dev_t rdev) XFS_DIFLAG_EXTSZINHERIT | XFS_DIFLAG_NODEFRAG | XFS_DIFLAG_FILESTREAM) /* + * Values for di_flags2 These start by being exposed to userspace in the upper + * 16 bits of the XFS_XFLAG_s range. + */ +#define XFS_DIFLAG2_DAX_BIT 0 /* use DAX for this inode */ +#define XFS_DIFLAG2_DAX (1 << XFS_DIFLAG2_DAX_BIT) + +#define XFS_DIFLAG2_ANY (XFS_DIFLAG2_DAX) + +/* * Inode number format: * low inopblog bits - offset in block * next agblklog bits - block number in ag diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index b2b73a998d42..fffe3d01bd9f 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -36,40 +36,6 @@ struct dioattr { #endif /* - * Structure for XFS_IOC_FSGETXATTR[A] and XFS_IOC_FSSETXATTR. - */ -#ifndef HAVE_FSXATTR -struct fsxattr { - __u32 fsx_xflags; /* xflags field value (get/set) */ - __u32 fsx_extsize; /* extsize field value (get/set)*/ - __u32 fsx_nextents; /* nextents field value (get) */ - __u32 fsx_projid; /* project identifier (get/set) */ - unsigned char fsx_pad[12]; -}; -#endif - -/* - * Flags for the bs_xflags/fsx_xflags field - * There should be a one-to-one correspondence between these flags and the - * XFS_DIFLAG_s. - */ -#define XFS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */ -#define XFS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */ -#define XFS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */ -#define XFS_XFLAG_APPEND 0x00000010 /* all writes append */ -#define XFS_XFLAG_SYNC 0x00000020 /* all writes synchronous */ -#define XFS_XFLAG_NOATIME 0x00000040 /* do not update access time */ -#define XFS_XFLAG_NODUMP 0x00000080 /* do not include in backups */ -#define XFS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */ -#define XFS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ -#define XFS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */ -#define XFS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ -#define XFS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ -#define XFS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ -#define XFS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ -#define XFS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ - -/* * Structure for XFS_IOC_GETBMAP. 
* On input, fill in bmv_offset and bmv_length of the first structure * to indicate the area of interest in the file, and bmv_entries with @@ -514,8 +480,8 @@ typedef struct xfs_swapext #define XFS_IOC_ALLOCSP _IOW ('X', 10, struct xfs_flock64) #define XFS_IOC_FREESP _IOW ('X', 11, struct xfs_flock64) #define XFS_IOC_DIOINFO _IOR ('X', 30, struct dioattr) -#define XFS_IOC_FSGETXATTR _IOR ('X', 31, struct fsxattr) -#define XFS_IOC_FSSETXATTR _IOW ('X', 32, struct fsxattr) +#define XFS_IOC_FSGETXATTR FS_IOC_FSGETXATTR +#define XFS_IOC_FSSETXATTR FS_IOC_FSSETXATTR #define XFS_IOC_ALLOCSP64 _IOW ('X', 36, struct xfs_flock64) #define XFS_IOC_FREESP64 _IOW ('X', 37, struct xfs_flock64) #define XFS_IOC_GETBMAP _IOWR('X', 38, struct getbmap) diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index daed4bfb85b2..435c7de42e5f 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -1527,6 +1527,16 @@ xfs_wait_buftarg( LIST_HEAD(dispose); int loop = 0; + /* + * We need to flush the buffer workqueue to ensure that all IO + * completion processing is 100% done. Just waiting on buffer locks is + * not sufficient for async IO as the reference count held over IO is + * not released until after the buffer lock is dropped. Hence we need to + * ensure here that all reference counts have been dropped before we + * start walking the LRU list. + */ + drain_workqueue(btp->bt_mount->m_buf_workqueue); + /* loop until there is nothing left on the lru list. */ while (list_lru_count(&btp->bt_lru)) { list_lru_walk(&btp->bt_lru, xfs_buftarg_wait_rele, diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index ebe9b8290a70..52883ac3cf84 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -55,7 +55,7 @@ xfs_rw_ilock( int type) { if (type & XFS_IOLOCK_EXCL) - mutex_lock(&VFS_I(ip)->i_mutex); + inode_lock(VFS_I(ip)); xfs_ilock(ip, type); } @@ -66,7 +66,7 @@ xfs_rw_iunlock( { xfs_iunlock(ip, type); if (type & XFS_IOLOCK_EXCL) - mutex_unlock(&VFS_I(ip)->i_mutex); + inode_unlock(VFS_I(ip)); } static inline void @@ -76,7 +76,7 @@ xfs_rw_ilock_demote( { xfs_ilock_demote(ip, type); if (type & XFS_IOLOCK_EXCL) - mutex_unlock(&VFS_I(ip)->i_mutex); + inode_unlock(VFS_I(ip)); } /* @@ -1610,9 +1610,8 @@ xfs_filemap_pmd_fault( /* * pfn_mkwrite was originally intended to ensure we capture time stamp * updates on write faults. In reality, it's needed to serialise against - * truncate similar to page_mkwrite. Hence we open-code dax_pfn_mkwrite() - * here and cycle the XFS_MMAPLOCK_SHARED to ensure we serialise the fault - * barrier in place. + * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED + * to ensure we serialise the fault barrier in place.
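With XFS_IOC_FSGETXATTR/XFS_IOC_FSSETXATTR aliased to the generic FS_IOC_* numbers above, user space can toggle the new per-inode DAX flag without XFS-specific headers. A hedged user-space sketch, assuming struct fsxattr, the FS_IOC_FS[GS]ETXATTR ioctls and FS_XFLAG_DAX are all exported via <linux/fs.h> as this series arranges:

#include <sys/ioctl.h>
#include <linux/fs.h>

static int set_dax(int fd)
{
	struct fsxattr fsx;

	if (ioctl(fd, FS_IOC_FSGETXATTR, &fsx) < 0)
		return -1;
	fsx.fsx_xflags |= FS_XFLAG_DAX;	/* lands in XFS_DIFLAG2_DAX on disk */
	return ioctl(fd, FS_IOC_FSSETXATTR, &fsx);
}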
*/ static int xfs_filemap_pfn_mkwrite( @@ -1635,6 +1634,8 @@ xfs_filemap_pfn_mkwrite( size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) ret = VM_FAULT_SIGBUS; + else if (IS_DAX(inode)) + ret = dax_pfn_mkwrite(vma, vmf); xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); sb_end_pagefault(inode->i_sb); return ret; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index ae3758a90ed6..ceba1a83cacc 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -610,60 +610,69 @@ __xfs_iflock( STATIC uint _xfs_dic2xflags( - __uint16_t di_flags) + __uint16_t di_flags, + uint64_t di_flags2, + bool has_attr) { uint flags = 0; if (di_flags & XFS_DIFLAG_ANY) { if (di_flags & XFS_DIFLAG_REALTIME) - flags |= XFS_XFLAG_REALTIME; + flags |= FS_XFLAG_REALTIME; if (di_flags & XFS_DIFLAG_PREALLOC) - flags |= XFS_XFLAG_PREALLOC; + flags |= FS_XFLAG_PREALLOC; if (di_flags & XFS_DIFLAG_IMMUTABLE) - flags |= XFS_XFLAG_IMMUTABLE; + flags |= FS_XFLAG_IMMUTABLE; if (di_flags & XFS_DIFLAG_APPEND) - flags |= XFS_XFLAG_APPEND; + flags |= FS_XFLAG_APPEND; if (di_flags & XFS_DIFLAG_SYNC) - flags |= XFS_XFLAG_SYNC; + flags |= FS_XFLAG_SYNC; if (di_flags & XFS_DIFLAG_NOATIME) - flags |= XFS_XFLAG_NOATIME; + flags |= FS_XFLAG_NOATIME; if (di_flags & XFS_DIFLAG_NODUMP) - flags |= XFS_XFLAG_NODUMP; + flags |= FS_XFLAG_NODUMP; if (di_flags & XFS_DIFLAG_RTINHERIT) - flags |= XFS_XFLAG_RTINHERIT; + flags |= FS_XFLAG_RTINHERIT; if (di_flags & XFS_DIFLAG_PROJINHERIT) - flags |= XFS_XFLAG_PROJINHERIT; + flags |= FS_XFLAG_PROJINHERIT; if (di_flags & XFS_DIFLAG_NOSYMLINKS) - flags |= XFS_XFLAG_NOSYMLINKS; + flags |= FS_XFLAG_NOSYMLINKS; if (di_flags & XFS_DIFLAG_EXTSIZE) - flags |= XFS_XFLAG_EXTSIZE; + flags |= FS_XFLAG_EXTSIZE; if (di_flags & XFS_DIFLAG_EXTSZINHERIT) - flags |= XFS_XFLAG_EXTSZINHERIT; + flags |= FS_XFLAG_EXTSZINHERIT; if (di_flags & XFS_DIFLAG_NODEFRAG) - flags |= XFS_XFLAG_NODEFRAG; + flags |= FS_XFLAG_NODEFRAG; if (di_flags & XFS_DIFLAG_FILESTREAM) - flags |= XFS_XFLAG_FILESTREAM; + flags |= FS_XFLAG_FILESTREAM; } + if (di_flags2 & XFS_DIFLAG2_ANY) { + if (di_flags2 & XFS_DIFLAG2_DAX) + flags |= FS_XFLAG_DAX; + } + + if (has_attr) + flags |= FS_XFLAG_HASATTR; + return flags; } uint xfs_ip2xflags( - xfs_inode_t *ip) + struct xfs_inode *ip) { - xfs_icdinode_t *dic = &ip->i_d; + struct xfs_icdinode *dic = &ip->i_d; - return _xfs_dic2xflags(dic->di_flags) | - (XFS_IFORK_Q(ip) ? XFS_XFLAG_HASATTR : 0); + return _xfs_dic2xflags(dic->di_flags, dic->di_flags2, XFS_IFORK_Q(ip)); } uint xfs_dic2xflags( - xfs_dinode_t *dip) + struct xfs_dinode *dip) { - return _xfs_dic2xflags(be16_to_cpu(dip->di_flags)) | - (XFS_DFORK_Q(dip) ? 
XFS_XFLAG_HASATTR : 0); + return _xfs_dic2xflags(be16_to_cpu(dip->di_flags), + be64_to_cpu(dip->di_flags2), XFS_DFORK_Q(dip)); } /* @@ -862,7 +871,8 @@ xfs_ialloc( case S_IFREG: case S_IFDIR: if (pip && (pip->i_d.di_flags & XFS_DIFLAG_ANY)) { - uint di_flags = 0; + uint64_t di_flags2 = 0; + uint di_flags = 0; if (S_ISDIR(mode)) { if (pip->i_d.di_flags & XFS_DIFLAG_RTINHERIT) @@ -898,7 +908,11 @@ xfs_ialloc( di_flags |= XFS_DIFLAG_NODEFRAG; if (pip->i_d.di_flags & XFS_DIFLAG_FILESTREAM) di_flags |= XFS_DIFLAG_FILESTREAM; + if (pip->i_d.di_flags2 & XFS_DIFLAG2_DAX) + di_flags2 |= XFS_DIFLAG2_DAX; + ip->i_d.di_flags |= di_flags; + ip->i_d.di_flags2 |= di_flags2; } /* FALLTHROUGH */ case S_IFLNK: diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index d42738deec6d..478d04e07f95 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -859,25 +859,25 @@ xfs_merge_ioc_xflags( unsigned int xflags = start; if (flags & FS_IMMUTABLE_FL) - xflags |= XFS_XFLAG_IMMUTABLE; + xflags |= FS_XFLAG_IMMUTABLE; else - xflags &= ~XFS_XFLAG_IMMUTABLE; + xflags &= ~FS_XFLAG_IMMUTABLE; if (flags & FS_APPEND_FL) - xflags |= XFS_XFLAG_APPEND; + xflags |= FS_XFLAG_APPEND; else - xflags &= ~XFS_XFLAG_APPEND; + xflags &= ~FS_XFLAG_APPEND; if (flags & FS_SYNC_FL) - xflags |= XFS_XFLAG_SYNC; + xflags |= FS_XFLAG_SYNC; else - xflags &= ~XFS_XFLAG_SYNC; + xflags &= ~FS_XFLAG_SYNC; if (flags & FS_NOATIME_FL) - xflags |= XFS_XFLAG_NOATIME; + xflags |= FS_XFLAG_NOATIME; else - xflags &= ~XFS_XFLAG_NOATIME; + xflags &= ~FS_XFLAG_NOATIME; if (flags & FS_NODUMP_FL) - xflags |= XFS_XFLAG_NODUMP; + xflags |= FS_XFLAG_NODUMP; else - xflags &= ~XFS_XFLAG_NODUMP; + xflags &= ~FS_XFLAG_NODUMP; return xflags; } @@ -945,40 +945,51 @@ xfs_set_diflags( unsigned int xflags) { unsigned int di_flags; + uint64_t di_flags2; /* can't set PREALLOC this way, just preserve it */ di_flags = (ip->i_d.di_flags & XFS_DIFLAG_PREALLOC); - if (xflags & XFS_XFLAG_IMMUTABLE) + if (xflags & FS_XFLAG_IMMUTABLE) di_flags |= XFS_DIFLAG_IMMUTABLE; - if (xflags & XFS_XFLAG_APPEND) + if (xflags & FS_XFLAG_APPEND) di_flags |= XFS_DIFLAG_APPEND; - if (xflags & XFS_XFLAG_SYNC) + if (xflags & FS_XFLAG_SYNC) di_flags |= XFS_DIFLAG_SYNC; - if (xflags & XFS_XFLAG_NOATIME) + if (xflags & FS_XFLAG_NOATIME) di_flags |= XFS_DIFLAG_NOATIME; - if (xflags & XFS_XFLAG_NODUMP) + if (xflags & FS_XFLAG_NODUMP) di_flags |= XFS_DIFLAG_NODUMP; - if (xflags & XFS_XFLAG_NODEFRAG) + if (xflags & FS_XFLAG_NODEFRAG) di_flags |= XFS_DIFLAG_NODEFRAG; - if (xflags & XFS_XFLAG_FILESTREAM) + if (xflags & FS_XFLAG_FILESTREAM) di_flags |= XFS_DIFLAG_FILESTREAM; if (S_ISDIR(ip->i_d.di_mode)) { - if (xflags & XFS_XFLAG_RTINHERIT) + if (xflags & FS_XFLAG_RTINHERIT) di_flags |= XFS_DIFLAG_RTINHERIT; - if (xflags & XFS_XFLAG_NOSYMLINKS) + if (xflags & FS_XFLAG_NOSYMLINKS) di_flags |= XFS_DIFLAG_NOSYMLINKS; - if (xflags & XFS_XFLAG_EXTSZINHERIT) + if (xflags & FS_XFLAG_EXTSZINHERIT) di_flags |= XFS_DIFLAG_EXTSZINHERIT; - if (xflags & XFS_XFLAG_PROJINHERIT) + if (xflags & FS_XFLAG_PROJINHERIT) di_flags |= XFS_DIFLAG_PROJINHERIT; } else if (S_ISREG(ip->i_d.di_mode)) { - if (xflags & XFS_XFLAG_REALTIME) + if (xflags & FS_XFLAG_REALTIME) di_flags |= XFS_DIFLAG_REALTIME; - if (xflags & XFS_XFLAG_EXTSIZE) + if (xflags & FS_XFLAG_EXTSIZE) di_flags |= XFS_DIFLAG_EXTSIZE; } - ip->i_d.di_flags = di_flags; + + /* diflags2 only valid for v3 inodes. 
*/ + if (ip->i_d.di_version < 3) + return; + + di_flags2 = 0; + if (xflags & FS_XFLAG_DAX) + di_flags2 |= XFS_DIFLAG2_DAX; + + ip->i_d.di_flags2 = di_flags2; + } STATIC void @@ -988,22 +999,27 @@ xfs_diflags_to_linux( struct inode *inode = VFS_I(ip); unsigned int xflags = xfs_ip2xflags(ip); - if (xflags & XFS_XFLAG_IMMUTABLE) + if (xflags & FS_XFLAG_IMMUTABLE) inode->i_flags |= S_IMMUTABLE; else inode->i_flags &= ~S_IMMUTABLE; - if (xflags & XFS_XFLAG_APPEND) + if (xflags & FS_XFLAG_APPEND) inode->i_flags |= S_APPEND; else inode->i_flags &= ~S_APPEND; - if (xflags & XFS_XFLAG_SYNC) + if (xflags & FS_XFLAG_SYNC) inode->i_flags |= S_SYNC; else inode->i_flags &= ~S_SYNC; - if (xflags & XFS_XFLAG_NOATIME) + if (xflags & FS_XFLAG_NOATIME) inode->i_flags |= S_NOATIME; else inode->i_flags &= ~S_NOATIME; + if (xflags & FS_XFLAG_DAX) + inode->i_flags |= S_DAX; + else + inode->i_flags &= ~S_DAX; + } static int @@ -1016,11 +1032,11 @@ xfs_ioctl_setattr_xflags( /* Can't change realtime flag if any extents are allocated. */ if ((ip->i_d.di_nextents || ip->i_delayed_blks) && - XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & XFS_XFLAG_REALTIME)) + XFS_IS_REALTIME_INODE(ip) != (fa->fsx_xflags & FS_XFLAG_REALTIME)) return -EINVAL; /* If realtime flag is set then must have realtime device */ - if (fa->fsx_xflags & XFS_XFLAG_REALTIME) { + if (fa->fsx_xflags & FS_XFLAG_REALTIME) { if (mp->m_sb.sb_rblocks == 0 || mp->m_sb.sb_rextsize == 0 || (ip->i_d.di_extsize % mp->m_sb.sb_rextsize)) return -EINVAL; @@ -1031,7 +1047,7 @@ xfs_ioctl_setattr_xflags( * we have appropriate permission. */ if (((ip->i_d.di_flags & (XFS_DIFLAG_IMMUTABLE | XFS_DIFLAG_APPEND)) || - (fa->fsx_xflags & (XFS_XFLAG_IMMUTABLE | XFS_XFLAG_APPEND))) && + (fa->fsx_xflags & (FS_XFLAG_IMMUTABLE | FS_XFLAG_APPEND))) && !capable(CAP_LINUX_IMMUTABLE)) return -EPERM; @@ -1095,8 +1111,8 @@ out_cancel: * extent size hint validation is somewhat cumbersome. Rules are: * * 1. extent size hint is only valid for directories and regular files - * 2. XFS_XFLAG_EXTSIZE is only valid for regular files - * 3. XFS_XFLAG_EXTSZINHERIT is only valid for directories. + * 2. FS_XFLAG_EXTSIZE is only valid for regular files + * 3. FS_XFLAG_EXTSZINHERIT is only valid for directories. * 4. can only be changed on regular files if no extents are allocated * 5. can be changed on directories at any time * 6. extsize hint of 0 turns off hints, clears inode flags. 
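The extent size hint rules above are easiest to read from the caller's side; a hedged user-space sketch that sets a 1 MiB hint on a regular file (rules 1, 2 and 4 apply; error handling elided):

	struct fsxattr fsx;

	ioctl(fd, FS_IOC_FSGETXATTR, &fsx);
	fsx.fsx_extsize = 1024 * 1024;		/* bytes; must satisfy the
						 * alignment checks below */
	fsx.fsx_xflags |= FS_XFLAG_EXTSIZE;
	ioctl(fd, FS_IOC_FSSETXATTR, &fsx);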
@@ -1112,10 +1128,10 @@ xfs_ioctl_setattr_check_extsize( { struct xfs_mount *mp = ip->i_mount; - if ((fa->fsx_xflags & XFS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode)) + if ((fa->fsx_xflags & FS_XFLAG_EXTSIZE) && !S_ISREG(ip->i_d.di_mode)) return -EINVAL; - if ((fa->fsx_xflags & XFS_XFLAG_EXTSZINHERIT) && + if ((fa->fsx_xflags & FS_XFLAG_EXTSZINHERIT) && !S_ISDIR(ip->i_d.di_mode)) return -EINVAL; @@ -1132,7 +1148,7 @@ xfs_ioctl_setattr_check_extsize( return -EINVAL; if (XFS_IS_REALTIME_INODE(ip) || - (fa->fsx_xflags & XFS_XFLAG_REALTIME)) { + (fa->fsx_xflags & FS_XFLAG_REALTIME)) { size = mp->m_sb.sb_rextsize << mp->m_sb.sb_blocklog; } else { size = mp->m_sb.sb_blocksize; @@ -1143,7 +1159,7 @@ xfs_ioctl_setattr_check_extsize( if (fa->fsx_extsize % size) return -EINVAL; } else - fa->fsx_xflags &= ~(XFS_XFLAG_EXTSIZE | XFS_XFLAG_EXTSZINHERIT); + fa->fsx_xflags &= ~(FS_XFLAG_EXTSIZE | FS_XFLAG_EXTSZINHERIT); return 0; } @@ -1168,7 +1184,7 @@ xfs_ioctl_setattr_check_projid( if (xfs_get_projid(ip) != fa->fsx_projid) return -EINVAL; - if ((fa->fsx_xflags & XFS_XFLAG_PROJINHERIT) != + if ((fa->fsx_xflags & FS_XFLAG_PROJINHERIT) != (ip->i_d.di_flags & XFS_DIFLAG_PROJINHERIT)) return -EINVAL; diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 06eafafe636e..76b71a1c6c32 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -1205,8 +1205,8 @@ xfs_diflags_to_iflags( inode->i_flags |= S_SYNC; if (flags & XFS_DIFLAG_NOATIME) inode->i_flags |= S_NOATIME; - /* XXX: Also needs an on-disk per inode flag! */ - if (ip->i_mount->m_flags & XFS_MOUNT_DAX) + if (ip->i_mount->m_flags & XFS_MOUNT_DAX || + ip->i_d.di_flags2 & XFS_DIFLAG2_DAX) inode->i_flags |= S_DAX; } diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index dc6221942b85..ade236e90bb3 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -42,11 +42,11 @@ xfs_break_layouts( while ((error = break_layout(inode, false) == -EWOULDBLOCK)) { xfs_iunlock(ip, *iolock); if (with_imutex && (*iolock & XFS_IOLOCK_EXCL)) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); error = break_layout(inode, true); *iolock = XFS_IOLOCK_EXCL; if (with_imutex) - mutex_lock(&inode->i_mutex); + inode_lock(inode); xfs_ilock(ip, *iolock); } diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index aa67339b9537..4f18fd92ca13 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -497,7 +497,6 @@ xfsaild( long tout = 0; /* milliseconds */ current->flags |= PF_MEMALLOC; - set_freezable(); while (!kthread_should_stop()) { if (tout && tout <= 20) diff --git a/include/crypto/hash.h b/include/crypto/hash.h index 3d69c93d50e8..6361892ea737 100644 --- a/include/crypto/hash.h +++ b/include/crypto/hash.h @@ -204,6 +204,7 @@ struct crypto_ahash { unsigned int keylen); unsigned int reqsize; + bool has_setkey; struct crypto_tfm base; }; @@ -375,6 +376,11 @@ static inline void *ahash_request_ctx(struct ahash_request *req) int crypto_ahash_setkey(struct crypto_ahash *tfm, const u8 *key, unsigned int keylen); +static inline bool crypto_ahash_has_setkey(struct crypto_ahash *tfm) +{ + return tfm->has_setkey; +} + /** * crypto_ahash_finup() - update and finalize message digest * @req: reference to the ahash_request handle that holds all information diff --git a/include/crypto/if_alg.h b/include/crypto/if_alg.h index 018afb264ac2..a2bfd7843f18 100644 --- a/include/crypto/if_alg.h +++ b/include/crypto/if_alg.h @@ -30,6 +30,9 @@ struct alg_sock { struct sock *parent; + unsigned int refcnt; + unsigned int nokey_refcnt; + const struct af_alg_type *type; void 
*private; }; @@ -50,9 +53,11 @@ struct af_alg_type { void (*release)(void *private); int (*setkey)(void *private, const u8 *key, unsigned int keylen); int (*accept)(void *private, struct sock *sk); + int (*accept_nokey)(void *private, struct sock *sk); int (*setauthsize)(void *private, unsigned int authsize); struct proto_ops *ops; + struct proto_ops *ops_nokey; struct module *owner; char name[14]; }; @@ -67,6 +72,7 @@ int af_alg_register_type(const struct af_alg_type *type); int af_alg_unregister_type(const struct af_alg_type *type); int af_alg_release(struct socket *sock); +void af_alg_release_parent(struct sock *sk); int af_alg_accept(struct sock *sk, struct socket *newsock); int af_alg_make_sg(struct af_alg_sgl *sgl, struct iov_iter *iter, int len); @@ -83,11 +89,6 @@ static inline struct alg_sock *alg_sk(struct sock *sk) return (struct alg_sock *)sk; } -static inline void af_alg_release_parent(struct sock *sk) -{ - sock_put(alg_sk(sk)->parent); -} - static inline void af_alg_init_completion(struct af_alg_completion *completion) { init_completion(&completion->completion); diff --git a/include/crypto/skcipher.h b/include/crypto/skcipher.h index d8dd41fb034f..fd8742a40ff3 100644 --- a/include/crypto/skcipher.h +++ b/include/crypto/skcipher.h @@ -61,6 +61,8 @@ struct crypto_skcipher { unsigned int ivsize; unsigned int reqsize; + bool has_setkey; + struct crypto_tfm base; }; @@ -305,6 +307,11 @@ static inline int crypto_skcipher_setkey(struct crypto_skcipher *tfm, return tfm->setkey(tfm, key, keylen); } +static inline bool crypto_skcipher_has_setkey(struct crypto_skcipher *tfm) +{ + return tfm->has_setkey; +} + /** * crypto_skcipher_reqtfm() - obtain cipher handle from request * @req: skcipher_request out of which the cipher handle is to be obtained diff --git a/include/linux/aer.h b/include/linux/aer.h index 744b997d6a94..164049357e5c 100644 --- a/include/linux/aer.h +++ b/include/linux/aer.h @@ -7,6 +7,7 @@ #ifndef _AER_H_ #define _AER_H_ +#include <linux/errno.h> #include <linux/types.h> #define AER_NONFATAL 0 diff --git a/include/linux/bio.h b/include/linux/bio.h index b9b6e046b52e..5349e6816cbb 100644 --- a/include/linux/bio.h +++ b/include/linux/bio.h @@ -318,16 +318,6 @@ enum bip_flags { BIP_IP_CHECKSUM = 1 << 4, /* IP checksum */ }; -#if defined(CONFIG_BLK_DEV_INTEGRITY) - -static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) -{ - if (bio->bi_rw & REQ_INTEGRITY) - return bio->bi_integrity; - - return NULL; -} - /* * bio integrity payload */ @@ -349,6 +339,16 @@ struct bio_integrity_payload { struct bio_vec bip_inline_vecs[0];/* embedded bvec array */ }; +#if defined(CONFIG_BLK_DEV_INTEGRITY) + +static inline struct bio_integrity_payload *bio_integrity(struct bio *bio) +{ + if (bio->bi_rw & REQ_INTEGRITY) + return bio->bi_integrity; + + return NULL; +} + static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) { struct bio_integrity_payload *bip = bio_integrity(bio); @@ -795,6 +795,18 @@ static inline bool bio_integrity_flagged(struct bio *bio, enum bip_flags flag) return false; } +static inline void *bio_integrity_alloc(struct bio * bio, gfp_t gfp, + unsigned int nr) +{ + return ERR_PTR(-EINVAL); +} + +static inline int bio_integrity_add_page(struct bio *bio, struct page *page, + unsigned int len, unsigned int offset) +{ + return 0; +} + #endif /* CONFIG_BLK_DEV_INTEGRITY */ #endif /* CONFIG_BLOCK */ diff --git a/include/linux/blk-iopoll.h b/include/linux/blk-iopoll.h deleted file mode 100644 index 77ae77c0b704..000000000000 --- 
a/include/linux/blk-iopoll.h +++ /dev/null @@ -1,46 +0,0 @@ -#ifndef BLK_IOPOLL_H -#define BLK_IOPOLL_H - -struct blk_iopoll; -typedef int (blk_iopoll_fn)(struct blk_iopoll *, int); - -struct blk_iopoll { - struct list_head list; - unsigned long state; - unsigned long data; - int weight; - int max; - blk_iopoll_fn *poll; -}; - -enum { - IOPOLL_F_SCHED = 0, - IOPOLL_F_DISABLE = 1, -}; - -/* - * Returns 0 if we successfully set the IOPOLL_F_SCHED bit, indicating - * that we were the first to acquire this iop for scheduling. If this iop - * is currently disabled, return "failure". - */ -static inline int blk_iopoll_sched_prep(struct blk_iopoll *iop) -{ - if (!test_bit(IOPOLL_F_DISABLE, &iop->state)) - return test_and_set_bit(IOPOLL_F_SCHED, &iop->state); - - return 1; -} - -static inline int blk_iopoll_disable_pending(struct blk_iopoll *iop) -{ - return test_bit(IOPOLL_F_DISABLE, &iop->state); -} - -extern void blk_iopoll_sched(struct blk_iopoll *); -extern void blk_iopoll_init(struct blk_iopoll *, int, blk_iopoll_fn *); -extern void blk_iopoll_complete(struct blk_iopoll *); -extern void __blk_iopoll_complete(struct blk_iopoll *); -extern void blk_iopoll_enable(struct blk_iopoll *); -extern void blk_iopoll_disable(struct blk_iopoll *); - -#endif diff --git a/include/linux/blk_types.h b/include/linux/blk_types.h index 0fb65843ec1e..86a38ea1823f 100644 --- a/include/linux/blk_types.h +++ b/include/linux/blk_types.h @@ -188,7 +188,6 @@ enum rq_flag_bits { __REQ_PM, /* runtime pm request */ __REQ_HASHED, /* on IO scheduler merge hash */ __REQ_MQ_INFLIGHT, /* track inflight for MQ */ - __REQ_NO_TIMEOUT, /* requests may never expire */ __REQ_NR_BITS, /* stops here */ }; @@ -242,7 +241,6 @@ enum rq_flag_bits { #define REQ_PM (1ULL << __REQ_PM) #define REQ_HASHED (1ULL << __REQ_HASHED) #define REQ_MQ_INFLIGHT (1ULL << __REQ_MQ_INFLIGHT) -#define REQ_NO_TIMEOUT (1ULL << __REQ_NO_TIMEOUT) typedef unsigned int blk_qc_t; #define BLK_QC_T_NONE -1U diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index d372ea87ead5..29189aeace19 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -409,6 +409,7 @@ struct request_queue { unsigned int rq_timeout; struct timer_list timeout; + struct work_struct timeout_work; struct list_head timeout_list; struct list_head icq_list; diff --git a/include/linux/ceph/ceph_frag.h b/include/linux/ceph/ceph_frag.h index 5babb8e95352..b827e066e55a 100644 --- a/include/linux/ceph/ceph_frag.h +++ b/include/linux/ceph/ceph_frag.h @@ -40,46 +40,11 @@ static inline __u32 ceph_frag_mask_shift(__u32 f) return 24 - ceph_frag_bits(f); } -static inline int ceph_frag_contains_value(__u32 f, __u32 v) +static inline bool ceph_frag_contains_value(__u32 f, __u32 v) { return (v & ceph_frag_mask(f)) == ceph_frag_value(f); } -static inline int ceph_frag_contains_frag(__u32 f, __u32 sub) -{ - /* is sub as specific as us, and contained by us? 
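blk-iopoll is not dropped outright: it returns later in this series as the generic irq_poll API (include/linux/irq_poll.h). A hedged driver-side sketch of the replacement interface; example_irq() and process_one_completion() are hypothetical:

#include <linux/interrupt.h>
#include <linux/irq_poll.h>

static int example_poll(struct irq_poll *iop, int budget)
{
	int done = 0;

	while (done < budget && process_one_completion())	/* hypothetical */
		done++;

	if (done < budget)
		irq_poll_complete(iop);	/* drained: leave polled mode */
	return done;
}

static irqreturn_t example_irq(int irq, void *data)
{
	struct irq_poll *iop = data;

	irq_poll_sched(iop);	/* defer processing to IRQ_POLL_SOFTIRQ */
	return IRQ_HANDLED;
}

/* one-time setup, e.g. at probe time: irq_poll_init(&iop, 64, example_poll); */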
*/ - return ceph_frag_bits(sub) >= ceph_frag_bits(f) && - (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f); -} -static inline __u32 ceph_frag_parent(__u32 f) -{ - return ceph_frag_make(ceph_frag_bits(f) - 1, - ceph_frag_value(f) & (ceph_frag_mask(f) << 1)); -} -static inline int ceph_frag_is_left_child(__u32 f) -{ - return ceph_frag_bits(f) > 0 && - (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0; -} -static inline int ceph_frag_is_right_child(__u32 f) -{ - return ceph_frag_bits(f) > 0 && - (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1; -} -static inline __u32 ceph_frag_sibling(__u32 f) -{ - return ceph_frag_make(ceph_frag_bits(f), - ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f))); -} -static inline __u32 ceph_frag_left_child(__u32 f) -{ - return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f)); -} -static inline __u32 ceph_frag_right_child(__u32 f) -{ - return ceph_frag_make(ceph_frag_bits(f)+1, - ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f)))); -} static inline __u32 ceph_frag_make_child(__u32 f, int by, int i) { int newbits = ceph_frag_bits(f) + by; diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 71b1d6cdcb5d..8dbd7879fdc6 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -220,6 +220,7 @@ struct ceph_connection { struct ceph_entity_addr actual_peer_addr; /* message out temps */ + struct ceph_msg_header out_hdr; struct ceph_msg *out_msg; /* sending message (== tail of out_sent) */ bool out_msg_done; @@ -229,7 +230,6 @@ struct ceph_connection { int out_kvec_left; /* kvec's left in out_kvec */ int out_skip; /* skip this many bytes */ int out_kvec_bytes; /* total bytes left */ - bool out_kvec_is_msg; /* kvec refers to out_msg */ int out_more; /* there is more data after the kvecs */ __le64 out_temp_ack; /* for writing an ack */ struct ceph_timespec out_temp_keepalive2; /* for writing keepalive2 diff --git a/include/linux/dax.h b/include/linux/dax.h index b415e521528d..8204c3dc3800 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -36,4 +36,11 @@ static inline bool vma_is_dax(struct vm_area_struct *vma) { return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host); } + +static inline bool dax_mapping(struct address_space *mapping) +{ + return mapping->host && IS_DAX(mapping->host); +} +int dax_writeback_mapping_range(struct address_space *mapping, loff_t start, + loff_t end); #endif diff --git a/include/linux/drbd.h b/include/linux/drbd.h index 8723f2a99e15..d6b3c9943a2c 100644 --- a/include/linux/drbd.h +++ b/include/linux/drbd.h @@ -25,7 +25,6 @@ */ #ifndef DRBD_H #define DRBD_H -#include <linux/connector.h> #include <asm/types.h> #ifdef __KERNEL__ @@ -52,7 +51,7 @@ #endif extern const char *drbd_buildtag(void); -#define REL_VERSION "8.4.5" +#define REL_VERSION "8.4.6" #define API_VERSION 1 #define PRO_VERSION_MIN 86 #define PRO_VERSION_MAX 101 @@ -339,6 +338,8 @@ enum drbd_state_rv { #define MDF_AL_CLEAN (1 << 7) #define MDF_AL_DISABLED (1 << 8) +#define MAX_PEERS 32 + enum drbd_uuid_index { UI_CURRENT, UI_BITMAP, @@ -349,14 +350,35 @@ enum drbd_uuid_index { UI_EXTENDED_SIZE /* Everything. 
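The new dax_mapping() test pairs with dax_writeback_mapping_range() declared alongside it; a sketch of the intended call shape in a writeback path, hedged and simplified from how a filesystem would actually wire it up:

	if (dax_mapping(mapping)) {
		ret = dax_writeback_mapping_range(mapping, start, end);
		if (ret)
			return ret;
	}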
*/ }; +#define HISTORY_UUIDS MAX_PEERS + enum drbd_timeout_flag { UT_DEFAULT = 0, UT_DEGRADED = 1, UT_PEER_OUTDATED = 2, }; +enum drbd_notification_type { + NOTIFY_EXISTS, + NOTIFY_CREATE, + NOTIFY_CHANGE, + NOTIFY_DESTROY, + NOTIFY_CALL, + NOTIFY_RESPONSE, + + NOTIFY_CONTINUES = 0x8000, + NOTIFY_FLAGS = NOTIFY_CONTINUES, +}; + #define UUID_JUST_CREATED ((__u64)4) +enum write_ordering_e { + WO_NONE, + WO_DRAIN_IO, + WO_BDEV_FLUSH, + WO_BIO_BARRIER +}; + /* magic numbers used in meta data and network packets */ #define DRBD_MAGIC 0x83740267 #define DRBD_MAGIC_BIG 0x835a diff --git a/include/linux/drbd_genl.h b/include/linux/drbd_genl.h index 7b131ed8f9c6..2d0e5ad5de9d 100644 --- a/include/linux/drbd_genl.h +++ b/include/linux/drbd_genl.h @@ -250,6 +250,76 @@ GENL_struct(DRBD_NLA_DETACH_PARMS, 13, detach_parms, __flg_field(1, DRBD_GENLA_F_MANDATORY, force_detach) ) +GENL_struct(DRBD_NLA_RESOURCE_INFO, 15, resource_info, + __u32_field(1, 0, res_role) + __flg_field(2, 0, res_susp) + __flg_field(3, 0, res_susp_nod) + __flg_field(4, 0, res_susp_fen) + /* __flg_field(5, 0, res_weak) */ +) + +GENL_struct(DRBD_NLA_DEVICE_INFO, 16, device_info, + __u32_field(1, 0, dev_disk_state) +) + +GENL_struct(DRBD_NLA_CONNECTION_INFO, 17, connection_info, + __u32_field(1, 0, conn_connection_state) + __u32_field(2, 0, conn_role) +) + +GENL_struct(DRBD_NLA_PEER_DEVICE_INFO, 18, peer_device_info, + __u32_field(1, 0, peer_repl_state) + __u32_field(2, 0, peer_disk_state) + __u32_field(3, 0, peer_resync_susp_user) + __u32_field(4, 0, peer_resync_susp_peer) + __u32_field(5, 0, peer_resync_susp_dependency) +) + +GENL_struct(DRBD_NLA_RESOURCE_STATISTICS, 19, resource_statistics, + __u32_field(1, 0, res_stat_write_ordering) +) + +GENL_struct(DRBD_NLA_DEVICE_STATISTICS, 20, device_statistics, + __u64_field(1, 0, dev_size) /* (sectors) */ + __u64_field(2, 0, dev_read) /* (sectors) */ + __u64_field(3, 0, dev_write) /* (sectors) */ + __u64_field(4, 0, dev_al_writes) /* activity log writes (count) */ + __u64_field(5, 0, dev_bm_writes) /* bitmap writes (count) */ + __u32_field(6, 0, dev_upper_pending) /* application requests in progress */ + __u32_field(7, 0, dev_lower_pending) /* backing device requests in progress */ + __flg_field(8, 0, dev_upper_blocked) + __flg_field(9, 0, dev_lower_blocked) + __flg_field(10, 0, dev_al_suspended) /* activity log suspended */ + __u64_field(11, 0, dev_exposed_data_uuid) + __u64_field(12, 0, dev_current_uuid) + __u32_field(13, 0, dev_disk_flags) + __bin_field(14, 0, history_uuids, HISTORY_UUIDS * sizeof(__u64)) +) + +GENL_struct(DRBD_NLA_CONNECTION_STATISTICS, 21, connection_statistics, + __flg_field(1, 0, conn_congested) +) + +GENL_struct(DRBD_NLA_PEER_DEVICE_STATISTICS, 22, peer_device_statistics, + __u64_field(1, 0, peer_dev_received) /* sectors */ + __u64_field(2, 0, peer_dev_sent) /* sectors */ + __u32_field(3, 0, peer_dev_pending) /* number of requests */ + __u32_field(4, 0, peer_dev_unacked) /* number of requests */ + __u64_field(5, 0, peer_dev_out_of_sync) /* sectors */ + __u64_field(6, 0, peer_dev_resync_failed) /* sectors */ + __u64_field(7, 0, peer_dev_bitmap_uuid) + __u32_field(9, 0, peer_dev_flags) +) + +GENL_struct(DRBD_NLA_NOTIFICATION_HEADER, 23, drbd_notification_header, + __u32_field(1, DRBD_GENLA_F_MANDATORY, nh_type) +) + +GENL_struct(DRBD_NLA_HELPER, 24, drbd_helper_info, + __str_field(1, DRBD_GENLA_F_MANDATORY, helper_name, 32) + __u32_field(2, DRBD_GENLA_F_MANDATORY, helper_status) +) + /* * Notifications and commands (genlmsghdr->cmd) */ @@ -382,3 +452,82 @@ 
GENL_op(DRBD_ADM_GET_TIMEOUT_TYPE, 26, GENL_doit(drbd_adm_get_timeout_type), GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) GENL_op(DRBD_ADM_DOWN, 27, GENL_doit(drbd_adm_down), GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED)) + +GENL_op(DRBD_ADM_GET_RESOURCES, 30, + GENL_op_init( + .dumpit = drbd_adm_dump_resources, + ), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_RESOURCE_INFO, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_RESOURCE_STATISTICS, DRBD_GENLA_F_MANDATORY)) + +GENL_op(DRBD_ADM_GET_DEVICES, 31, + GENL_op_init( + .dumpit = drbd_adm_dump_devices, + .done = drbd_adm_dump_devices_done, + ), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_DEVICE_INFO, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_DEVICE_STATISTICS, DRBD_GENLA_F_MANDATORY)) + +GENL_op(DRBD_ADM_GET_CONNECTIONS, 32, + GENL_op_init( + .dumpit = drbd_adm_dump_connections, + .done = drbd_adm_dump_connections_done, + ), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_CONNECTION_INFO, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_CONNECTION_STATISTICS, DRBD_GENLA_F_MANDATORY)) + +GENL_op(DRBD_ADM_GET_PEER_DEVICES, 33, + GENL_op_init( + .dumpit = drbd_adm_dump_peer_devices, + .done = drbd_adm_dump_peer_devices_done, + ), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_PEER_DEVICE_INFO, DRBD_GENLA_F_MANDATORY) + GENL_tla_expected(DRBD_NLA_PEER_DEVICE_STATISTICS, DRBD_GENLA_F_MANDATORY)) + +GENL_notification( + DRBD_RESOURCE_STATE, 34, events, + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_RESOURCE_INFO, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_RESOURCE_STATISTICS, DRBD_F_REQUIRED)) + +GENL_notification( + DRBD_DEVICE_STATE, 35, events, + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_DEVICE_INFO, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_DEVICE_STATISTICS, DRBD_F_REQUIRED)) + +GENL_notification( + DRBD_CONNECTION_STATE, 36, events, + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_CONNECTION_INFO, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_CONNECTION_STATISTICS, DRBD_F_REQUIRED)) + +GENL_notification( + DRBD_PEER_DEVICE_STATE, 37, events, + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_PEER_DEVICE_INFO, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_PEER_DEVICE_STATISTICS, DRBD_F_REQUIRED)) + +GENL_op( + DRBD_ADM_GET_INITIAL_STATE, 38, + GENL_op_init( + .dumpit = drbd_adm_get_initial_state, + ), + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_GENLA_F_MANDATORY)) + +GENL_notification( + DRBD_HELPER, 40, events, + GENL_tla_expected(DRBD_NLA_CFG_CONTEXT, DRBD_F_REQUIRED) + GENL_tla_expected(DRBD_NLA_HELPER, DRBD_F_REQUIRED)) + +GENL_notification( + DRBD_INITIAL_STATE_DONE, 41, events, + GENL_tla_expected(DRBD_NLA_NOTIFICATION_HEADER, DRBD_F_REQUIRED)) diff --git a/include/linux/fs.h b/include/linux/fs.h index eb73d74ed992..1a2046275cdf 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -433,7 +433,8 @@ struct address_space { struct rw_semaphore i_mmap_rwsem; /* 
protect tree, count, list */ /* Protected by tree_lock together with the radix tree */ unsigned long nrpages; /* number of total pages */ - unsigned long nrshadows; /* number of shadow entries */ + /* number of shadow or DAX exceptional entries */ + unsigned long nrexceptional; pgoff_t writeback_index;/* writeback starts here */ const struct address_space_operations *a_ops; /* methods */ unsigned long flags; /* error bits/gfp mask */ @@ -714,6 +715,31 @@ enum inode_i_mutex_lock_class I_MUTEX_PARENT2, }; +static inline void inode_lock(struct inode *inode) +{ + mutex_lock(&inode->i_mutex); +} + +static inline void inode_unlock(struct inode *inode) +{ + mutex_unlock(&inode->i_mutex); +} + +static inline int inode_trylock(struct inode *inode) +{ + return mutex_trylock(&inode->i_mutex); +} + +static inline int inode_is_locked(struct inode *inode) +{ + return mutex_is_locked(&inode->i_mutex); +} + +static inline void inode_lock_nested(struct inode *inode, unsigned subclass) +{ + mutex_lock_nested(&inode->i_mutex, subclass); +} + void lock_two_nondirectories(struct inode *, struct inode*); void unlock_two_nondirectories(struct inode *, struct inode*); @@ -3047,8 +3073,8 @@ static inline bool dir_emit_dots(struct file *file, struct dir_context *ctx) } static inline bool dir_relax(struct inode *inode) { - mutex_unlock(&inode->i_mutex); - mutex_lock(&inode->i_mutex); + inode_unlock(inode); + inode_lock(inode); return !IS_DEADDIR(inode); } diff --git a/include/linux/huge_mm.h b/include/linux/huge_mm.h index cfe81e10bd54..459fd25b378e 100644 --- a/include/linux/huge_mm.h +++ b/include/linux/huge_mm.h @@ -120,15 +120,15 @@ extern void vma_adjust_trans_huge(struct vm_area_struct *vma, unsigned long start, unsigned long end, long adjust_next); -extern bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, - spinlock_t **ptl); +extern spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, + struct vm_area_struct *vma); /* mmap_sem must be held on entry */ -static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, - spinlock_t **ptl) +static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, + struct vm_area_struct *vma) { VM_BUG_ON_VMA(!rwsem_is_locked(&vma->vm_mm->mmap_sem), vma); if (pmd_trans_huge(*pmd) || pmd_devmap(*pmd)) - return __pmd_trans_huge_lock(pmd, vma, ptl); + return __pmd_trans_huge_lock(pmd, vma); else return false; } @@ -190,10 +190,10 @@ static inline void vma_adjust_trans_huge(struct vm_area_struct *vma, long adjust_next) { } -static inline bool pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, - spinlock_t **ptl) +static inline spinlock_t *pmd_trans_huge_lock(pmd_t *pmd, + struct vm_area_struct *vma) { - return false; + return NULL; } static inline int do_huge_pmd_numa_page(struct mm_struct *mm, struct vm_area_struct *vma, diff --git a/include/linux/idr.h b/include/linux/idr.h index 013fd9bc4cb6..083d61e92706 100644 --- a/include/linux/idr.h +++ b/include/linux/idr.h @@ -135,6 +135,20 @@ static inline void *idr_find(struct idr *idr, int id) #define idr_for_each_entry(idp, entry, id) \ for (id = 0; ((entry) = idr_get_next(idp, &(id))) != NULL; ++id) +/** + * idr_for_each_entry_continue - continue iteration over an idr's elements of a given type + * @idp: idr handle + * @entry: the type * to use as cursor + * @id: id entry's key + * + * Continue to iterate over list of given type, continuing after + * the current position.
+ */ +#define idr_for_each_entry_continue(idp, entry, id) \ + for ((entry) = idr_get_next((idp), &(id)); \ + entry; \ + ++id, (entry) = idr_get_next((idp), &(id))) + /* * IDA - IDR based id allocator, use when translation from id to * pointer isn't necessary. diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index cb30edbfe9fc..0e95fcc75b2a 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -413,7 +413,7 @@ enum NET_TX_SOFTIRQ, NET_RX_SOFTIRQ, BLOCK_SOFTIRQ, - BLOCK_IOPOLL_SOFTIRQ, + IRQ_POLL_SOFTIRQ, TASKLET_SOFTIRQ, SCHED_SOFTIRQ, HRTIMER_SOFTIRQ, /* Unused, but kept as tools rely on the diff --git a/include/linux/irq_poll.h b/include/linux/irq_poll.h new file mode 100644 index 000000000000..3e8c1b8fb9be --- /dev/null +++ b/include/linux/irq_poll.h @@ -0,0 +1,25 @@ +#ifndef IRQ_POLL_H +#define IRQ_POLL_H + +struct irq_poll; +typedef int (irq_poll_fn)(struct irq_poll *, int); + +struct irq_poll { + struct list_head list; + unsigned long state; + int weight; + irq_poll_fn *poll; +}; + +enum { + IRQ_POLL_F_SCHED = 0, + IRQ_POLL_F_DISABLE = 1, +}; + +extern void irq_poll_sched(struct irq_poll *); +extern void irq_poll_init(struct irq_poll *, int, irq_poll_fn *); +extern void irq_poll_complete(struct irq_poll *); +extern void irq_poll_enable(struct irq_poll *); +extern void irq_poll_disable(struct irq_poll *); + +#endif diff --git a/include/linux/lightnvm.h b/include/linux/lightnvm.h index 034117b3be5f..d6750111e48e 100644 --- a/include/linux/lightnvm.h +++ b/include/linux/lightnvm.h @@ -1,6 +1,8 @@ #ifndef NVM_H #define NVM_H +#include <linux/types.h> + enum { NVM_IO_OK = 0, NVM_IO_REQUEUE = 1, @@ -11,12 +13,74 @@ enum { NVM_IOTYPE_GC = 1, }; +#define NVM_BLK_BITS (16) +#define NVM_PG_BITS (16) +#define NVM_SEC_BITS (8) +#define NVM_PL_BITS (8) +#define NVM_LUN_BITS (8) +#define NVM_CH_BITS (8) + +struct ppa_addr { + /* Generic structure for all addresses */ + union { + struct { + u64 blk : NVM_BLK_BITS; + u64 pg : NVM_PG_BITS; + u64 sec : NVM_SEC_BITS; + u64 pl : NVM_PL_BITS; + u64 lun : NVM_LUN_BITS; + u64 ch : NVM_CH_BITS; + } g; + + u64 ppa; + }; +}; + +struct nvm_rq; +struct nvm_id; +struct nvm_dev; + +typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *); +typedef int (nvm_bb_update_fn)(struct ppa_addr, int, u8 *, void *); +typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *); +typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32, + nvm_l2p_update_fn *, void *); +typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, int, + nvm_bb_update_fn *, void *); +typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct nvm_rq *, int); +typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); +typedef int (nvm_erase_blk_fn)(struct nvm_dev *, struct nvm_rq *); +typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *); +typedef void (nvm_destroy_dma_pool_fn)(void *); +typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, + dma_addr_t *); +typedef void (nvm_dev_dma_free_fn)(void *, void*, dma_addr_t); + +struct nvm_dev_ops { + nvm_id_fn *identity; + nvm_get_l2p_tbl_fn *get_l2p_tbl; + nvm_op_bb_tbl_fn *get_bb_tbl; + nvm_op_set_bb_fn *set_bb_tbl; + + nvm_submit_io_fn *submit_io; + nvm_erase_blk_fn *erase_block; + + nvm_create_dma_pool_fn *create_dma_pool; + nvm_destroy_dma_pool_fn *destroy_dma_pool; + nvm_dev_dma_alloc_fn *dev_dma_alloc; + nvm_dev_dma_free_fn *dev_dma_free; + + unsigned int max_phys_sect; +}; + + + #ifdef CONFIG_NVM #include <linux/blkdev.h> -#include <linux/types.h> #include 
<linux/file.h> #include <linux/dmapool.h> +#include <uapi/linux/lightnvm.h> enum { /* HW Responsibilities */ @@ -58,8 +122,29 @@ enum { /* Block Types */ NVM_BLK_T_FREE = 0x0, NVM_BLK_T_BAD = 0x1, - NVM_BLK_T_DEV = 0x2, - NVM_BLK_T_HOST = 0x4, + NVM_BLK_T_GRWN_BAD = 0x2, + NVM_BLK_T_DEV = 0x4, + NVM_BLK_T_HOST = 0x8, + + /* Memory capabilities */ + NVM_ID_CAP_SLC = 0x1, + NVM_ID_CAP_CMD_SUSPEND = 0x2, + NVM_ID_CAP_SCRAMBLE = 0x4, + NVM_ID_CAP_ENCRYPT = 0x8, + + /* Memory types */ + NVM_ID_FMTYPE_SLC = 0, + NVM_ID_FMTYPE_MLC = 1, +}; + +struct nvm_id_lp_mlc { + u16 num_pairs; + u8 pairs[886]; +}; + +struct nvm_id_lp_tbl { + __u8 id[8]; + struct nvm_id_lp_mlc mlc; }; struct nvm_id_group { @@ -82,6 +167,8 @@ struct nvm_id_group { u32 mpos; u32 mccap; u16 cpar; + + struct nvm_id_lp_tbl lptbl; }; struct nvm_addr_format { @@ -125,28 +212,8 @@ struct nvm_tgt_instance { #define NVM_VERSION_MINOR 0 #define NVM_VERSION_PATCH 0 -#define NVM_BLK_BITS (16) -#define NVM_PG_BITS (16) -#define NVM_SEC_BITS (8) -#define NVM_PL_BITS (8) -#define NVM_LUN_BITS (8) -#define NVM_CH_BITS (8) - -struct ppa_addr { - /* Generic structure for all addresses */ - union { - struct { - u64 blk : NVM_BLK_BITS; - u64 pg : NVM_PG_BITS; - u64 sec : NVM_SEC_BITS; - u64 pl : NVM_PL_BITS; - u64 lun : NVM_LUN_BITS; - u64 ch : NVM_CH_BITS; - } g; - - u64 ppa; - }; -}; +struct nvm_rq; +typedef void (nvm_end_io_fn)(struct nvm_rq *); struct nvm_rq { struct nvm_tgt_instance *ins; @@ -164,9 +231,14 @@ struct nvm_rq { void *metadata; dma_addr_t dma_metadata; + struct completion *wait; + nvm_end_io_fn *end_io; + uint8_t opcode; uint16_t nr_pages; uint16_t flags; + + int error; }; static inline struct nvm_rq *nvm_rq_from_pdu(void *pdu) @@ -181,51 +253,31 @@ static inline void *nvm_rq_to_pdu(struct nvm_rq *rqdata) struct nvm_block; -typedef int (nvm_l2p_update_fn)(u64, u32, __le64 *, void *); -typedef int (nvm_bb_update_fn)(struct ppa_addr, int, u8 *, void *); -typedef int (nvm_id_fn)(struct nvm_dev *, struct nvm_id *); -typedef int (nvm_get_l2p_tbl_fn)(struct nvm_dev *, u64, u32, - nvm_l2p_update_fn *, void *); -typedef int (nvm_op_bb_tbl_fn)(struct nvm_dev *, struct ppa_addr, int, - nvm_bb_update_fn *, void *); -typedef int (nvm_op_set_bb_fn)(struct nvm_dev *, struct nvm_rq *, int); -typedef int (nvm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); -typedef int (nvm_erase_blk_fn)(struct nvm_dev *, struct nvm_rq *); -typedef void *(nvm_create_dma_pool_fn)(struct nvm_dev *, char *); -typedef void (nvm_destroy_dma_pool_fn)(void *); -typedef void *(nvm_dev_dma_alloc_fn)(struct nvm_dev *, void *, gfp_t, - dma_addr_t *); -typedef void (nvm_dev_dma_free_fn)(void *, void*, dma_addr_t); - -struct nvm_dev_ops { - nvm_id_fn *identity; - nvm_get_l2p_tbl_fn *get_l2p_tbl; - nvm_op_bb_tbl_fn *get_bb_tbl; - nvm_op_set_bb_fn *set_bb_tbl; - - nvm_submit_io_fn *submit_io; - nvm_erase_blk_fn *erase_block; - - nvm_create_dma_pool_fn *create_dma_pool; - nvm_destroy_dma_pool_fn *destroy_dma_pool; - nvm_dev_dma_alloc_fn *dev_dma_alloc; - nvm_dev_dma_free_fn *dev_dma_free; - - unsigned int max_phys_sect; -}; - struct nvm_lun { int id; int lun_id; int chnl_id; - unsigned int nr_inuse_blocks; /* Number of used blocks */ + /* It is up to the target to mark blocks as closed. 
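The ppa_addr union above packs channel, LUN, plane, sector, page and block into a single u64 so physical addresses can be passed by value and compared cheaply. A sketch using the field widths defined above:

	struct ppa_addr ppa = { .ppa = 0 };

	ppa.g.ch  = 1;		/* channel */
	ppa.g.lun = 2;
	ppa.g.blk = 511;	/* fits in NVM_BLK_BITS (16) */
	ppa.g.pg  = 63;
	/* the whole address is now available as the scalar ppa.ppa */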
If the target does + * not do it, all blocks are marked as open, and nr_open_blocks + * represents the number of blocks in use + */ + unsigned int nr_open_blocks; /* Number of used, writable blocks */ + unsigned int nr_closed_blocks; /* Number of used, read-only blocks */ unsigned int nr_free_blocks; /* Number of unused blocks */ unsigned int nr_bad_blocks; /* Number of bad blocks */ - struct nvm_block *blocks; spinlock_t lock; + + struct nvm_block *blocks; +}; + +enum { + NVM_BLK_ST_FREE = 0x1, /* Free block */ + NVM_BLK_ST_OPEN = 0x2, /* Open block - read-write */ + NVM_BLK_ST_CLOSED = 0x4, /* Closed block - read-only */ + NVM_BLK_ST_BAD = 0x8, /* Bad block */ }; struct nvm_block { @@ -234,7 +286,16 @@ struct nvm_block { unsigned long id; void *priv; - int type; + int state; +}; + +/* system block cpu representation */ +struct nvm_sb_info { + unsigned long seqnr; + unsigned long erase_cnt; + unsigned int version; + char mmtype[NVM_MMTYPE_LEN]; + struct ppa_addr fs_ppa; }; struct nvm_dev { @@ -247,6 +308,9 @@ struct nvm_dev { struct nvmm_type *mt; void *mp; + /* System blocks */ + struct nvm_sb_info sb; + /* Device information */ int nr_chnls; int nr_planes; @@ -256,6 +320,7 @@ struct nvm_dev { int blks_per_lun; int sec_size; int oob_size; + int mccap; struct nvm_addr_format ppaf; /* Calculated/Cached values. These do not reflect the actual usable @@ -268,6 +333,10 @@ struct nvm_dev { int sec_per_blk; int sec_per_lun; + /* lower page table */ + int lps_per_blk; + int *lptbl; + unsigned long total_pages; unsigned long total_blocks; int nr_luns; @@ -280,6 +349,8 @@ struct nvm_dev { /* Backend device */ struct request_queue *q; char name[DISK_NAME_LEN]; + + struct mutex mlock; }; static inline struct ppa_addr generic_to_dev_addr(struct nvm_dev *dev, @@ -345,9 +416,13 @@ static inline struct ppa_addr block_to_ppa(struct nvm_dev *dev, return ppa; } +static inline int ppa_to_slc(struct nvm_dev *dev, int slc_pg) +{ + return dev->lptbl[slc_pg]; +} + typedef blk_qc_t (nvm_tgt_make_rq_fn)(struct request_queue *, struct bio *); typedef sector_t (nvm_tgt_capacity_fn)(void *); -typedef int (nvm_tgt_end_io_fn)(struct nvm_rq *, int); typedef void *(nvm_tgt_init_fn)(struct nvm_dev *, struct gendisk *, int, int); typedef void (nvm_tgt_exit_fn)(void *); @@ -358,7 +433,7 @@ struct nvm_tgt_type { /* target entry points */ nvm_tgt_make_rq_fn *make_rq; nvm_tgt_capacity_fn *capacity; - nvm_tgt_end_io_fn *end_io; + nvm_end_io_fn *end_io; /* module-specific init/teardown */ nvm_tgt_init_fn *init; @@ -383,7 +458,6 @@ typedef int (nvmm_open_blk_fn)(struct nvm_dev *, struct nvm_block *); typedef int (nvmm_close_blk_fn)(struct nvm_dev *, struct nvm_block *); typedef void (nvmm_flush_blk_fn)(struct nvm_dev *, struct nvm_block *); typedef int (nvmm_submit_io_fn)(struct nvm_dev *, struct nvm_rq *); -typedef int (nvmm_end_io_fn)(struct nvm_rq *, int); typedef int (nvmm_erase_blk_fn)(struct nvm_dev *, struct nvm_block *, unsigned long); typedef struct nvm_lun *(nvmm_get_lun_fn)(struct nvm_dev *, int); @@ -397,6 +471,8 @@ struct nvmm_type { nvmm_unregister_fn *unregister_mgr; /* Block administration callbacks */ + nvmm_get_blk_fn *get_blk_unlocked; + nvmm_put_blk_fn *put_blk_unlocked; nvmm_get_blk_fn *get_blk; nvmm_put_blk_fn *put_blk; nvmm_open_blk_fn *open_blk; @@ -404,7 +480,6 @@ struct nvmm_type { nvmm_flush_blk_fn *flush_blk; nvmm_submit_io_fn *submit_io; - nvmm_end_io_fn *end_io; nvmm_erase_blk_fn *erase_blk; /* Configuration management */ @@ -418,6 +493,10 @@ struct nvmm_type { extern int nvm_register_mgr(struct 
nvmm_type *); extern void nvm_unregister_mgr(struct nvmm_type *); +extern struct nvm_block *nvm_get_blk_unlocked(struct nvm_dev *, + struct nvm_lun *, unsigned long); +extern void nvm_put_blk_unlocked(struct nvm_dev *, struct nvm_block *); + extern struct nvm_block *nvm_get_blk(struct nvm_dev *, struct nvm_lun *, unsigned long); extern void nvm_put_blk(struct nvm_dev *, struct nvm_block *); @@ -427,7 +506,36 @@ extern int nvm_register(struct request_queue *, char *, extern void nvm_unregister(char *); extern int nvm_submit_io(struct nvm_dev *, struct nvm_rq *); +extern void nvm_generic_to_addr_mode(struct nvm_dev *, struct nvm_rq *); +extern void nvm_addr_to_generic_mode(struct nvm_dev *, struct nvm_rq *); +extern int nvm_set_rqd_ppalist(struct nvm_dev *, struct nvm_rq *, + struct ppa_addr *, int); +extern void nvm_free_rqd_ppalist(struct nvm_dev *, struct nvm_rq *); +extern int nvm_erase_ppa(struct nvm_dev *, struct ppa_addr *, int); extern int nvm_erase_blk(struct nvm_dev *, struct nvm_block *); +extern void nvm_end_io(struct nvm_rq *, int); +extern int nvm_submit_ppa(struct nvm_dev *, struct ppa_addr *, int, int, int, + void *, int); + +/* sysblk.c */ +#define NVM_SYSBLK_MAGIC 0x4E564D53 /* "NVMS" */ + +/* system block on disk representation */ +struct nvm_system_block { + __be32 magic; /* magic signature */ + __be32 seqnr; /* sequence number */ + __be32 erase_cnt; /* erase count */ + __be16 version; /* version number */ + u8 mmtype[NVM_MMTYPE_LEN]; /* media manager name */ + __be64 fs_ppa; /* PPA for media manager + * superblock */ +}; + +extern int nvm_get_sysblock(struct nvm_dev *, struct nvm_sb_info *); +extern int nvm_update_sysblock(struct nvm_dev *, struct nvm_sb_info *); +extern int nvm_init_sysblock(struct nvm_dev *, struct nvm_sb_info *); + +extern int nvm_dev_factory(struct nvm_dev *, int flags); #else /* CONFIG_NVM */ struct nvm_dev_ops; diff --git a/include/linux/lru_cache.h b/include/linux/lru_cache.h index 46262284de47..04fc6e6c7ff0 100644 --- a/include/linux/lru_cache.h +++ b/include/linux/lru_cache.h @@ -264,7 +264,7 @@ extern unsigned int lc_put(struct lru_cache *lc, struct lc_element *e); extern void lc_committed(struct lru_cache *lc); struct seq_file; -extern size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc); +extern void lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc); extern void lc_seq_dump_details(struct seq_file *seq, struct lru_cache *lc, char *utext, void (*detail) (struct seq_file *, struct lc_element *)); diff --git a/include/linux/mlx4/cmd.h b/include/linux/mlx4/cmd.h index 58391f2e0414..116b284bc4ce 100644 --- a/include/linux/mlx4/cmd.h +++ b/include/linux/mlx4/cmd.h @@ -206,7 +206,8 @@ enum { MLX4_SET_PORT_GID_TABLE = 0x5, MLX4_SET_PORT_PRIO2TC = 0x8, MLX4_SET_PORT_SCHEDULER = 0x9, - MLX4_SET_PORT_VXLAN = 0xB + MLX4_SET_PORT_VXLAN = 0xB, + MLX4_SET_PORT_ROCE_ADDR = 0xD }; enum { diff --git a/include/linux/mlx4/device.h b/include/linux/mlx4/device.h index d3133be12d92..430a929f048b 100644 --- a/include/linux/mlx4/device.h +++ b/include/linux/mlx4/device.h @@ -216,6 +216,7 @@ enum { MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN = 1LL << 30, MLX4_DEV_CAP_FLAG2_UPDATE_QP_SRC_CHECK_LB = 1ULL << 31, MLX4_DEV_CAP_FLAG2_LB_SRC_CHK = 1ULL << 32, + MLX4_DEV_CAP_FLAG2_ROCE_V1_V2 = 1ULL << 33, }; enum { @@ -267,12 +268,14 @@ enum { MLX4_BMME_FLAG_TYPE_2_WIN = 1 << 9, MLX4_BMME_FLAG_RESERVED_LKEY = 1 << 10, MLX4_BMME_FLAG_FAST_REG_WR = 1 << 11, + MLX4_BMME_FLAG_ROCE_V1_V2 = 1 << 19, MLX4_BMME_FLAG_PORT_REMAP = 1 << 24, 
MLX4_BMME_FLAG_VSD_INIT2RTR = 1 << 28, }; enum { - MLX4_FLAG_PORT_REMAP = MLX4_BMME_FLAG_PORT_REMAP + MLX4_FLAG_PORT_REMAP = MLX4_BMME_FLAG_PORT_REMAP, + MLX4_FLAG_ROCE_V1_V2 = MLX4_BMME_FLAG_ROCE_V1_V2 }; enum mlx4_event { @@ -979,14 +982,11 @@ struct mlx4_mad_ifc { for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ if ((type) == (dev)->caps.port_mask[(port)]) -#define mlx4_foreach_non_ib_transport_port(port, dev) \ - for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ - if (((dev)->caps.port_mask[port] != MLX4_PORT_TYPE_IB)) - #define mlx4_foreach_ib_transport_port(port, dev) \ - for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ + for ((port) = 1; (port) <= (dev)->caps.num_ports; (port)++) \ if (((dev)->caps.port_mask[port] == MLX4_PORT_TYPE_IB) || \ - ((dev)->caps.flags & MLX4_DEV_CAP_FLAG_IBOE)) + ((dev)->caps.flags & MLX4_DEV_CAP_FLAG_IBOE) || \ + ((dev)->caps.flags2 & MLX4_DEV_CAP_FLAG2_ROCE_V1_V2)) #define MLX4_INVALID_SLAVE_ID 0xFF #define MLX4_SINK_COUNTER_INDEX(dev) (dev->caps.max_counters - 1) @@ -1457,6 +1457,7 @@ int mlx4_get_base_gid_ix(struct mlx4_dev *dev, int slave, int port); int mlx4_config_vxlan_port(struct mlx4_dev *dev, __be16 udp_port); int mlx4_disable_rx_port_check(struct mlx4_dev *dev, bool dis); +int mlx4_config_roce_v2_port(struct mlx4_dev *dev, u16 udp_port); int mlx4_virt2phy_port_map(struct mlx4_dev *dev, u32 port1, u32 port2); int mlx4_vf_smi_enabled(struct mlx4_dev *dev, int slave, int port); int mlx4_vf_get_enable_smi_admin(struct mlx4_dev *dev, int slave, int port); diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h index fe052e234906..587cdf943b52 100644 --- a/include/linux/mlx4/qp.h +++ b/include/linux/mlx4/qp.h @@ -194,7 +194,7 @@ struct mlx4_qp_context { u8 mtu_msgmax; u8 rq_size_stride; u8 sq_size_stride; - u8 rlkey; + u8 rlkey_roce_mode; __be32 usr_page; __be32 local_qpn; __be32 remote_qpn; @@ -204,7 +204,8 @@ struct mlx4_qp_context { u32 reserved1; __be32 next_send_psn; __be32 cqn_send; - u32 reserved2[2]; + __be16 roce_entropy; + __be16 reserved2[3]; __be32 last_acked_psn; __be32 ssn; __be32 params2; @@ -487,4 +488,14 @@ static inline struct mlx4_qp *__mlx4_qp_lookup(struct mlx4_dev *dev, u32 qpn) void mlx4_qp_remove(struct mlx4_dev *dev, struct mlx4_qp *qp); +static inline u16 folded_qp(u32 q) +{ + u16 res; + + res = ((q & 0xff) ^ ((q & 0xff0000) >> 16)) | (q & 0xff00); + return res; +} + +u16 mlx4_qp_roce_entropy(struct mlx4_dev *dev, u32 qpn); + #endif /* MLX4_QP_H */ diff --git a/include/linux/mlx5/device.h b/include/linux/mlx5/device.h index 7be845e30689..987764afa65c 100644 --- a/include/linux/mlx5/device.h +++ b/include/linux/mlx5/device.h @@ -223,6 +223,14 @@ enum { #define MLX5_UMR_MTT_MASK (MLX5_UMR_MTT_ALIGNMENT - 1) #define MLX5_UMR_MTT_MIN_CHUNK_SIZE MLX5_UMR_MTT_ALIGNMENT +#define MLX5_USER_INDEX_LEN (MLX5_FLD_SZ_BYTES(qpc, user_index) * 8) + +enum { + MLX5_EVENT_QUEUE_TYPE_QP = 0, + MLX5_EVENT_QUEUE_TYPE_RQ = 1, + MLX5_EVENT_QUEUE_TYPE_SQ = 2, +}; + enum mlx5_event { MLX5_EVENT_TYPE_COMP = 0x0, @@ -280,6 +288,26 @@ enum { }; enum { + MLX5_ROCE_VERSION_1 = 0, + MLX5_ROCE_VERSION_2 = 2, +}; + +enum { + MLX5_ROCE_VERSION_1_CAP = 1 << MLX5_ROCE_VERSION_1, + MLX5_ROCE_VERSION_2_CAP = 1 << MLX5_ROCE_VERSION_2, +}; + +enum { + MLX5_ROCE_L3_TYPE_IPV4 = 0, + MLX5_ROCE_L3_TYPE_IPV6 = 1, +}; + +enum { + MLX5_ROCE_L3_TYPE_IPV4_CAP = 1 << 1, + MLX5_ROCE_L3_TYPE_IPV6_CAP = 1 << 2, +}; + +enum { MLX5_OPCODE_NOP = 0x00, MLX5_OPCODE_SEND_INVAL = 0x01, MLX5_OPCODE_RDMA_WRITE = 0x08, @@ -446,7 +474,7 @@ 
struct mlx5_init_seg { __be32 rsvd2[880]; __be32 internal_timer_h; __be32 internal_timer_l; - __be32 rsrv3[2]; + __be32 rsvd3[2]; __be32 health_counter; __be32 rsvd4[1019]; __be64 ieee1588_clk; @@ -460,7 +488,9 @@ struct mlx5_eqe_comp { }; struct mlx5_eqe_qp_srq { - __be32 reserved[6]; + __be32 reserved1[5]; + u8 type; + u8 reserved2[3]; __be32 qp_srq_n; }; @@ -651,6 +681,12 @@ enum { }; enum { + MLX5_CQE_ROCE_L3_HEADER_TYPE_GRH = 0x0, + MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV6 = 0x1, + MLX5_CQE_ROCE_L3_HEADER_TYPE_IPV4 = 0x2, +}; + +enum { CQE_L2_OK = 1 << 0, CQE_L3_OK = 1 << 1, CQE_L4_OK = 1 << 2, diff --git a/include/linux/mlx5/driver.h b/include/linux/mlx5/driver.h index 5162f3533042..1e3006dcf35d 100644 --- a/include/linux/mlx5/driver.h +++ b/include/linux/mlx5/driver.h @@ -115,6 +115,11 @@ enum { MLX5_REG_HOST_ENDIANNESS = 0x7004, }; +enum { + MLX5_ATOMIC_OPS_CMP_SWAP = 1 << 0, + MLX5_ATOMIC_OPS_FETCH_ADD = 1 << 1, +}; + enum mlx5_page_fault_resume_flags { MLX5_PAGE_FAULT_RESUME_REQUESTOR = 1 << 0, MLX5_PAGE_FAULT_RESUME_WRITE = 1 << 1, @@ -341,9 +346,11 @@ struct mlx5_core_mr { }; enum mlx5_res_type { - MLX5_RES_QP, - MLX5_RES_SRQ, - MLX5_RES_XSRQ, + MLX5_RES_QP = MLX5_EVENT_QUEUE_TYPE_QP, + MLX5_RES_RQ = MLX5_EVENT_QUEUE_TYPE_RQ, + MLX5_RES_SQ = MLX5_EVENT_QUEUE_TYPE_SQ, + MLX5_RES_SRQ = 3, + MLX5_RES_XSRQ = 4, }; struct mlx5_core_rsc_common { @@ -651,13 +658,6 @@ extern struct workqueue_struct *mlx5_core_wq; .struct_offset_bytes = offsetof(struct ib_unpacked_ ## header, field), \ .struct_size_bytes = sizeof((struct ib_unpacked_ ## header *)0)->field -struct ib_field { - size_t struct_offset_bytes; - size_t struct_size_bytes; - int offset_bits; - int size_bits; -}; - static inline struct mlx5_core_dev *pci2mlx5_core_dev(struct pci_dev *pdev) { return pci_get_drvdata(pdev); diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h index 68d73f82e009..231ab6bcea76 100644 --- a/include/linux/mlx5/mlx5_ifc.h +++ b/include/linux/mlx5/mlx5_ifc.h @@ -67,6 +67,11 @@ enum { }; enum { + MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE = 0x0, + MLX5_SET_HCA_CAP_OP_MOD_ATOMIC = 0x3, +}; + +enum { MLX5_CMD_OP_QUERY_HCA_CAP = 0x100, MLX5_CMD_OP_QUERY_ADAPTER = 0x101, MLX5_CMD_OP_INIT_HCA = 0x102, @@ -573,21 +578,24 @@ enum { struct mlx5_ifc_atomic_caps_bits { u8 reserved_0[0x40]; - u8 atomic_req_endianness[0x1]; - u8 reserved_1[0x1f]; + u8 atomic_req_8B_endianess_mode[0x2]; + u8 reserved_1[0x4]; + u8 supported_atomic_req_8B_endianess_mode_1[0x1]; - u8 reserved_2[0x20]; + u8 reserved_2[0x19]; - u8 reserved_3[0x10]; - u8 atomic_operations[0x10]; + u8 reserved_3[0x20]; u8 reserved_4[0x10]; - u8 atomic_size_qp[0x10]; + u8 atomic_operations[0x10]; u8 reserved_5[0x10]; + u8 atomic_size_qp[0x10]; + + u8 reserved_6[0x10]; u8 atomic_size_dc[0x10]; - u8 reserved_6[0x720]; + u8 reserved_7[0x720]; }; struct mlx5_ifc_odp_cap_bits { @@ -850,7 +858,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 reserved_66[0x8]; u8 log_uar_page_sz[0x10]; - u8 reserved_67[0x40]; + u8 reserved_67[0x20]; + u8 device_frequency_mhz[0x20]; u8 device_frequency_khz[0x20]; u8 reserved_68[0x5f]; u8 cqe_zip[0x1]; @@ -2215,19 +2224,25 @@ struct mlx5_ifc_nic_vport_context_bits { u8 mtu[0x10]; - u8 reserved_3[0x640]; + u8 system_image_guid[0x40]; + u8 port_guid[0x40]; + u8 node_guid[0x40]; + + u8 reserved_3[0x140]; + u8 qkey_violation_counter[0x10]; + u8 reserved_4[0x430]; u8 promisc_uc[0x1]; u8 promisc_mc[0x1]; u8 promisc_all[0x1]; - u8 reserved_4[0x2]; + u8 reserved_5[0x2]; u8 allowed_list_type[0x3]; - u8 reserved_5[0xc]; + u8 
reserved_6[0xc]; u8 allowed_list_size[0xc]; struct mlx5_ifc_mac_address_layout_bits permanent_address; - u8 reserved_6[0x20]; + u8 reserved_7[0x20]; u8 current_uc_mac_address[0][0x40]; }; @@ -4199,6 +4214,13 @@ struct mlx5_ifc_modify_tis_out_bits { u8 reserved_1[0x40]; }; +struct mlx5_ifc_modify_tis_bitmask_bits { + u8 reserved_0[0x20]; + + u8 reserved_1[0x1f]; + u8 prio[0x1]; +}; + struct mlx5_ifc_modify_tis_in_bits { u8 opcode[0x10]; u8 reserved_0[0x10]; @@ -4211,7 +4233,7 @@ struct mlx5_ifc_modify_tis_in_bits { u8 reserved_3[0x20]; - u8 modify_bitmask[0x40]; + struct mlx5_ifc_modify_tis_bitmask_bits bitmask; u8 reserved_4[0x40]; diff --git a/include/linux/mlx5/qp.h b/include/linux/mlx5/qp.h index f079fb1a31f7..5b8c89ffaa58 100644 --- a/include/linux/mlx5/qp.h +++ b/include/linux/mlx5/qp.h @@ -85,7 +85,16 @@ enum mlx5_qp_state { MLX5_QP_STATE_ERR = 6, MLX5_QP_STATE_SQ_DRAINING = 7, MLX5_QP_STATE_SUSPENDED = 9, - MLX5_QP_NUM_STATE + MLX5_QP_NUM_STATE, + MLX5_QP_STATE, + MLX5_QP_STATE_BAD, +}; + +enum { + MLX5_SQ_STATE_NA = MLX5_SQC_STATE_ERR + 1, + MLX5_SQ_NUM_STATE = MLX5_SQ_STATE_NA + 1, + MLX5_RQ_STATE_NA = MLX5_RQC_STATE_ERR + 1, + MLX5_RQ_NUM_STATE = MLX5_RQ_STATE_NA + 1, }; enum { @@ -130,6 +139,9 @@ enum { MLX5_QP_BIT_RWE = 1 << 14, MLX5_QP_BIT_RAE = 1 << 13, MLX5_QP_BIT_RIC = 1 << 4, + MLX5_QP_BIT_CC_SLAVE_RECV = 1 << 2, + MLX5_QP_BIT_CC_SLAVE_SEND = 1 << 1, + MLX5_QP_BIT_CC_MASTER = 1 << 0 }; enum { @@ -248,8 +260,12 @@ struct mlx5_av { __be32 dqp_dct; u8 stat_rate_sl; u8 fl_mlid; - __be16 rlid; - u8 reserved0[10]; + union { + __be16 rlid; + __be16 udp_sport; + }; + u8 reserved0[4]; + u8 rmac[6]; u8 tclass; u8 hop_limit; __be32 grh_gid_fl; @@ -456,11 +472,16 @@ struct mlx5_qp_path { u8 static_rate; u8 hop_limit; __be32 tclass_flowlabel; - u8 rgid[16]; - u8 rsvd1[4]; - u8 sl; + union { + u8 rgid[16]; + u8 rip[16]; + }; + u8 f_dscp_ecn_prio; + u8 ecn_dscp; + __be16 udp_sport; + u8 dci_cfi_prio_sl; u8 port; - u8 rsvd2[6]; + u8 rmac[6]; }; struct mlx5_qp_context { @@ -620,8 +641,7 @@ int mlx5_core_create_qp(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp, struct mlx5_create_qp_mbox_in *in, int inlen); -int mlx5_core_qp_modify(struct mlx5_core_dev *dev, enum mlx5_qp_state cur_state, - enum mlx5_qp_state new_state, +int mlx5_core_qp_modify(struct mlx5_core_dev *dev, u16 operation, struct mlx5_modify_qp_mbox_in *in, int sqd_event, struct mlx5_core_qp *qp); int mlx5_core_destroy_qp(struct mlx5_core_dev *dev, @@ -639,6 +659,14 @@ void mlx5_debug_qp_remove(struct mlx5_core_dev *dev, struct mlx5_core_qp *qp); int mlx5_core_page_fault_resume(struct mlx5_core_dev *dev, u32 qpn, u8 context, int error); #endif +int mlx5_core_create_rq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen, + struct mlx5_core_qp *rq); +void mlx5_core_destroy_rq_tracked(struct mlx5_core_dev *dev, + struct mlx5_core_qp *rq); +int mlx5_core_create_sq_tracked(struct mlx5_core_dev *dev, u32 *in, int inlen, + struct mlx5_core_qp *sq); +void mlx5_core_destroy_sq_tracked(struct mlx5_core_dev *dev, + struct mlx5_core_qp *sq); static inline const char *mlx5_qp_type_str(int type) { diff --git a/drivers/net/ethernet/mellanox/mlx5/core/transobj.h b/include/linux/mlx5/transobj.h index 74cae51436e4..88441f5ece25 100644 --- a/drivers/net/ethernet/mellanox/mlx5/core/transobj.h +++ b/include/linux/mlx5/transobj.h @@ -33,16 +33,20 @@ #ifndef __TRANSOBJ_H__ #define __TRANSOBJ_H__ -int mlx5_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn); -void mlx5_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn); 
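The transobj.h hunk in progress here moves the header from drivers/net/ethernet/mellanox/mlx5/core/ to include/linux/mlx5/ and gives the transport-domain helpers a mlx5_core_ prefix; the removed prototypes above reappear under their new names below. A minimal sketch of a caller updated for the rename (the example_ wrappers are hypothetical, shown only to illustrate the new entry points):

/* Hedged sketch: transport-domain setup/teardown with the renamed API. */
static int example_create_td(struct mlx5_core_dev *mdev, u32 *tdn)
{
	/* was mlx5_alloc_transport_domain() before this rename */
	return mlx5_core_alloc_transport_domain(mdev, tdn);
}

static void example_destroy_td(struct mlx5_core_dev *mdev, u32 tdn)
{
	/* was mlx5_dealloc_transport_domain() before this rename */
	mlx5_core_dealloc_transport_domain(mdev, tdn);
}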
+#include <linux/mlx5/driver.h> + +int mlx5_core_alloc_transport_domain(struct mlx5_core_dev *dev, u32 *tdn); +void mlx5_core_dealloc_transport_domain(struct mlx5_core_dev *dev, u32 tdn); int mlx5_core_create_rq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rqn); int mlx5_core_modify_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *in, int inlen); void mlx5_core_destroy_rq(struct mlx5_core_dev *dev, u32 rqn); +int mlx5_core_query_rq(struct mlx5_core_dev *dev, u32 rqn, u32 *out); int mlx5_core_create_sq(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *sqn); int mlx5_core_modify_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *in, int inlen); void mlx5_core_destroy_sq(struct mlx5_core_dev *dev, u32 sqn); +int mlx5_core_query_sq(struct mlx5_core_dev *dev, u32 sqn, u32 *out); int mlx5_core_create_tir(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *tirn); int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in, @@ -50,6 +54,8 @@ int mlx5_core_modify_tir(struct mlx5_core_dev *dev, u32 tirn, u32 *in, void mlx5_core_destroy_tir(struct mlx5_core_dev *dev, u32 tirn); int mlx5_core_create_tis(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *tisn); +int mlx5_core_modify_tis(struct mlx5_core_dev *dev, u32 tisn, u32 *in, + int inlen); void mlx5_core_destroy_tis(struct mlx5_core_dev *dev, u32 tisn); int mlx5_core_create_rmp(struct mlx5_core_dev *dev, u32 *in, int inlen, u32 *rmpn); diff --git a/include/linux/mlx5/vport.h b/include/linux/mlx5/vport.h index 638f2ca7a527..123771003e68 100644 --- a/include/linux/mlx5/vport.h +++ b/include/linux/mlx5/vport.h @@ -45,6 +45,11 @@ int mlx5_query_nic_vport_mac_address(struct mlx5_core_dev *mdev, u16 vport, u8 *addr); int mlx5_modify_nic_vport_mac_address(struct mlx5_core_dev *dev, u16 vport, u8 *addr); +int mlx5_query_nic_vport_system_image_guid(struct mlx5_core_dev *mdev, + u64 *system_image_guid); +int mlx5_query_nic_vport_node_guid(struct mlx5_core_dev *mdev, u64 *node_guid); +int mlx5_query_nic_vport_qkey_viol_cntr(struct mlx5_core_dev *mdev, + u16 *qkey_viol_cntr); int mlx5_query_hca_vport_gid(struct mlx5_core_dev *dev, u8 other_vport, u8 port_num, u16 vf_num, u16 gid_index, union ib_gid *gid); @@ -85,4 +90,7 @@ int mlx5_modify_nic_vport_vlans(struct mlx5_core_dev *dev, u16 vlans[], int list_size); +int mlx5_nic_vport_enable_roce(struct mlx5_core_dev *mdev); +int mlx5_nic_vport_disable_roce(struct mlx5_core_dev *mdev); + #endif /* __MLX5_VPORT_H__ */ diff --git a/include/linux/nvme.h b/include/linux/nvme.h index 3af5f454c04a..a55986f6fe38 100644 --- a/include/linux/nvme.h +++ b/include/linux/nvme.h @@ -17,20 +17,19 @@ #include <linux/types.h> -struct nvme_bar { - __u64 cap; /* Controller Capabilities */ - __u32 vs; /* Version */ - __u32 intms; /* Interrupt Mask Set */ - __u32 intmc; /* Interrupt Mask Clear */ - __u32 cc; /* Controller Configuration */ - __u32 rsvd1; /* Reserved */ - __u32 csts; /* Controller Status */ - __u32 nssr; /* Subsystem Reset */ - __u32 aqa; /* Admin Queue Attributes */ - __u64 asq; /* Admin SQ Base Address */ - __u64 acq; /* Admin CQ Base Address */ - __u32 cmbloc; /* Controller Memory Buffer Location */ - __u32 cmbsz; /* Controller Memory Buffer Size */ +enum { + NVME_REG_CAP = 0x0000, /* Controller Capabilities */ + NVME_REG_VS = 0x0008, /* Version */ + NVME_REG_INTMS = 0x000c, /* Interrupt Mask Set */ + NVME_REG_INTMC = 0x0010, /* Interrupt Mask Clear */ + NVME_REG_CC = 0x0014, /* Controller Configuration */ + NVME_REG_CSTS = 0x001c, /* Controller Status */ + NVME_REG_NSSR = 0x0020, /* NVM Subsystem Reset */ + NVME_REG_AQA = 0x0024, /* Admin Queue Attributes */ + NVME_REG_ASQ = 0x0028, /* Admin SQ Base Address */ + NVME_REG_ACQ = 0x0030, /* Admin CQ Base Address */ + NVME_REG_CMBLOC = 0x0038, /* Controller Memory Buffer Location */ + NVME_REG_CMBSZ = 0x003c, /* Controller Memory Buffer Size */ };
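With struct nvme_bar gone, drivers address the BAR through these byte offsets directly instead of dereferencing struct fields. A hedged sketch of reading a few controller registers under the new layout, assuming a bar pointer previously obtained from ioremap() (the dump helper itself is illustrative, not part of this patch):

/* Hedged sketch: offset-based register access replacing struct nvme_bar. */
static void example_dump_nvme_regs(void __iomem *bar)
{
	u64 cap;
	u32 vs, csts;

	/* 64-bit CAP read as two 32-bit halves to stay portable */
	cap = readl(bar + NVME_REG_CAP) |
	      ((u64)readl(bar + NVME_REG_CAP + 4) << 32);
	vs = readl(bar + NVME_REG_VS);
	csts = readl(bar + NVME_REG_CSTS);

	pr_info("nvme: MQES %llu, VS %#x, CSTS %#x\n",
		(unsigned long long)NVME_CAP_MQES(cap) + 1, vs, csts);
}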
#define NVME_CAP_MQES(cap) ((cap) & 0xffff) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 4d08b6c33557..92395a0a7dc5 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -361,6 +361,9 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, int tag, unsigned int nr_pages, struct page **pages); +unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, + int tag, unsigned int nr_entries, + struct page **entries, pgoff_t *indices); struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index, unsigned flags); diff --git a/include/linux/pipe_fs_i.h b/include/linux/pipe_fs_i.h index eb8b8ac6df3c..24f5470d3944 100644 --- a/include/linux/pipe_fs_i.h +++ b/include/linux/pipe_fs_i.h @@ -42,6 +42,7 @@ struct pipe_buffer { * @fasync_readers: reader side fasync * @fasync_writers: writer side fasync * @bufs: the circular array of pipe buffers + * @user: the user who created this pipe **/ struct pipe_inode_info { struct mutex mutex; @@ -57,6 +58,7 @@ struct pipe_inode_info { struct fasync_struct *fasync_readers; struct fasync_struct *fasync_writers; struct pipe_buffer *bufs; + struct user_struct *user; }; /* @@ -123,6 +125,8 @@ void pipe_unlock(struct pipe_inode_info *); void pipe_double_lock(struct pipe_inode_info *, struct pipe_inode_info *); extern unsigned int pipe_max_size, pipe_min_size; +extern unsigned long pipe_user_pages_hard; +extern unsigned long pipe_user_pages_soft; int pipe_proc_fn(struct ctl_table *, int, void __user *, size_t *, loff_t *); diff --git a/include/linux/pmem.h b/include/linux/pmem.h index acfea8ce4a07..7c3d11a6b4ad 100644 --- a/include/linux/pmem.h +++ b/include/linux/pmem.h @@ -53,12 +53,18 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size) { BUG(); } + +static inline void arch_wb_cache_pmem(void __pmem *addr, size_t size) +{ + BUG(); +} #endif /* * Architectures that define ARCH_HAS_PMEM_API must provide * implementations for arch_memcpy_to_pmem(), arch_wmb_pmem(), - * arch_copy_from_iter_pmem(), arch_clear_pmem() and arch_has_wmb_pmem(). + * arch_copy_from_iter_pmem(), arch_clear_pmem(), arch_wb_cache_pmem() + * and arch_has_wmb_pmem(). */ static inline void memcpy_from_pmem(void *dst, void __pmem const *src, size_t size) { @@ -178,4 +184,18 @@ static inline void clear_pmem(void __pmem *addr, size_t size) else default_clear_pmem(addr, size); } + +/** + * wb_cache_pmem - write back processor cache for PMEM memory range + * @addr: virtual start address + * @size: number of bytes to write back + * + * Write back the processor cache range starting at 'addr' for 'size' bytes. + * This function requires explicit ordering with a wmb_pmem() call. + */ +static inline void wb_cache_pmem(void __pmem *addr, size_t size) +{ + if (arch_has_pmem_api()) + arch_wb_cache_pmem(addr, size); +} #endif /* __PMEM_H__ */
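As the kernel-doc above stresses, wb_cache_pmem() only starts the write-back and provides no ordering by itself. A plausible caller, sketched under the assumption that ARCH_HAS_PMEM_API is implemented, pairs it with wmb_pmem() before treating the data as durable (the record layout is hypothetical):

/* Hedged sketch: flush several pmem ranges, then order with wmb_pmem(). */
static void example_commit_records(void __pmem *base, size_t rec_size, int nr)
{
	int i;

	for (i = 0; i < nr; i++)	/* start write-back per record */
		wb_cache_pmem(base + i * rec_size, rec_size);

	wmb_pmem();	/* records may be considered durable only after this */
}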
diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 57e7d87d2d4c..7c88ad156a29 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -51,6 +51,15 @@ #define RADIX_TREE_EXCEPTIONAL_ENTRY 2 #define RADIX_TREE_EXCEPTIONAL_SHIFT 2 +#define RADIX_DAX_MASK 0xf +#define RADIX_DAX_SHIFT 4 +#define RADIX_DAX_PTE (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY) +#define RADIX_DAX_PMD (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY) +#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK) +#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT)) +#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \ + RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE))) + static inline int radix_tree_is_indirect_ptr(void *ptr) { return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR); diff --git a/include/linux/sched.h b/include/linux/sched.h index f1e81e128592..a10494a94cc3 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -835,6 +835,7 @@ struct user_struct { #endif unsigned long locked_shm; /* How many pages of mlocked shm ? */ unsigned long unix_inflight; /* How many files in flight in unix sockets */ + atomic_long_t pipe_bufs; /* how many pages are allocated in pipe buffers */ #ifdef CONFIG_KEYS struct key *uid_keyring; /* UID specific keyring */ diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index a43f41cb3c43..4d4780c00d34 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -15,10 +15,7 @@ struct shmem_inode_info { unsigned int seals; /* shmem seals */ unsigned long flags; unsigned long alloced; /* data pages alloced to file */ - union { - unsigned long swapped; /* subtotal assigned to swap */ - char *symlink; /* unswappable short symlink */ - }; + unsigned long swapped; /* subtotal assigned to swap */ struct shared_policy policy; /* NUMA memory alloc policy */ struct list_head swaplist; /* chain of maybes on swap */ struct simple_xattrs xattrs; /* list of xattrs */ diff --git a/include/linux/sunrpc/svc_rdma.h b/include/linux/sunrpc/svc_rdma.h index f869807a0d0e..5322fea6fe4c 100644 --- a/include/linux/sunrpc/svc_rdma.h +++ b/include/linux/sunrpc/svc_rdma.h @@ -51,6 +51,7 @@ /* RPC/RDMA parameters and stats */ extern unsigned int svcrdma_ord; extern unsigned int svcrdma_max_requests; +extern unsigned int svcrdma_max_bc_requests; extern unsigned int svcrdma_max_req_size; extern atomic_t rdma_stat_recv; @@ -69,6 +70,7 @@ extern atomic_t rdma_stat_sq_prod; * completes.
*/ struct svc_rdma_op_ctxt { + struct list_head free; struct svc_rdma_op_ctxt *read_hdr; struct svc_rdma_fastreg_mr *frmr; int hdr_count; @@ -112,6 +114,7 @@ struct svc_rdma_fastreg_mr { struct list_head frmr_list; }; struct svc_rdma_req_map { + struct list_head free; unsigned long count; union { struct kvec sge[RPCSVC_MAXPAGES]; @@ -132,28 +135,32 @@ struct svcxprt_rdma { int sc_max_sge; int sc_max_sge_rd; /* max sge for read target */ - int sc_sq_depth; /* Depth of SQ */ atomic_t sc_sq_count; /* Number of SQ WR on queue */ - - int sc_max_requests; /* Depth of RQ */ + unsigned int sc_sq_depth; /* Depth of SQ */ + unsigned int sc_rq_depth; /* Depth of RQ */ + u32 sc_max_requests; /* Forward credits */ + u32 sc_max_bc_requests;/* Backward credits */ int sc_max_req_size; /* Size of each RQ WR buf */ struct ib_pd *sc_pd; atomic_t sc_dma_used; - atomic_t sc_ctxt_used; + spinlock_t sc_ctxt_lock; + struct list_head sc_ctxts; + int sc_ctxt_used; + spinlock_t sc_map_lock; + struct list_head sc_maps; + struct list_head sc_rq_dto_q; spinlock_t sc_rq_dto_lock; struct ib_qp *sc_qp; struct ib_cq *sc_rq_cq; struct ib_cq *sc_sq_cq; - struct ib_mr *sc_phys_mr; /* MR for server memory */ int (*sc_reader)(struct svcxprt_rdma *, struct svc_rqst *, struct svc_rdma_op_ctxt *, int *, u32 *, u32, u32, u64, bool); u32 sc_dev_caps; /* distilled device caps */ - u32 sc_dma_lkey; /* local dma key */ unsigned int sc_frmr_pg_list_len; struct list_head sc_frmr_q; spinlock_t sc_frmr_q_lock; @@ -179,8 +186,18 @@ struct svcxprt_rdma { #define RPCRDMA_MAX_REQUESTS 32 #define RPCRDMA_MAX_REQ_SIZE 4096 +/* Typical ULP usage of BC requests is NFSv4.1 backchannel. Our + * current NFSv4.1 implementation supports one backchannel slot. + */ +#define RPCRDMA_MAX_BC_REQUESTS 2 + #define RPCSVC_MAXPAYLOAD_RDMA RPCSVC_MAXPAYLOAD +/* svc_rdma_backchannel.c */ +extern int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, + struct rpcrdma_msg *rmsgp, + struct xdr_buf *rcvbuf); + /* svc_rdma_marshal.c */ extern int svc_rdma_xdr_decode_req(struct rpcrdma_msg **, struct svc_rqst *); extern int svc_rdma_xdr_encode_error(struct svcxprt_rdma *, @@ -206,6 +223,8 @@ extern int rdma_read_chunk_frmr(struct svcxprt_rdma *, struct svc_rqst *, u32, u32, u64, bool); /* svc_rdma_sendto.c */ +extern int svc_rdma_map_xdr(struct svcxprt_rdma *, struct xdr_buf *, + struct svc_rdma_req_map *); extern int svc_rdma_sendto(struct svc_rqst *); extern struct rpcrdma_read_chunk * svc_rdma_get_read_chunk(struct rpcrdma_msg *); @@ -214,13 +233,14 @@ extern struct rpcrdma_read_chunk * extern int svc_rdma_send(struct svcxprt_rdma *, struct ib_send_wr *); extern void svc_rdma_send_error(struct svcxprt_rdma *, struct rpcrdma_msg *, enum rpcrdma_errcode); -extern int svc_rdma_post_recv(struct svcxprt_rdma *); +extern int svc_rdma_post_recv(struct svcxprt_rdma *, gfp_t); extern int svc_rdma_create_listen(struct svc_serv *, int, struct sockaddr *); extern struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *); extern void svc_rdma_put_context(struct svc_rdma_op_ctxt *, int); extern void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt); -extern struct svc_rdma_req_map *svc_rdma_get_req_map(void); -extern void svc_rdma_put_req_map(struct svc_rdma_req_map *); +extern struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *); +extern void svc_rdma_put_req_map(struct svcxprt_rdma *, + struct svc_rdma_req_map *); extern struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *); extern void svc_rdma_put_frmr(struct svcxprt_rdma *, struct 
svc_rdma_fastreg_mr *); @@ -234,6 +254,7 @@ extern struct svc_xprt_class svc_rdma_bc_class; #endif /* svc_rdma.c */ +extern struct workqueue_struct *svc_rdma_wq; extern int svc_rdma_init(void); extern void svc_rdma_cleanup(void); diff --git a/include/linux/thermal.h b/include/linux/thermal.h index 613c29bd6baf..e13a1ace50e9 100644 --- a/include/linux/thermal.h +++ b/include/linux/thermal.h @@ -43,6 +43,9 @@ /* Default weight of a bound cooling device */ #define THERMAL_WEIGHT_DEFAULT 0 +/* use a value below 0K to indicate an invalid/uninitialized temperature */ +#define THERMAL_TEMP_INVALID -274000 + /* Unit conversion macros */ #define DECI_KELVIN_TO_CELSIUS(t) ({ \ long _t = (t); \ @@ -167,6 +170,7 @@ struct thermal_attr { * @forced_passive: If > 0, temperature at which to switch on all ACPI * processor cooling devices. Currently only used by the * step-wise governor. + * @need_update: if set to 1, thermal_zone_device_update needs to be invoked. * @ops: operations this &thermal_zone_device supports * @tzp: thermal zone parameters * @governor: pointer to the governor for this thermal zone @@ -194,6 +198,7 @@ struct thermal_zone_device { int emul_temperature; int passive; unsigned int forced_passive; + atomic_t need_update; struct thermal_zone_device_ops *ops; struct thermal_zone_params *tzp; struct thermal_governor *governor; diff --git a/include/rdma/ib_addr.h b/include/rdma/ib_addr.h index 11528591d0d7..c34c9002460c 100644 --- a/include/rdma/ib_addr.h +++ b/include/rdma/ib_addr.h @@ -83,6 +83,8 @@ struct rdma_dev_addr { int bound_dev_if; enum rdma_transport_type transport; struct net *net; + enum rdma_network_type network; + int hoplimit; }; /** @@ -91,8 +93,8 @@ struct rdma_dev_addr { * * The dev_addr->net field must be initialized. */ -int rdma_translate_ip(struct sockaddr *addr, struct rdma_dev_addr *dev_addr, - u16 *vlan_id); +int rdma_translate_ip(const struct sockaddr *addr, + struct rdma_dev_addr *dev_addr, u16 *vlan_id); /** * rdma_resolve_ip - Resolve source and destination IP addresses to @@ -117,6 +119,10 @@ int rdma_resolve_ip(struct rdma_addr_client *client, struct rdma_dev_addr *addr, void *context), void *context); +int rdma_resolve_ip_route(struct sockaddr *src_addr, + const struct sockaddr *dst_addr, + struct rdma_dev_addr *addr); + void rdma_addr_cancel(struct rdma_dev_addr *addr); int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev, @@ -125,8 +131,10 @@ int rdma_copy_addr(struct rdma_dev_addr *dev_addr, struct net_device *dev, int rdma_addr_size(struct sockaddr *addr); int rdma_addr_find_smac_by_sgid(union ib_gid *sgid, u8 *smac, u16 *vlan_id); -int rdma_addr_find_dmac_by_grh(const union ib_gid *sgid, const union ib_gid *dgid, - u8 *smac, u16 *vlan_id, int if_index); +int rdma_addr_find_l2_eth_by_grh(const union ib_gid *sgid, + const union ib_gid *dgid, + u8 *smac, u16 *vlan_id, int *if_index, + int *hoplimit); static inline u16 ib_addr_get_pkey(struct rdma_dev_addr *dev_addr) {
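rdma_addr_find_dmac_by_grh() is replaced by rdma_addr_find_l2_eth_by_grh(), which additionally returns the egress ifindex and hop limit that RoCE v2 needs when building the encapsulating IP header. A hedged sketch of a caller resolving L2 details for a destination GID (the enclosing function and its logging are illustrative):

/* Hedged sketch: resolve Ethernet L2 details for a RoCE destination. */
static int example_resolve_l2(const union ib_gid *sgid,
			      const union ib_gid *dgid)
{
	u8 smac[ETH_ALEN];
	u16 vlan_id = 0xffff;
	int if_index = 0;
	int hoplimit = 0xff;
	int ret;

	ret = rdma_addr_find_l2_eth_by_grh(sgid, dgid, smac, &vlan_id,
					   &if_index, &hoplimit);
	if (ret)
		return ret;

	pr_debug("smac %pM vlan %u ifindex %d hoplimit %d\n",
		 smac, vlan_id, if_index, hoplimit);
	return 0;
}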
diff --git a/include/rdma/ib_cache.h b/include/rdma/ib_cache.h index 269a27cf0a46..e30f19bd4a41 100644 --- a/include/rdma/ib_cache.h +++ b/include/rdma/ib_cache.h @@ -60,6 +60,7 @@ int ib_get_cached_gid(struct ib_device *device, * a specified GID value occurs. * @device: The device to query. * @gid: The GID value to search for. + * @gid_type: The GID type to search for. * @ndev: In RoCE, the net device of the device. NULL means ignore. * @port_num: The port number of the device where the GID value was found. * @index: The index into the cached GID table where the GID was found. This @@ -70,6 +71,7 @@ int ib_get_cached_gid(struct ib_device *device, */ int ib_find_cached_gid(struct ib_device *device, const union ib_gid *gid, + enum ib_gid_type gid_type, struct net_device *ndev, u8 *port_num, u16 *index); @@ -79,6 +81,7 @@ int ib_find_cached_gid(struct ib_device *device, * GID value occurs * @device: The device to query. * @gid: The GID value to search for. + * @gid_type: The GID type to search for. * @port_num: The port number of the device where the GID value should be * searched. * @ndev: In RoCE, the net device of the device. Null means ignore. @@ -90,6 +93,7 @@ int ib_find_cached_gid, */ int ib_find_cached_gid_by_port(struct ib_device *device, const union ib_gid *gid, + enum ib_gid_type gid_type, u8 port_num, struct net_device *ndev, u16 *index); diff --git a/include/rdma/ib_mad.h b/include/rdma/ib_mad.h index ec9b44dd3d80..0ff049bd9ad4 100644 --- a/include/rdma/ib_mad.h +++ b/include/rdma/ib_mad.h @@ -438,6 +438,7 @@ typedef void (*ib_mad_snoop_handler)(struct ib_mad_agent *mad_agent, /** * ib_mad_recv_handler - callback handler for a received MAD. * @mad_agent: MAD agent requesting the received MAD. + * @send_buf: Send buffer if found, else NULL * @mad_recv_wc: Received work completion information on the received MAD. * * MADs received in response to a send request operation will be handed to @@ -447,6 +448,7 @@ typedef void (*ib_mad_snoop_handler)(struct ib_mad_agent *mad_agent, * modify the data referenced by @mad_recv_wc. */ typedef void (*ib_mad_recv_handler)(struct ib_mad_agent *mad_agent, + struct ib_mad_send_buf *send_buf, struct ib_mad_recv_wc *mad_recv_wc); /** diff --git a/include/rdma/ib_pack.h b/include/rdma/ib_pack.h index e99d8f9a4551..0f3daae44bf9 100644 --- a/include/rdma/ib_pack.h +++ b/include/rdma/ib_pack.h @@ -41,6 +41,8 @@ enum { IB_ETH_BYTES = 14, IB_VLAN_BYTES = 4, IB_GRH_BYTES = 40, + IB_IP4_BYTES = 20, + IB_UDP_BYTES = 8, IB_BTH_BYTES = 12, IB_DETH_BYTES = 8 }; @@ -223,6 +225,27 @@ struct ib_unpacked_eth { __be16 type; }; +struct ib_unpacked_ip4 { + u8 ver; + u8 hdr_len; + u8 tos; + __be16 tot_len; + __be16 id; + __be16 frag_off; + u8 ttl; + u8 protocol; + __sum16 check; + __be32 saddr; + __be32 daddr; +}; + +struct ib_unpacked_udp { + __be16 sport; + __be16 dport; + __be16 length; + __be16 csum; +}; + struct ib_unpacked_vlan { __be16 tag; __be16 type; }; @@ -237,6 +260,10 @@ struct ib_ud_header { struct ib_unpacked_vlan vlan; int grh_present; struct ib_unpacked_grh grh; + int ipv4_present; + struct ib_unpacked_ip4 ip4; + int udp_present; + struct ib_unpacked_udp udp; struct ib_unpacked_bth bth; struct ib_unpacked_deth deth; int immediate_present; @@ -253,13 +280,17 @@ void ib_unpack(const struct ib_field *desc, void *buf, void *structure); -void ib_ud_header_init(int payload_bytes, - int lrh_present, - int eth_present, - int vlan_present, - int grh_present, - int immediate_present, - struct ib_ud_header *header); +__sum16 ib_ud_ip4_csum(struct ib_ud_header *header); + +int ib_ud_header_init(int payload_bytes, + int lrh_present, + int eth_present, + int vlan_present, + int grh_present, + int ip_version, + int udp_present, + int immediate_present, + struct ib_ud_header *header); int ib_ud_header_pack(struct ib_ud_header *header, void *buf);
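ib_ud_header_init() now takes ip_version and udp_present and returns an error code, so UD header construction can describe RoCE v2 packets (IPv4 or IPv6 plus UDP) as well as the classic GRH layout. A hedged sketch of initializing an IPv4 RoCE v2 header; the argument choices are illustrative, and a real driver fills the remaining ip4/udp fields from its address-resolution state:

/* Hedged sketch: set up a RoCE v2 IPv4/UDP UD header. */
static int example_init_roce_v2_header(int payload_bytes,
				       struct ib_ud_header *hdr)
{
	int ret;

	ret = ib_ud_header_init(payload_bytes,
				0,	/* lrh_present: none on Ethernet */
				1,	/* eth_present */
				0,	/* vlan_present */
				0,	/* grh_present: IPv4 used instead */
				4,	/* ip_version */
				1,	/* udp_present */
				0,	/* immediate_present */
				hdr);
	if (ret)
		return ret;

	hdr->udp.dport = htons(ROCE_V2_UDP_DPORT);
	/* compute the IPv4 checksum once the other ip4 fields are set */
	hdr->ip4.check = ib_ud_ip4_csum(hdr);
	return 0;
}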
diff --git a/include/rdma/ib_pma.h b/include/rdma/ib_pma.h index a5889f18807b..2f8a65c1fca7 100644 --- a/include/rdma/ib_pma.h +++ b/include/rdma/ib_pma.h @@ -42,6 +42,7 @@ */ #define IB_PMA_CLASS_CAP_ALLPORTSELECT cpu_to_be16(1 << 8) #define IB_PMA_CLASS_CAP_EXT_WIDTH cpu_to_be16(1 << 9) +#define IB_PMA_CLASS_CAP_EXT_WIDTH_NOIETF cpu_to_be16(1 << 10) #define IB_PMA_CLASS_CAP_XMIT_WAIT cpu_to_be16(1 << 12) #define IB_PMA_CLASS_PORT_INFO cpu_to_be16(0x0001) diff --git a/include/rdma/ib_sa.h b/include/rdma/ib_sa.h index 301969552d0a..cdc1c81aa275 100644 --- a/include/rdma/ib_sa.h +++ b/include/rdma/ib_sa.h @@ -160,6 +160,7 @@ struct ib_sa_path_rec { int ifindex; /* ignored in IB */ struct net *net; + enum ib_gid_type gid_type; }; static inline struct net_device *ib_get_ndev_from_path(struct ib_sa_path_rec *rec) @@ -402,6 +403,8 @@ int ib_sa_get_mcmember_rec(struct ib_device *device, u8 port_num, */ int ib_init_ah_from_mcmember(struct ib_device *device, u8 port_num, struct ib_sa_mcmember_rec *rec, + struct net_device *ndev, + enum ib_gid_type gid_type, struct ib_ah_attr *ah_attr); /** diff --git a/include/rdma/ib_verbs.h b/include/rdma/ib_verbs.h index 120da1d7f57e..284b00c8fea4 100644 --- a/include/rdma/ib_verbs.h +++ b/include/rdma/ib_verbs.h @@ -49,13 +49,19 @@ #include <linux/scatterlist.h> #include <linux/workqueue.h> #include <linux/socket.h> +#include <linux/irq_poll.h> #include <uapi/linux/if_ether.h> +#include <net/ipv6.h> +#include <net/ip.h> +#include <linux/string.h> +#include <linux/slab.h> #include <linux/atomic.h> #include <linux/mmu_notifier.h> #include <asm/uaccess.h> extern struct workqueue_struct *ib_wq; +extern struct workqueue_struct *ib_comp_wq; union ib_gid { u8 raw[16]; @@ -67,7 +73,17 @@ union ib_gid { extern union ib_gid zgid; +enum ib_gid_type { + /* If link layer is Ethernet, this is RoCE V1 */ + IB_GID_TYPE_IB = 0, + IB_GID_TYPE_ROCE = 0, + IB_GID_TYPE_ROCE_UDP_ENCAP = 1, + IB_GID_TYPE_SIZE +}; + +#define ROCE_V2_UDP_DPORT 4791 struct ib_gid_attr { + enum ib_gid_type gid_type; struct net_device *ndev; }; @@ -98,6 +114,35 @@ enum rdma_protocol_type { __attribute_const__ enum rdma_transport_type rdma_node_get_transport(enum rdma_node_type node_type); +enum rdma_network_type { + RDMA_NETWORK_IB, + RDMA_NETWORK_ROCE_V1 = RDMA_NETWORK_IB, + RDMA_NETWORK_IPV4, + RDMA_NETWORK_IPV6 +}; + +static inline enum ib_gid_type ib_network_to_gid_type(enum rdma_network_type network_type) +{ + if (network_type == RDMA_NETWORK_IPV4 || + network_type == RDMA_NETWORK_IPV6) + return IB_GID_TYPE_ROCE_UDP_ENCAP; + + /* IB_GID_TYPE_IB same as RDMA_NETWORK_ROCE_V1 */ + return IB_GID_TYPE_IB; +} + +static inline enum rdma_network_type ib_gid_to_network_type(enum ib_gid_type gid_type, + union ib_gid *gid) +{ + if (gid_type == IB_GID_TYPE_IB) + return RDMA_NETWORK_IB; + + if (ipv6_addr_v4mapped((struct in6_addr *)gid)) + return RDMA_NETWORK_IPV4; + else + return RDMA_NETWORK_IPV6; +} + enum rdma_link_layer { IB_LINK_LAYER_UNSPECIFIED, IB_LINK_LAYER_INFINIBAND, @@ -105,24 +150,32 @@ enum rdma_link_layer { }; enum ib_device_cap_flags { - IB_DEVICE_RESIZE_MAX_WR = 1, - IB_DEVICE_BAD_PKEY_CNTR = (1<<1), - IB_DEVICE_BAD_QKEY_CNTR = (1<<2), - IB_DEVICE_RAW_MULTI = (1<<3), - IB_DEVICE_AUTO_PATH_MIG = (1<<4), - IB_DEVICE_CHANGE_PHY_PORT = (1<<5), - IB_DEVICE_UD_AV_PORT_ENFORCE = (1<<6), - IB_DEVICE_CURR_QP_STATE_MOD = (1<<7), - IB_DEVICE_SHUTDOWN_PORT = (1<<8), - IB_DEVICE_INIT_TYPE = (1<<9), - IB_DEVICE_PORT_ACTIVE_EVENT = (1<<10), - IB_DEVICE_SYS_IMAGE_GUID = (1<<11), - IB_DEVICE_RC_RNR_NAK_GEN = (1<<12), - IB_DEVICE_SRQ_RESIZE = (1<<13), - IB_DEVICE_N_NOTIFY_CQ = (1<<14), - IB_DEVICE_LOCAL_DMA_LKEY = (1<<15), - IB_DEVICE_RESERVED = (1<<16), /* old SEND_W_INV */ -
IB_DEVICE_MEM_WINDOW = (1<<17), + IB_DEVICE_RESIZE_MAX_WR = (1 << 0), + IB_DEVICE_BAD_PKEY_CNTR = (1 << 1), + IB_DEVICE_BAD_QKEY_CNTR = (1 << 2), + IB_DEVICE_RAW_MULTI = (1 << 3), + IB_DEVICE_AUTO_PATH_MIG = (1 << 4), + IB_DEVICE_CHANGE_PHY_PORT = (1 << 5), + IB_DEVICE_UD_AV_PORT_ENFORCE = (1 << 6), + IB_DEVICE_CURR_QP_STATE_MOD = (1 << 7), + IB_DEVICE_SHUTDOWN_PORT = (1 << 8), + IB_DEVICE_INIT_TYPE = (1 << 9), + IB_DEVICE_PORT_ACTIVE_EVENT = (1 << 10), + IB_DEVICE_SYS_IMAGE_GUID = (1 << 11), + IB_DEVICE_RC_RNR_NAK_GEN = (1 << 12), + IB_DEVICE_SRQ_RESIZE = (1 << 13), + IB_DEVICE_N_NOTIFY_CQ = (1 << 14), + + /* + * This device supports a per-device lkey or stag that can be + * used without performing a memory registration for the local + * memory. Note that ULPs should never check this flag, but + * should instead use the local_dma_lkey flag in the ib_pd structure, + * which will always contain a usable lkey. + */ + IB_DEVICE_LOCAL_DMA_LKEY = (1 << 15), + IB_DEVICE_RESERVED /* old SEND_W_INV */ = (1 << 16), + IB_DEVICE_MEM_WINDOW = (1 << 17), /* * Devices should set IB_DEVICE_UD_IP_SUM if they support * insertion of UDP and TCP checksum on outgoing UD IPoIB @@ -130,18 +183,35 @@ enum ib_device_cap_flags { * incoming messages. Setting this flag implies that the * IPoIB driver may set NETIF_F_IP_CSUM for datagram mode. */ - IB_DEVICE_UD_IP_CSUM = (1<<18), - IB_DEVICE_UD_TSO = (1<<19), - IB_DEVICE_XRC = (1<<20), - IB_DEVICE_MEM_MGT_EXTENSIONS = (1<<21), - IB_DEVICE_BLOCK_MULTICAST_LOOPBACK = (1<<22), - IB_DEVICE_MEM_WINDOW_TYPE_2A = (1<<23), - IB_DEVICE_MEM_WINDOW_TYPE_2B = (1<<24), - IB_DEVICE_RC_IP_CSUM = (1<<25), - IB_DEVICE_RAW_IP_CSUM = (1<<26), - IB_DEVICE_MANAGED_FLOW_STEERING = (1<<29), - IB_DEVICE_SIGNATURE_HANDOVER = (1<<30), - IB_DEVICE_ON_DEMAND_PAGING = (1<<31), + IB_DEVICE_UD_IP_CSUM = (1 << 18), + IB_DEVICE_UD_TSO = (1 << 19), + IB_DEVICE_XRC = (1 << 20), + + /* + * This device supports the IB "base memory management extension", + * which includes support for fast registrations (IB_WR_REG_MR, + * IB_WR_LOCAL_INV and IB_WR_SEND_WITH_INV verbs). This flag should + * also be set by any iWarp device which must support FRs to comply + * with the iWarp verbs spec. iWarp devices also support the + * IB_WR_RDMA_READ_WITH_INV verb for RDMA READs that invalidate the + * stag. + */ + IB_DEVICE_MEM_MGT_EXTENSIONS = (1 << 21), + IB_DEVICE_BLOCK_MULTICAST_LOOPBACK = (1 << 22), + IB_DEVICE_MEM_WINDOW_TYPE_2A = (1 << 23), + IB_DEVICE_MEM_WINDOW_TYPE_2B = (1 << 24), + IB_DEVICE_RC_IP_CSUM = (1 << 25), + IB_DEVICE_RAW_IP_CSUM = (1 << 26), + /* + * Devices should set IB_DEVICE_CROSS_CHANNEL if they + * support execution of WQEs that involve synchronization + * of I/O operations with a single completion queue managed + * by hardware. + */ + IB_DEVICE_CROSS_CHANNEL = (1 << 27), + IB_DEVICE_MANAGED_FLOW_STEERING = (1 << 29), + IB_DEVICE_SIGNATURE_HANDOVER = (1 << 30), + IB_DEVICE_ON_DEMAND_PAGING = (1 << 31), };
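Because the flag values below bit 27 are unchanged by this renumbering, existing drivers keep working; new code can test the flags against the attribute cache this patch adds to struct ib_device (the attrs member appears further down) instead of calling the removed ib_query_device(). A minimal sketch:

/* Hedged sketch: capability test via the new cached device attributes. */
static bool example_supports_fast_reg(struct ib_device *dev)
{
	return !!(dev->attrs.device_cap_flags &
		  IB_DEVICE_MEM_MGT_EXTENSIONS);
}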
enum ib_signature_prot_cap { @@ -184,6 +254,7 @@ struct ib_odp_caps { enum ib_cq_creation_flags { IB_CQ_FLAGS_TIMESTAMP_COMPLETION = 1 << 0, + IB_CQ_FLAGS_IGNORE_OVERRUN = 1 << 1, }; struct ib_cq_init_attr { @@ -393,6 +464,7 @@ union rdma_protocol_stats { #define RDMA_CORE_CAP_PROT_IB 0x00100000 #define RDMA_CORE_CAP_PROT_ROCE 0x00200000 #define RDMA_CORE_CAP_PROT_IWARP 0x00400000 +#define RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP 0x00800000 #define RDMA_CORE_PORT_IBA_IB (RDMA_CORE_CAP_PROT_IB \ | RDMA_CORE_CAP_IB_MAD \ @@ -405,6 +477,12 @@ union rdma_protocol_stats { | RDMA_CORE_CAP_IB_CM \ | RDMA_CORE_CAP_AF_IB \ | RDMA_CORE_CAP_ETH_AH) +#define RDMA_CORE_PORT_IBA_ROCE_UDP_ENCAP \ + (RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP \ + | RDMA_CORE_CAP_IB_MAD \ + | RDMA_CORE_CAP_IB_CM \ + | RDMA_CORE_CAP_AF_IB \ + | RDMA_CORE_CAP_ETH_AH) #define RDMA_CORE_PORT_IWARP (RDMA_CORE_CAP_PROT_IWARP \ | RDMA_CORE_CAP_IW_CM) #define RDMA_CORE_PORT_INTEL_OPA (RDMA_CORE_PORT_IBA_IB \ @@ -519,6 +597,17 @@ struct ib_grh { union ib_gid dgid; }; +union rdma_network_hdr { + struct ib_grh ibgrh; + struct { + /* The IB spec states that if it's IPv4, the IPv4 header + * is located in the last 20 bytes of the GRH. + */ + u8 reserved[20]; + struct iphdr roce4grh; + }; +}; + enum { IB_MULTICAST_QPN = 0xffffff }; @@ -734,7 +823,6 @@ enum ib_wc_opcode { IB_WC_RDMA_READ, IB_WC_COMP_SWAP, IB_WC_FETCH_ADD, - IB_WC_BIND_MW, IB_WC_LSO, IB_WC_LOCAL_INV, IB_WC_REG_MR, @@ -755,10 +843,14 @@ enum ib_wc_flags { IB_WC_IP_CSUM_OK = (1<<3), IB_WC_WITH_SMAC = (1<<4), IB_WC_WITH_VLAN = (1<<5), + IB_WC_WITH_NETWORK_HDR_TYPE = (1<<6), }; struct ib_wc { - u64 wr_id; + union { + u64 wr_id; + struct ib_cqe *wr_cqe; + }; enum ib_wc_status status; enum ib_wc_opcode opcode; u32 vendor_err; @@ -777,6 +869,7 @@ struct ib_wc { u8 port_num; /* valid only for DR SMPs on switches */ u8 smac[ETH_ALEN]; u16 vlan_id; + u8 network_hdr_type; }; enum ib_cq_notify_flags { @@ -866,6 +959,9 @@ enum ib_qp_type { enum ib_qp_create_flags { IB_QP_CREATE_IPOIB_UD_LSO = 1 << 0, IB_QP_CREATE_BLOCK_MULTICAST_LOOPBACK = 1 << 1, + IB_QP_CREATE_CROSS_CHANNEL = 1 << 2, + IB_QP_CREATE_MANAGED_SEND = 1 << 3, + IB_QP_CREATE_MANAGED_RECV = 1 << 4, IB_QP_CREATE_NETIF_QP = 1 << 5, IB_QP_CREATE_SIGNATURE_EN = 1 << 6, IB_QP_CREATE_USE_GFP_NOIO = 1 << 7, @@ -1027,7 +1123,6 @@ enum ib_wr_opcode { IB_WR_REG_MR, IB_WR_MASKED_ATOMIC_CMP_AND_SWP, IB_WR_MASKED_ATOMIC_FETCH_AND_ADD, - IB_WR_BIND_MW, IB_WR_REG_SIG_MR, /* reserve values for low level drivers' internal use. * These values will not be used at all in the ib core layer. @@ -1062,26 +1157,16 @@ struct ib_sge { u32 lkey; }; -/** - * struct ib_mw_bind_info - Parameters for a memory window bind operation. - * @mr: A memory region to bind the memory window to. - * @addr: The address where the memory window should begin. - * @length: The length of the memory window, in bytes. - * @mw_access_flags: Access flags from enum ib_access_flags for the window. - * - * This struct contains the shared parameters for type 1 and type 2 - * memory window bind operations.
- */ -struct ib_mw_bind_info { - struct ib_mr *mr; - u64 addr; - u64 length; - int mw_access_flags; +struct ib_cqe { + void (*done)(struct ib_cq *cq, struct ib_wc *wc); }; struct ib_send_wr { struct ib_send_wr *next; - u64 wr_id; + union { + u64 wr_id; + struct ib_cqe *wr_cqe; + }; struct ib_sge *sg_list; int num_sge; enum ib_wr_opcode opcode; @@ -1147,19 +1232,6 @@ static inline struct ib_reg_wr *reg_wr(struct ib_send_wr *wr) return container_of(wr, struct ib_reg_wr, wr); } -struct ib_bind_mw_wr { - struct ib_send_wr wr; - struct ib_mw *mw; - /* The new rkey for the memory window. */ - u32 rkey; - struct ib_mw_bind_info bind_info; -}; - -static inline struct ib_bind_mw_wr *bind_mw_wr(struct ib_send_wr *wr) -{ - return container_of(wr, struct ib_bind_mw_wr, wr); -} - struct ib_sig_handover_wr { struct ib_send_wr wr; struct ib_sig_attrs *sig_attrs; @@ -1175,7 +1247,10 @@ static inline struct ib_sig_handover_wr *sig_handover_wr(struct ib_send_wr *wr) struct ib_recv_wr { struct ib_recv_wr *next; - u64 wr_id; + union { + u64 wr_id; + struct ib_cqe *wr_cqe; + }; struct ib_sge *sg_list; int num_sge; }; @@ -1190,20 +1265,10 @@ enum ib_access_flags { IB_ACCESS_ON_DEMAND = (1<<6), }; -struct ib_phys_buf { - u64 addr; - u64 size; -}; - -struct ib_mr_attr { - struct ib_pd *pd; - u64 device_virt_addr; - u64 size; - int mr_access_flags; - u32 lkey; - u32 rkey; -}; - +/* + * XXX: these are apparently used for ->rereg_user_mr, no idea why they + * are hidden here instead of a uapi header! + */ enum ib_mr_rereg_flags { IB_MR_REREG_TRANS = 1, IB_MR_REREG_PD = (1<<1), @@ -1211,18 +1276,6 @@ enum ib_mr_rereg_flags { IB_MR_REREG_SUPPORTED = ((IB_MR_REREG_ACCESS << 1) - 1) }; -/** - * struct ib_mw_bind - Parameters for a type 1 memory window bind operation. - * @wr_id: Work request id. - * @send_flags: Flags from ib_send_flags enum. - * @bind_info: More parameters of the bind operation. 
- */ -struct ib_mw_bind { - u64 wr_id; - int send_flags; - struct ib_mw_bind_info bind_info; -}; - struct ib_fmr_attr { int max_pages; int max_maps; @@ -1307,6 +1360,12 @@ struct ib_ah { typedef void (*ib_comp_handler)(struct ib_cq *cq, void *cq_context); +enum ib_poll_context { + IB_POLL_DIRECT, /* caller context, no hw completions */ + IB_POLL_SOFTIRQ, /* poll from softirq context */ + IB_POLL_WORKQUEUE, /* poll from workqueue */ +}; + struct ib_cq { struct ib_device *device; struct ib_uobject *uobject; @@ -1315,6 +1374,12 @@ struct ib_cq { void *cq_context; int cqe; atomic_t usecnt; /* count number of work queues */ + enum ib_poll_context poll_ctx; + struct ib_wc *wc; + union { + struct irq_poll iop; + struct work_struct work; + }; }; struct ib_srq { @@ -1363,7 +1428,6 @@ struct ib_mr { u64 iova; u32 length; unsigned int page_size; - atomic_t usecnt; /* count number of MWs */ }; struct ib_mw { @@ -1724,11 +1788,6 @@ struct ib_device { int wc_cnt); struct ib_mr * (*get_dma_mr)(struct ib_pd *pd, int mr_access_flags); - struct ib_mr * (*reg_phys_mr)(struct ib_pd *pd, - struct ib_phys_buf *phys_buf_array, - int num_phys_buf, - int mr_access_flags, - u64 *iova_start); struct ib_mr * (*reg_user_mr)(struct ib_pd *pd, u64 start, u64 length, u64 virt_addr, @@ -1741,8 +1800,6 @@ struct ib_device { int mr_access_flags, struct ib_pd *pd, struct ib_udata *udata); - int (*query_mr)(struct ib_mr *mr, - struct ib_mr_attr *mr_attr); int (*dereg_mr)(struct ib_mr *mr); struct ib_mr * (*alloc_mr)(struct ib_pd *pd, enum ib_mr_type mr_type, @@ -1750,18 +1807,8 @@ struct ib_device { int (*map_mr_sg)(struct ib_mr *mr, struct scatterlist *sg, int sg_nents); - int (*rereg_phys_mr)(struct ib_mr *mr, - int mr_rereg_mask, - struct ib_pd *pd, - struct ib_phys_buf *phys_buf_array, - int num_phys_buf, - int mr_access_flags, - u64 *iova_start); struct ib_mw * (*alloc_mw)(struct ib_pd *pd, enum ib_mw_type type); - int (*bind_mw)(struct ib_qp *qp, - struct ib_mw *mw, - struct ib_mw_bind *mw_bind); int (*dealloc_mw)(struct ib_mw *mw); struct ib_fmr * (*alloc_fmr)(struct ib_pd *pd, int mr_access_flags, @@ -1823,6 +1870,7 @@ struct ib_device { u16 is_switch:1; u8 node_type; u8 phys_port_cnt; + struct ib_device_attr attrs; /** * The following mandatory functions are used only at device @@ -1888,6 +1936,31 @@ static inline int ib_copy_to_udata(struct ib_udata *udata, void *src, size_t len return copy_to_user(udata->outbuf, src, len) ? 
-EFAULT : 0; } +static inline bool ib_is_udata_cleared(struct ib_udata *udata, + size_t offset, + size_t len) +{ + const void __user *p = udata->inbuf + offset; + bool ret = false; + u8 *buf; + + if (len > USHRT_MAX) + return false; + + buf = kmalloc(len, GFP_KERNEL); + if (!buf) + return false; + + if (copy_from_user(buf, p, len)) + goto free; + + ret = !memchr_inv(buf, 0, len); + +free: + kfree(buf); + return ret; +} + /** * ib_modify_qp_is_ok - Check that the supplied attribute mask * contains all required attributes and no attributes not allowed for @@ -1912,9 +1985,6 @@ int ib_register_event_handler (struct ib_event_handler *event_handler); int ib_unregister_event_handler(struct ib_event_handler *event_handler); void ib_dispatch_event(struct ib_event *event); -int ib_query_device(struct ib_device *device, - struct ib_device_attr *device_attr); - int ib_query_port(struct ib_device *device, u8 port_num, struct ib_port_attr *port_attr); @@ -1968,6 +2038,17 @@ static inline bool rdma_protocol_ib(const struct ib_device *device, u8 port_num) static inline bool rdma_protocol_roce(const struct ib_device *device, u8 port_num) { + return device->port_immutable[port_num].core_cap_flags & + (RDMA_CORE_CAP_PROT_ROCE | RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP); +} + +static inline bool rdma_protocol_roce_udp_encap(const struct ib_device *device, u8 port_num) +{ + return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_ROCE_UDP_ENCAP; +} + +static inline bool rdma_protocol_roce_eth_encap(const struct ib_device *device, u8 port_num) +{ return device->port_immutable[port_num].core_cap_flags & RDMA_CORE_CAP_PROT_ROCE; } @@ -1978,8 +2059,8 @@ static inline bool rdma_protocol_iwarp(const struct ib_device *device, u8 port_n static inline bool rdma_ib_or_roce(const struct ib_device *device, u8 port_num) { - return device->port_immutable[port_num].core_cap_flags & - (RDMA_CORE_CAP_PROT_IB | RDMA_CORE_CAP_PROT_ROCE); + return rdma_protocol_ib(device, port_num) || + rdma_protocol_roce(device, port_num); } /** @@ -2220,7 +2301,8 @@ int ib_modify_port(struct ib_device *device, struct ib_port_modify *port_modify); int ib_find_gid(struct ib_device *device, union ib_gid *gid, - struct net_device *ndev, u8 *port_num, u16 *index); + enum ib_gid_type gid_type, struct net_device *ndev, + u8 *port_num, u16 *index); int ib_find_pkey(struct ib_device *device, u8 port_num, u16 pkey, u16 *index); @@ -2454,6 +2536,11 @@ static inline int ib_post_recv(struct ib_qp *qp, return qp->device->post_recv(qp, recv_wr, bad_recv_wr); } +struct ib_cq *ib_alloc_cq(struct ib_device *dev, void *private, + int nr_cqe, int comp_vector, enum ib_poll_context poll_ctx); +void ib_free_cq(struct ib_cq *cq); +int ib_process_cq_direct(struct ib_cq *cq, int budget); + /** * ib_create_cq - Creates a CQ on the specified device. * @device: The device on which to create the CQ. @@ -2839,13 +2926,6 @@ static inline void ib_dma_free_coherent(struct ib_device *dev, } /** - * ib_query_mr - Retrieves information about a specific memory region. - * @mr: The memory region to retrieve information about. - * @mr_attr: The attributes of the specified memory region. - */ -int ib_query_mr(struct ib_mr *mr, struct ib_mr_attr *mr_attr); - -/** * ib_dereg_mr - Deregisters a memory region and removes it from the * HCA translation table. * @mr: The memory region to deregister. @@ -2882,42 +2962,6 @@ static inline u32 ib_inc_rkey(u32 rkey) } /** - * ib_alloc_mw - Allocates a memory window. - * @pd: The protection domain associated with the memory window. 
- * @type: The type of the memory window (1 or 2). - */ -struct ib_mw *ib_alloc_mw(struct ib_pd *pd, enum ib_mw_type type); - -/** - * ib_bind_mw - Posts a work request to the send queue of the specified - * QP, which binds the memory window to the given address range and - * remote access attributes. - * @qp: QP to post the bind work request on. - * @mw: The memory window to bind. - * @mw_bind: Specifies information about the memory window, including - * its address range, remote access rights, and associated memory region. - * - * If there is no immediate error, the function will update the rkey member - * of the mw parameter to its new value. The bind operation can still fail - * asynchronously. - */ -static inline int ib_bind_mw(struct ib_qp *qp, - struct ib_mw *mw, - struct ib_mw_bind *mw_bind) -{ - /* XXX reference counting in corresponding MR? */ - return mw->device->bind_mw ? - mw->device->bind_mw(qp, mw, mw_bind) : - -ENOSYS; -} - -/** - * ib_dealloc_mw - Deallocates a memory window. - * @mw: The memory window to deallocate. - */ -int ib_dealloc_mw(struct ib_mw *mw); - -/** * ib_alloc_fmr - Allocates a unmapped fast memory region. * @pd: The protection domain associated with the unmapped region. * @mr_access_flags: Specifies the memory access rights. diff --git a/include/scsi/iser.h b/include/scsi/iser.h new file mode 100644 index 000000000000..2e678fa74eca --- /dev/null +++ b/include/scsi/iser.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2015 Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ISCSI_ISER_H +#define ISCSI_ISER_H + +#define ISER_ZBVA_NOT_SUP 0x80 +#define ISER_SEND_W_INV_NOT_SUP 0x40 +#define ISERT_ZBVA_NOT_USED 0x80 +#define ISERT_SEND_W_INV_NOT_USED 0x40 + +#define ISCSI_CTRL 0x10 +#define ISER_HELLO 0x20 +#define ISER_HELLORPLY 0x30 + +#define ISER_VER 0x10 +#define ISER_WSV 0x08 +#define ISER_RSV 0x04 + +/** + * struct iser_cm_hdr - iSER CM header (from iSER Annex A12) + * + * @flags: flags support (zbva, send_w_inv) + * @rsvd: reserved + */ +struct iser_cm_hdr { + u8 flags; + u8 rsvd[3]; +} __packed; + +/** + * struct iser_ctrl - iSER header of iSCSI control PDU + * + * @flags: opcode and read/write valid bits + * @rsvd: reserved + * @write_stag: write rkey + * @write_va: write virtual address + * @read_stag: read rkey + * @read_va: read virtual address + */ +struct iser_ctrl { + u8 flags; + u8 rsvd[3]; + __be32 write_stag; + __be64 write_va; + __be32 read_stag; + __be64 read_va; +} __packed; + +#endif /* ISCSI_ISER_H */ diff --git a/include/sound/timer.h b/include/sound/timer.h index 7990469a44ce..c4d76ff056c6 100644 --- a/include/sound/timer.h +++ b/include/sound/timer.h @@ -104,6 +104,7 @@ struct snd_timer_instance { int event, struct timespec * tstamp, unsigned long resolution); + void (*disconnect)(struct snd_timer_instance *timeri); void *callback_data; unsigned long ticks; /* auto-load ticks when expired */ unsigned long cticks; /* current ticks */ diff --git a/include/trace/events/ext4.h b/include/trace/events/ext4.h index 594b4b29a224..4e4b2fa78609 100644 --- a/include/trace/events/ext4.h +++ b/include/trace/events/ext4.h @@ -43,7 +43,7 @@ struct extent_status; { EXT4_GET_BLOCKS_METADATA_NOFAIL, "METADATA_NOFAIL" }, \ { EXT4_GET_BLOCKS_NO_NORMALIZE, "NO_NORMALIZE" }, \ { EXT4_GET_BLOCKS_KEEP_SIZE, "KEEP_SIZE" }, \ - { EXT4_GET_BLOCKS_NO_LOCK, "NO_LOCK" }) + { EXT4_GET_BLOCKS_ZERO, "ZERO" }) #define show_mflags(flags) __print_flags(flags, "", \ { EXT4_MAP_NEW, "N" }, \ diff --git a/include/trace/events/huge_memory.h b/include/trace/events/huge_memory.h index 0f803d2783e3..47c6212d8f3c 100644 --- a/include/trace/events/huge_memory.h +++ b/include/trace/events/huge_memory.h @@ -46,10 +46,10 @@ SCAN_STATUS TRACE_EVENT(mm_khugepaged_scan_pmd, - TP_PROTO(struct mm_struct *mm, unsigned long pfn, bool writable, + TP_PROTO(struct mm_struct *mm, struct page *page, bool writable, bool referenced, int none_or_zero, int status), - TP_ARGS(mm, pfn, writable, referenced, none_or_zero, status), + TP_ARGS(mm, page, writable, referenced, none_or_zero, status), TP_STRUCT__entry( __field(struct mm_struct *, mm) @@ -62,7 +62,7 @@ TRACE_EVENT(mm_khugepaged_scan_pmd, TP_fast_assign( __entry->mm = mm; - __entry->pfn = pfn; + __entry->pfn = page ? page_to_pfn(page) : -1; __entry->writable = writable; __entry->referenced = referenced; __entry->none_or_zero = none_or_zero; @@ -104,10 +104,10 @@ TRACE_EVENT(mm_collapse_huge_page, TRACE_EVENT(mm_collapse_huge_page_isolate, - TP_PROTO(unsigned long pfn, int none_or_zero, + TP_PROTO(struct page *page, int none_or_zero, bool referenced, bool writable, int status), - TP_ARGS(pfn, none_or_zero, referenced, writable, status), + TP_ARGS(page, none_or_zero, referenced, writable, status), TP_STRUCT__entry( __field(unsigned long, pfn) @@ -118,7 +118,7 @@ TRACE_EVENT(mm_collapse_huge_page_isolate, ), TP_fast_assign( - __entry->pfn = pfn; + __entry->pfn = page ?
page_to_pfn(page) : -1; __entry->none_or_zero = none_or_zero; __entry->referenced = referenced; __entry->writable = writable; diff --git a/include/trace/events/irq.h b/include/trace/events/irq.h index ff8f6c091a15..f95f25e786ef 100644 --- a/include/trace/events/irq.h +++ b/include/trace/events/irq.h @@ -15,7 +15,7 @@ struct softirq_action; softirq_name(NET_TX) \ softirq_name(NET_RX) \ softirq_name(BLOCK) \ - softirq_name(BLOCK_IOPOLL) \ + softirq_name(IRQ_POLL) \ softirq_name(TASKLET) \ softirq_name(SCHED) \ softirq_name(HRTIMER) \ diff --git a/include/uapi/linux/Kbuild b/include/uapi/linux/Kbuild index c2e5d6cb34e3..ebd10e624598 100644 --- a/include/uapi/linux/Kbuild +++ b/include/uapi/linux/Kbuild @@ -307,7 +307,7 @@ header-y += nfs_mount.h header-y += nl80211.h header-y += n_r3964.h header-y += nubus.h -header-y += nvme.h +header-y += nvme_ioctl.h header-y += nvram.h header-y += omap3isp.h header-y += omapfb.h diff --git a/include/uapi/linux/fs.h b/include/uapi/linux/fs.h index b8a5b3b86ad6..41e0433b4a83 100644 --- a/include/uapi/linux/fs.h +++ b/include/uapi/linux/fs.h @@ -2,8 +2,11 @@ #define _UAPI_LINUX_FS_H /* - * This file has definitions for some important file table - * structures etc. + * This file has definitions for some important file table structures + * and constants and structures used by various generic file system + * ioctl's. Please do not make any changes in this file before + * sending patches for review to linux-fsdevel@vger.kernel.org and + * linux-api@vger.kernel.org. */ #include <linux/limits.h> @@ -146,6 +149,37 @@ struct inodes_stat_t { #define MS_MGC_VAL 0xC0ED0000 #define MS_MGC_MSK 0xffff0000 +/* + * Structure for FS_IOC_FSGETXATTR[A] and FS_IOC_FSSETXATTR. + */ +struct fsxattr { + __u32 fsx_xflags; /* xflags field value (get/set) */ + __u32 fsx_extsize; /* extsize field value (get/set)*/ + __u32 fsx_nextents; /* nextents field value (get) */ + __u32 fsx_projid; /* project identifier (get/set) */ + unsigned char fsx_pad[12]; +}; + +/* + * Flags for the fsx_xflags field + */ +#define FS_XFLAG_REALTIME 0x00000001 /* data in realtime volume */ +#define FS_XFLAG_PREALLOC 0x00000002 /* preallocated file extents */ +#define FS_XFLAG_IMMUTABLE 0x00000008 /* file cannot be modified */ +#define FS_XFLAG_APPEND 0x00000010 /* all writes append */ +#define FS_XFLAG_SYNC 0x00000020 /* all writes synchronous */ +#define FS_XFLAG_NOATIME 0x00000040 /* do not update access time */ +#define FS_XFLAG_NODUMP 0x00000080 /* do not include in backups */ +#define FS_XFLAG_RTINHERIT 0x00000100 /* create with rt bit set */ +#define FS_XFLAG_PROJINHERIT 0x00000200 /* create with parents projid */ +#define FS_XFLAG_NOSYMLINKS 0x00000400 /* disallow symlink creation */ +#define FS_XFLAG_EXTSIZE 0x00000800 /* extent size allocator hint */ +#define FS_XFLAG_EXTSZINHERIT 0x00001000 /* inherit inode extent size */ +#define FS_XFLAG_NODEFRAG 0x00002000 /* do not defragment */ +#define FS_XFLAG_FILESTREAM 0x00004000 /* use filestream allocator */ +#define FS_XFLAG_DAX 0x00008000 /* use DAX for IO */ +#define FS_XFLAG_HASATTR 0x80000000 /* no DIFLAG for this */ + /* the read-only stuff doesn't really belong here, but any other place is probably as bad and I don't want to create yet another include file. 
*/ @@ -210,9 +244,28 @@ struct inodes_stat_t { #define FS_IOC32_SETFLAGS _IOW('f', 2, int) #define FS_IOC32_GETVERSION _IOR('v', 1, int) #define FS_IOC32_SETVERSION _IOW('v', 2, int) +#define FS_IOC_FSGETXATTR _IOR ('X', 31, struct fsxattr) +#define FS_IOC_FSSETXATTR _IOW ('X', 32, struct fsxattr) /* * Inode flags (FS_IOC_GETFLAGS / FS_IOC_SETFLAGS) + * + * Note: for historical reasons, these flags were originally used and + * defined for use by ext2/ext3, and then other file systems started + * using these flags so they wouldn't need to write their own version + * of chattr/lsattr (which was shipped as part of e2fsprogs). You + * should think twice before trying to use these flags in new + * contexts, or trying to assign these flags, since they are used both + * as the UAPI and the on-disk encoding for ext2/3/4. Also, we are + * almost out of 32-bit flags. :-) + * + * We have recently hoisted FS_IOC_FSGETXATTR / FS_IOC_FSSETXATTR from + * XFS to the generic FS level interface. This uses a structure that + * has padding and hence has more room to grow, so it may be more + * appropriate for many new use cases. + * + * Please do not change these flags or interfaces before checking with + * linux-fsdevel@vger.kernel.org and linux-api@vger.kernel.org. */ #define FS_SECRM_FL 0x00000001 /* Secure deletion */ #define FS_UNRM_FL 0x00000002 /* Undelete */ @@ -226,8 +279,8 @@ struct inodes_stat_t { #define FS_DIRTY_FL 0x00000100 #define FS_COMPRBLK_FL 0x00000200 /* One or more compressed clusters */ #define FS_NOCOMP_FL 0x00000400 /* Don't compress */ -#define FS_ECOMPR_FL 0x00000800 /* Compression error */ /* End compression flags --- maybe not all used */ +#define FS_ENCRYPT_FL 0x00000800 /* Encrypted file */ #define FS_BTREE_FL 0x00001000 /* btree format dir */ #define FS_INDEX_FL 0x00001000 /* hash-indexed directory */ #define FS_IMAGIC_FL 0x00002000 /* AFS directory */ @@ -235,9 +288,12 @@ struct inodes_stat_t { #define FS_NOTAIL_FL 0x00008000 /* file tail should not be merged */ #define FS_DIRSYNC_FL 0x00010000 /* dirsync behaviour (directories only) */ #define FS_TOPDIR_FL 0x00020000 /* Top of directory hierarchies*/ +#define FS_HUGE_FILE_FL 0x00040000 /* Reserved for ext4 */ #define FS_EXTENT_FL 0x00080000 /* Extents */ -#define FS_DIRECTIO_FL 0x00100000 /* Use direct i/o */ +#define FS_EA_INODE_FL 0x00200000 /* Inode used for large EA */ +#define FS_EOFBLOCKS_FL 0x00400000 /* Reserved for ext4 */ #define FS_NOCOW_FL 0x00800000 /* Do not cow file */ +#define FS_INLINE_DATA_FL 0x10000000 /* Reserved for ext4 */ #define FS_PROJINHERIT_FL 0x20000000 /* Create with parents projid */ #define FS_RESERVED_FL 0x80000000 /* reserved for ext2 lib */ diff --git a/include/uapi/linux/lightnvm.h b/include/uapi/linux/lightnvm.h index 928f98997d8a..774a43128a7a 100644 --- a/include/uapi/linux/lightnvm.h +++ b/include/uapi/linux/lightnvm.h @@ -33,6 +33,7 @@ #define NVM_TTYPE_NAME_MAX 48 #define NVM_TTYPE_MAX 63 +#define NVM_MMTYPE_LEN 8 #define NVM_CTRL_FILE "/dev/lightnvm/control" @@ -100,6 +101,26 @@ struct nvm_ioctl_remove { __u32 flags; }; +struct nvm_ioctl_dev_init { + char dev[DISK_NAME_LEN]; /* open-channel SSD device */ + char mmtype[NVM_MMTYPE_LEN]; /* register to media manager */ + + __u32 flags; +}; + +enum { + NVM_FACTORY_ERASE_ONLY_USER = 1 << 0, /* erase only blocks used as + * host blks or grown blks */ + NVM_FACTORY_RESET_HOST_BLKS = 1 << 1, /* remove host blk marks */ + NVM_FACTORY_RESET_GRWN_BBLKS = 1 << 2, /* remove grown blk marks */ + NVM_FACTORY_NR_BITS = 1 << 3, /* stops here */ +}; + 
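(A usage illustration, not part of the patch: a minimal userspace sketch of a factory reset through the NVM_DEV_FACTORY ioctl and the struct nvm_ioctl_dev_factory defined just below. The device name is made up, and it assumes this uapi header makes DISK_NAME_LEN and NVM_CTRL_FILE visible to userspace builds.)

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/lightnvm.h>

int main(void)
{
	struct nvm_ioctl_dev_factory fact;
	int fd = open(NVM_CTRL_FILE, O_RDWR);	/* "/dev/lightnvm/control" */

	if (fd < 0) {
		perror("open");
		return 1;
	}
	memset(&fact, 0, sizeof(fact));
	strncpy(fact.dev, "nvme0n1", DISK_NAME_LEN - 1);	/* illustrative device name */
	fact.flags = NVM_FACTORY_RESET_HOST_BLKS;	/* clear host-side block marks */
	if (ioctl(fd, NVM_DEV_FACTORY, &fact) < 0)
		perror("NVM_DEV_FACTORY");
	close(fd);
	return 0;
}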
+struct nvm_ioctl_dev_factory { + char dev[DISK_NAME_LEN]; + + __u32 flags; +}; /* The ioctl type, 'L', 0x20 - 0x2F documented in ioctl-number.txt */ enum { @@ -110,6 +131,12 @@ enum { /* device level cmds */ NVM_DEV_CREATE_CMD, NVM_DEV_REMOVE_CMD, + + /* Init a device to support LightNVM media managers */ + NVM_DEV_INIT_CMD, + + /* Factory reset device */ + NVM_DEV_FACTORY_CMD, }; #define NVM_IOCTL 'L' /* 0x4c */ @@ -122,6 +149,10 @@ enum { struct nvm_ioctl_create) #define NVM_DEV_REMOVE _IOW(NVM_IOCTL, NVM_DEV_REMOVE_CMD, \ struct nvm_ioctl_remove) +#define NVM_DEV_INIT _IOW(NVM_IOCTL, NVM_DEV_INIT_CMD, \ + struct nvm_ioctl_dev_init) +#define NVM_DEV_FACTORY _IOW(NVM_IOCTL, NVM_DEV_FACTORY_CMD, \ + struct nvm_ioctl_dev_factory) #define NVM_VERSION_MAJOR 1 #define NVM_VERSION_MINOR 0 diff --git a/include/xen/interface/io/blkif.h b/include/xen/interface/io/blkif.h index c33e1c489eb2..8b8cfadf7833 100644 --- a/include/xen/interface/io/blkif.h +++ b/include/xen/interface/io/blkif.h @@ -28,6 +28,54 @@ typedef uint16_t blkif_vdev_t; typedef uint64_t blkif_sector_t; /* + * Multiple hardware queues/rings: + * If supported, the backend will write the key "multi-queue-max-queues" to + * the directory for that vbd, and set its value to the maximum supported + * number of queues. + * Frontends that are aware of this feature and wish to use it can write the + * key "multi-queue-num-queues" with the number they wish to use, which must be + * greater than zero, and no more than the value reported by the backend in + * "multi-queue-max-queues". + * + * For frontends requesting just one queue, the usual event-channel and + * ring-ref keys are written as before, simplifying the backend processing + * to avoid distinguishing between a frontend that doesn't understand the + * multi-queue feature, and one that does, but requested only one queue. + * + * Frontends requesting two or more queues must not write the toplevel + * event-channel and ring-ref keys, instead writing those keys under sub-keys + * having the name "queue-N" where N is the integer ID of the queue/ring for + * which those keys belong. Queues are indexed from zero. + * For example, a frontend with two queues must write the following set of + * queue-related keys: + * + * /local/domain/1/device/vbd/0/multi-queue-num-queues = "2" + * /local/domain/1/device/vbd/0/queue-0 = "" + * /local/domain/1/device/vbd/0/queue-0/ring-ref = "<ring-ref#0>" + * /local/domain/1/device/vbd/0/queue-0/event-channel = "<evtchn#0>" + * /local/domain/1/device/vbd/0/queue-1 = "" + * /local/domain/1/device/vbd/0/queue-1/ring-ref = "<ring-ref#1>" + * /local/domain/1/device/vbd/0/queue-1/event-channel = "<evtchn#1>" + * + * It is also possible to use multiple queues/rings together with the + * multi-page ring buffer feature.
+ * For example, a frontend requesting two queues/rings, where each ring + * buffer is two pages in size, must write the following set of related keys: + * + * /local/domain/1/device/vbd/0/multi-queue-num-queues = "2" + * /local/domain/1/device/vbd/0/ring-page-order = "1" + * /local/domain/1/device/vbd/0/queue-0 = "" + * /local/domain/1/device/vbd/0/queue-0/ring-ref0 = "<ring-ref#0>" + * /local/domain/1/device/vbd/0/queue-0/ring-ref1 = "<ring-ref#1>" + * /local/domain/1/device/vbd/0/queue-0/event-channel = "<evtchn#0>" + * /local/domain/1/device/vbd/0/queue-1 = "" + * /local/domain/1/device/vbd/0/queue-1/ring-ref0 = "<ring-ref#2>" + * /local/domain/1/device/vbd/0/queue-1/ring-ref1 = "<ring-ref#3>" + * /local/domain/1/device/vbd/0/queue-1/event-channel = "<evtchn#1>" + * + */ + +/* * REQUEST CODES. */ #define BLKIF_OP_READ 0 diff --git a/ipc/mqueue.c b/ipc/mqueue.c index f4617cf07069..781c1399c6a3 100644 --- a/ipc/mqueue.c +++ b/ipc/mqueue.c @@ -795,7 +795,7 @@ SYSCALL_DEFINE4(mq_open, const char __user *, u_name, int, oflag, umode_t, mode, ro = mnt_want_write(mnt); /* we'll drop it in any case */ error = 0; - mutex_lock(&d_inode(root)->i_mutex); + inode_lock(d_inode(root)); path.dentry = lookup_one_len(name->name, root, strlen(name->name)); if (IS_ERR(path.dentry)) { error = PTR_ERR(path.dentry); @@ -841,7 +841,7 @@ out_putfd: put_unused_fd(fd); fd = error; } - mutex_unlock(&d_inode(root)->i_mutex); + inode_unlock(d_inode(root)); if (!ro) mnt_drop_write(mnt); out_putname: @@ -866,7 +866,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) err = mnt_want_write(mnt); if (err) goto out_name; - mutex_lock_nested(&d_inode(mnt->mnt_root)->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(d_inode(mnt->mnt_root), I_MUTEX_PARENT); dentry = lookup_one_len(name->name, mnt->mnt_root, strlen(name->name)); if (IS_ERR(dentry)) { @@ -884,7 +884,7 @@ SYSCALL_DEFINE1(mq_unlink, const char __user *, u_name) dput(dentry); out_unlock: - mutex_unlock(&d_inode(mnt->mnt_root)->i_mutex); + inode_unlock(d_inode(mnt->mnt_root)); if (inode) iput(inode); mnt_drop_write(mnt); diff --git a/ipc/sem.c b/ipc/sem.c index b471e5a3863d..cddd5b5fde51 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1493,7 +1493,7 @@ out_rcu_wakeup: wake_up_sem_queue_do(&tasks); out_free: if (sem_io != fast_sem_io) - ipc_free(sem_io, sizeof(ushort)*nsems); + ipc_free(sem_io); return err; } diff --git a/ipc/util.c b/ipc/util.c index 0f401d94b7c6..798cad18dd87 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -414,17 +414,12 @@ void *ipc_alloc(int size) /** * ipc_free - free ipc space * @ptr: pointer returned by ipc_alloc - * @size: size of block * - * Free a block created with ipc_alloc(). The caller must know the size - * used in the allocation call. + * Free a block created with ipc_alloc(). */ -void ipc_free(void *ptr, int size) +void ipc_free(void *ptr) { - if (size > PAGE_SIZE) - vfree(ptr); - else - kfree(ptr); + kvfree(ptr); } /** diff --git a/ipc/util.h b/ipc/util.h index 3a8a5a0eca62..51f7ca58ac67 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -118,7 +118,7 @@ int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flg); * both functions can sleep */ void *ipc_alloc(int size); -void ipc_free(void *ptr, int size); +void ipc_free(void *ptr); /* * For allocations that need to be freed by RCU.
diff --git a/kernel/audit_fsnotify.c b/kernel/audit_fsnotify.c index 27c6046c2c3d..f84f8d06e1f6 100644 --- a/kernel/audit_fsnotify.c +++ b/kernel/audit_fsnotify.c @@ -95,7 +95,7 @@ struct audit_fsnotify_mark *audit_alloc_mark(struct audit_krule *krule, char *pa if (IS_ERR(dentry)) return (void *)dentry; /* returning an error */ inode = path.dentry->d_inode; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); audit_mark = kzalloc(sizeof(*audit_mark), GFP_KERNEL); if (unlikely(!audit_mark)) { diff --git a/kernel/audit_watch.c b/kernel/audit_watch.c index 656c7e93ac0d..9f194aad0adc 100644 --- a/kernel/audit_watch.c +++ b/kernel/audit_watch.c @@ -364,7 +364,7 @@ static int audit_get_nd(struct audit_watch *watch, struct path *parent) struct dentry *d = kern_path_locked(watch->path, parent); if (IS_ERR(d)) return PTR_ERR(d); - mutex_unlock(&d_backing_inode(parent->dentry)->i_mutex); + inode_unlock(d_backing_inode(parent->dentry)); if (d_is_positive(d)) { /* update watch filter fields */ watch->dev = d_backing_inode(d)->i_sb->s_dev; diff --git a/kernel/events/core.c b/kernel/events/core.c index c0957416b32e..06ae52e99ac2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -4872,9 +4872,9 @@ static int perf_fasync(int fd, struct file *filp, int on) struct perf_event *event = filp->private_data; int retval; - mutex_lock(&inode->i_mutex); + inode_lock(inode); retval = fasync_helper(fd, filp, on, &event->fasync); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (retval < 0) return retval; diff --git a/kernel/relay.c b/kernel/relay.c index 0b4570cfacae..074994bcfa9b 100644 --- a/kernel/relay.c +++ b/kernel/relay.c @@ -1133,7 +1133,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, if (!desc->count) return 0; - mutex_lock(&file_inode(filp)->i_mutex); + inode_lock(file_inode(filp)); do { if (!relay_file_read_avail(buf, *ppos)) break; @@ -1153,7 +1153,7 @@ static ssize_t relay_file_read_subbufs(struct file *filp, loff_t *ppos, *ppos = relay_file_read_end_pos(buf, read_start, ret); } } while (desc->count && ret); - mutex_unlock(&file_inode(filp)->i_mutex); + inode_unlock(file_inode(filp)); return desc->written; } diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 44253adb3c36..63d3a24e081a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -222,9 +222,9 @@ sched_feat_write(struct file *filp, const char __user *ubuf, /* Ensure the static_key remains in a consistent state */ inode = file_inode(filp); - mutex_lock(&inode->i_mutex); + inode_lock(inode); i = sched_feat_set(cmp); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (i == __SCHED_FEAT_NR) return -EINVAL; diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 91420362e0b3..97715fd9e790 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1757,6 +1757,20 @@ static struct ctl_table fs_table[] = { .proc_handler = &pipe_proc_fn, .extra1 = &pipe_min_size, }, + { + .procname = "pipe-user-pages-hard", + .data = &pipe_user_pages_hard, + .maxlen = sizeof(pipe_user_pages_hard), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, + { + .procname = "pipe-user-pages-soft", + .data = &pipe_user_pages_soft, + .maxlen = sizeof(pipe_user_pages_soft), + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, { } }; diff --git a/lib/Kconfig b/lib/Kconfig index 5a0c1c83cdf0..133ebc0c1773 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -210,9 +210,11 @@ config RANDOM32_SELFTEST # compression support is select'ed if needed # config 842_COMPRESS + select CRC32 tristate config 
842_DECOMPRESS + select CRC32 tristate config ZLIB_INFLATE @@ -475,6 +477,11 @@ config DDR information. This data is useful for drivers handling DDR SDRAM controllers. +config IRQ_POLL + bool "IRQ polling library" + help + Helper library for interrupt mitigation using polling. + config MPILIB tristate select CLZ_TAB diff --git a/lib/Makefile b/lib/Makefile index 2d4bc33d09b4..a7c26a41a738 100644 --- a/lib/Makefile +++ b/lib/Makefile @@ -165,6 +165,7 @@ obj-$(CONFIG_GENERIC_NET_UTILS) += net_utils.o obj-$(CONFIG_SG_SPLIT) += sg_split.o obj-$(CONFIG_STMP_DEVICE) += stmp_device.o +obj-$(CONFIG_IRQ_POLL) += irq_poll.o libfdt_files = fdt.o fdt_ro.o fdt_wip.o fdt_rw.o fdt_sw.o fdt_strerror.o \ fdt_empty_tree.o diff --git a/block/blk-iopoll.c b/lib/irq_poll.c index 0736729d6494..836f7db4e548 100644 --- a/block/blk-iopoll.c +++ b/lib/irq_poll.c @@ -6,84 +6,84 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/bio.h> -#include <linux/blkdev.h> #include <linux/interrupt.h> #include <linux/cpu.h> -#include <linux/blk-iopoll.h> +#include <linux/irq_poll.h> #include <linux/delay.h> -#include "blk.h" - -static unsigned int blk_iopoll_budget __read_mostly = 256; +static unsigned int irq_poll_budget __read_mostly = 256; static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll); /** - * blk_iopoll_sched - Schedule a run of the iopoll handler + * irq_poll_sched - Schedule a run of the iopoll handler * @iop: The parent iopoll structure * * Description: - * Add this blk_iopoll structure to the pending poll list and trigger the - * raise of the blk iopoll softirq. The driver must already have gotten a - * successful return from blk_iopoll_sched_prep() before calling this. + * Add this irq_poll structure to the pending poll list and trigger the + * raise of the irq_poll softirq. **/ -void blk_iopoll_sched(struct blk_iopoll *iop) +void irq_poll_sched(struct irq_poll *iop) { unsigned long flags; + if (test_bit(IRQ_POLL_F_DISABLE, &iop->state)) + return; + if (test_and_set_bit(IRQ_POLL_F_SCHED, &iop->state)) + return; + local_irq_save(flags); list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll)); - __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); + __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ); local_irq_restore(flags); } -EXPORT_SYMBOL(blk_iopoll_sched); +EXPORT_SYMBOL(irq_poll_sched); /** - * __blk_iopoll_complete - Mark this @iop as un-polled again + * __irq_poll_complete - Mark this @iop as un-polled again * @iop: The parent iopoll structure * * Description: - * See blk_iopoll_complete(). This function must be called with interrupts + * See irq_poll_complete(). This function must be called with interrupts * disabled. **/ -void __blk_iopoll_complete(struct blk_iopoll *iop) +static void __irq_poll_complete(struct irq_poll *iop) { list_del(&iop->list); smp_mb__before_atomic(); - clear_bit_unlock(IOPOLL_F_SCHED, &iop->state); + clear_bit_unlock(IRQ_POLL_F_SCHED, &iop->state); } -EXPORT_SYMBOL(__blk_iopoll_complete); /** - * blk_iopoll_complete - Mark this @iop as un-polled again + * irq_poll_complete - Mark this @iop as un-polled again * @iop: The parent iopoll structure * * Description: * If a driver consumes less than the assigned budget in its run of the * iopoll handler, it'll end the polled mode by calling this function. The - * iopoll handler will not be invoked again before blk_iopoll_sched_prep() + * iopoll handler will not be invoked again before irq_poll_sched() * is called.
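 * (Note: the separate blk_iopoll_sched_prep() step is gone; as the hunk above shows, irq_poll_sched() now tests and sets the SCHED bit itself and returns early if a poll is already scheduled or the instance is disabled.)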
**/ -void blk_iopoll_complete(struct blk_iopoll *iop) +void irq_poll_complete(struct irq_poll *iop) { unsigned long flags; local_irq_save(flags); - __blk_iopoll_complete(iop); + __irq_poll_complete(iop); local_irq_restore(flags); } -EXPORT_SYMBOL(blk_iopoll_complete); +EXPORT_SYMBOL(irq_poll_complete); -static void blk_iopoll_softirq(struct softirq_action *h) +static void irq_poll_softirq(struct softirq_action *h) { struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll); - int rearm = 0, budget = blk_iopoll_budget; + int rearm = 0, budget = irq_poll_budget; unsigned long start_time = jiffies; local_irq_disable(); while (!list_empty(list)) { - struct blk_iopoll *iop; + struct irq_poll *iop; int work, weight; /* @@ -101,11 +101,11 @@ static void blk_iopoll_softirq(struct softirq_action *h) * entries to the tail of this list, and only ->poll() * calls can remove this head entry from the list. */ - iop = list_entry(list->next, struct blk_iopoll, list); + iop = list_entry(list->next, struct irq_poll, list); weight = iop->weight; work = 0; - if (test_bit(IOPOLL_F_SCHED, &iop->state)) + if (test_bit(IRQ_POLL_F_SCHED, &iop->state)) work = iop->poll(iop, weight); budget -= work; @@ -121,72 +121,70 @@ static void blk_iopoll_softirq(struct softirq_action *h) * move the instance around on the list at-will. */ if (work >= weight) { - if (blk_iopoll_disable_pending(iop)) - __blk_iopoll_complete(iop); + if (test_bit(IRQ_POLL_F_DISABLE, &iop->state)) + __irq_poll_complete(iop); else list_move_tail(&iop->list, list); } } if (rearm) - __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); + __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ); local_irq_enable(); } /** - * blk_iopoll_disable - Disable iopoll on this @iop + * irq_poll_disable - Disable iopoll on this @iop * @iop: The parent iopoll structure * * Description: * Disable io polling and wait for any pending callbacks to have completed. **/ -void blk_iopoll_disable(struct blk_iopoll *iop) +void irq_poll_disable(struct irq_poll *iop) { - set_bit(IOPOLL_F_DISABLE, &iop->state); - while (test_and_set_bit(IOPOLL_F_SCHED, &iop->state)) + set_bit(IRQ_POLL_F_DISABLE, &iop->state); + while (test_and_set_bit(IRQ_POLL_F_SCHED, &iop->state)) msleep(1); - clear_bit(IOPOLL_F_DISABLE, &iop->state); + clear_bit(IRQ_POLL_F_DISABLE, &iop->state); } -EXPORT_SYMBOL(blk_iopoll_disable); +EXPORT_SYMBOL(irq_poll_disable); /** - * blk_iopoll_enable - Enable iopoll on this @iop + * irq_poll_enable - Enable iopoll on this @iop * @iop: The parent iopoll structure * * Description: * Enable iopoll on this @iop. Note that the handler run will not be * scheduled, it will only mark it as active. **/ -void blk_iopoll_enable(struct blk_iopoll *iop) +void irq_poll_enable(struct irq_poll *iop) { - BUG_ON(!test_bit(IOPOLL_F_SCHED, &iop->state)); + BUG_ON(!test_bit(IRQ_POLL_F_SCHED, &iop->state)); smp_mb__before_atomic(); - clear_bit_unlock(IOPOLL_F_SCHED, &iop->state); + clear_bit_unlock(IRQ_POLL_F_SCHED, &iop->state); } -EXPORT_SYMBOL(blk_iopoll_enable); +EXPORT_SYMBOL(irq_poll_enable); /** - * blk_iopoll_init - Initialize this @iop + * irq_poll_init - Initialize this @iop * @iop: The parent iopoll structure * @weight: The default weight (or command completion budget) * @poll_fn: The handler to invoke * * Description: - * Initialize this blk_iopoll structure. Before being actively used, the - * driver must call blk_iopoll_enable(). + * Initialize and enable this irq_poll structure. 
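 *
 * A minimal usage sketch (hypothetical driver; process_completions() is an
 * assumed helper that reaps up to @budget completions and returns how many
 * it handled):
 *
 *	static int my_poll(struct irq_poll *iop, int budget)
 *	{
 *		int done = process_completions(budget);
 *
 *		if (done < budget)
 *			irq_poll_complete(iop);
 *		return done;
 *	}
 *
 *	irq_poll_init(&dev->iop, 64, my_poll);
 *	...
 *	irq_poll_sched(&dev->iop);	called from the device's hard interrupt handler
 *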
**/ -void blk_iopoll_init(struct blk_iopoll *iop, int weight, blk_iopoll_fn *poll_fn) +void irq_poll_init(struct irq_poll *iop, int weight, irq_poll_fn *poll_fn) { memset(iop, 0, sizeof(*iop)); INIT_LIST_HEAD(&iop->list); iop->weight = weight; iop->poll = poll_fn; - set_bit(IOPOLL_F_SCHED, &iop->state); } -EXPORT_SYMBOL(blk_iopoll_init); +EXPORT_SYMBOL(irq_poll_init); -static int blk_iopoll_cpu_notify(struct notifier_block *self, +static int irq_poll_cpu_notify(struct notifier_block *self, unsigned long action, void *hcpu) { /* @@ -199,26 +197,26 @@ static int blk_iopoll_cpu_notify(struct notifier_block *self, local_irq_disable(); list_splice_init(&per_cpu(blk_cpu_iopoll, cpu), this_cpu_ptr(&blk_cpu_iopoll)); - __raise_softirq_irqoff(BLOCK_IOPOLL_SOFTIRQ); + __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ); local_irq_enable(); } return NOTIFY_OK; } -static struct notifier_block blk_iopoll_cpu_notifier = { - .notifier_call = blk_iopoll_cpu_notify, +static struct notifier_block irq_poll_cpu_notifier = { + .notifier_call = irq_poll_cpu_notify, }; -static __init int blk_iopoll_setup(void) +static __init int irq_poll_setup(void) { int i; for_each_possible_cpu(i) INIT_LIST_HEAD(&per_cpu(blk_cpu_iopoll, i)); - open_softirq(BLOCK_IOPOLL_SOFTIRQ, blk_iopoll_softirq); - register_hotcpu_notifier(&blk_iopoll_cpu_notifier); + open_softirq(IRQ_POLL_SOFTIRQ, irq_poll_softirq); + register_hotcpu_notifier(&irq_poll_cpu_notifier); return 0; } -subsys_initcall(blk_iopoll_setup); +subsys_initcall(irq_poll_setup); diff --git a/lib/libcrc32c.c b/lib/libcrc32c.c index 31ce853fbfb1..74a54b7f2562 100644 --- a/lib/libcrc32c.c +++ b/lib/libcrc32c.c @@ -75,3 +75,4 @@ module_exit(libcrc32c_mod_fini); MODULE_AUTHOR("Clay Haapala <chaapala@cisco.com>"); MODULE_DESCRIPTION("CRC32c (Castagnoli) calculations"); MODULE_LICENSE("GPL"); +MODULE_SOFTDEP("pre: crc32c"); diff --git a/lib/lru_cache.c b/lib/lru_cache.c index 028f5d996eef..28ba40b99337 100644 --- a/lib/lru_cache.c +++ b/lib/lru_cache.c @@ -238,7 +238,7 @@ void lc_reset(struct lru_cache *lc) * @seq: the seq_file to print into * @lc: the lru cache to print statistics of */ -size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc) +void lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc) { /* NOTE: * total calls to lc_get are @@ -250,8 +250,6 @@ size_t lc_seq_printf_stats(struct seq_file *seq, struct lru_cache *lc) seq_printf(seq, "\t%s: used:%u/%u hits:%lu misses:%lu starving:%lu locked:%lu changed:%lu\n", lc->name, lc->used, lc->nr_elements, lc->hits, lc->misses, lc->starving, lc->locked, lc->changed); - - return 0; } static struct hlist_head *lc_hash_slot(struct lru_cache *lc, unsigned int enr) diff --git a/lib/ratelimit.c b/lib/ratelimit.c index 40e03ea2a967..2c5de86460c5 100644 --- a/lib/ratelimit.c +++ b/lib/ratelimit.c @@ -49,7 +49,7 @@ int ___ratelimit(struct ratelimit_state *rs, const char *func) if (rs->missed) printk(KERN_WARNING "%s: %d callbacks suppressed\n", func, rs->missed); - rs->begin = 0; + rs->begin = jiffies; rs->printed = 0; rs->missed = 0; } diff --git a/mm/filemap.c b/mm/filemap.c index 847ee43c2806..bc943867d68c 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -11,6 +11,7 @@ */ #include <linux/export.h> #include <linux/compiler.h> +#include <linux/dax.h> #include <linux/fs.h> #include <linux/uaccess.h> #include <linux/capability.h> @@ -123,9 +124,9 @@ static void page_cache_tree_delete(struct address_space *mapping, __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot); if (shadow) { - mapping->nrshadows++; + 
mapping->nrexceptional++; /* - * Make sure the nrshadows update is committed before + * Make sure the nrexceptional update is committed before * the nrpages update so that final truncate racing * with reclaim does not see both counters 0 at the * same time and miss a shadow entry. @@ -481,6 +482,12 @@ int filemap_write_and_wait_range(struct address_space *mapping, { int err = 0; + if (dax_mapping(mapping) && mapping->nrexceptional) { + err = dax_writeback_mapping_range(mapping, lstart, lend); + if (err) + return err; + } + if (mapping->nrpages) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); @@ -579,9 +586,13 @@ static int page_cache_tree_insert(struct address_space *mapping, p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); if (!radix_tree_exceptional_entry(p)) return -EEXIST; + + if (WARN_ON(dax_mapping(mapping))) + return -EINVAL; + if (shadowp) *shadowp = p; - mapping->nrshadows--; + mapping->nrexceptional--; if (node) workingset_node_shadows_dec(node); } @@ -1245,9 +1256,9 @@ repeat: if (radix_tree_deref_retry(page)) goto restart; /* - * A shadow entry of a recently evicted page, - * or a swap entry from shmem/tmpfs. Return - * it without attempting to raise page count. + * A shadow entry of a recently evicted page, a swap + * entry from shmem/tmpfs or a DAX entry. Return it + * without attempting to raise page count. */ goto export; } @@ -1494,6 +1505,74 @@ repeat: } EXPORT_SYMBOL(find_get_pages_tag); +/** + * find_get_entries_tag - find and return entries that match @tag + * @mapping: the address_space to search + * @start: the starting page cache index + * @tag: the tag index + * @nr_entries: the maximum number of entries + * @entries: where the resulting entries are placed + * @indices: the cache indices corresponding to the entries in @entries + * + * Like find_get_entries, except we only return entries which are tagged with + * @tag. + */ +unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, + int tag, unsigned int nr_entries, + struct page **entries, pgoff_t *indices) +{ + void **slot; + unsigned int ret = 0; + struct radix_tree_iter iter; + + if (!nr_entries) + return 0; + + rcu_read_lock(); +restart: + radix_tree_for_each_tagged(slot, &mapping->page_tree, + &iter, start, tag) { + struct page *page; +repeat: + page = radix_tree_deref_slot(slot); + if (unlikely(!page)) + continue; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + /* + * Transient condition which can only trigger + * when entry at index 0 moves out of or back + * to root: none yet gotten, safe to restart. + */ + goto restart; + } + + /* + * A shadow entry of a recently evicted page, a swap + * entry from shmem/tmpfs or a DAX entry. Return it + * without attempting to raise page count. + */ + goto export; + } + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *slot)) { + page_cache_release(page); + goto repeat; + } +export: + indices[ret] = iter.index; + entries[ret] = page; + if (++ret == nr_entries) + break; + } + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL(find_get_entries_tag); + /* * CD/DVDs are error prone. When a medium error occurs, the driver may fail * a _large_ part of the i/o request. 
Imagine the worst scenario: @@ -2684,11 +2763,11 @@ ssize_t generic_file_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *inode = file->f_mapping->host; ssize_t ret; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = generic_write_checks(iocb, from); if (ret > 0) ret = __generic_file_write_iter(iocb, from); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if (ret > 0) { ssize_t err; diff --git a/mm/huge_memory.c b/mm/huge_memory.c index 8ad580273521..fd3a07b3e6f4 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -1560,7 +1560,8 @@ int madvise_free_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, struct mm_struct *mm = tlb->mm; int ret = 0; - if (!pmd_trans_huge_lock(pmd, vma, &ptl)) + ptl = pmd_trans_huge_lock(pmd, vma); + if (!ptl) goto out_unlocked; orig_pmd = *pmd; @@ -1627,7 +1628,8 @@ int zap_huge_pmd(struct mmu_gather *tlb, struct vm_area_struct *vma, pmd_t orig_pmd; spinlock_t *ptl; - if (!__pmd_trans_huge_lock(pmd, vma, &ptl)) + ptl = __pmd_trans_huge_lock(pmd, vma); + if (!ptl) return 0; /* * For architectures like ppc64 we look at deposited pgtable @@ -1690,7 +1692,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma, * We don't have to worry about the ordering of src and dst * ptlocks because exclusive mmap_sem prevents deadlock. */ - if (__pmd_trans_huge_lock(old_pmd, vma, &old_ptl)) { + old_ptl = __pmd_trans_huge_lock(old_pmd, vma); + if (old_ptl) { new_ptl = pmd_lockptr(mm, new_pmd); if (new_ptl != old_ptl) spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING); @@ -1724,7 +1727,8 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, spinlock_t *ptl; int ret = 0; - if (__pmd_trans_huge_lock(pmd, vma, &ptl)) { + ptl = __pmd_trans_huge_lock(pmd, vma); + if (ptl) { pmd_t entry; bool preserve_write = prot_numa && pmd_write(*pmd); ret = 1; @@ -1760,14 +1764,14 @@ int change_huge_pmd(struct vm_area_struct *vma, pmd_t *pmd, * Note that if it returns true, this routine returns without unlocking page * table lock. So callers must unlock it. 
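 * (With this patch the lock is conveyed through the return value instead: the function returns the held spinlock_t pointer on success and NULL when the pmd is not huge, as the hunk below shows.)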
*/ -bool __pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma, - spinlock_t **ptl) +spinlock_t *__pmd_trans_huge_lock(pmd_t *pmd, struct vm_area_struct *vma) { - *ptl = pmd_lock(vma->vm_mm, pmd); + spinlock_t *ptl; + ptl = pmd_lock(vma->vm_mm, pmd); if (likely(pmd_trans_huge(*pmd) || pmd_devmap(*pmd))) - return true; - spin_unlock(*ptl); - return false; + return ptl; + spin_unlock(ptl); + return NULL; } #define VM_NO_THP (VM_SPECIAL | VM_HUGETLB | VM_SHARED | VM_MAYSHARE) @@ -2068,7 +2072,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, if (likely(writable)) { if (likely(referenced)) { result = SCAN_SUCCEED; - trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero, + trace_mm_collapse_huge_page_isolate(page, none_or_zero, referenced, writable, result); return 1; } @@ -2078,7 +2082,7 @@ static int __collapse_huge_page_isolate(struct vm_area_struct *vma, out: release_pte_pages(pte, _pte); - trace_mm_collapse_huge_page_isolate(page_to_pfn(page), none_or_zero, + trace_mm_collapse_huge_page_isolate(page, none_or_zero, referenced, writable, result); return 0; } @@ -2576,7 +2580,7 @@ out_unmap: collapse_huge_page(mm, address, hpage, vma, node); } out: - trace_mm_khugepaged_scan_pmd(mm, page_to_pfn(page), writable, referenced, + trace_mm_khugepaged_scan_pmd(mm, page, writable, referenced, none_or_zero, result); return ret; } diff --git a/mm/memcontrol.c b/mm/memcontrol.c index ca052f2a4a0b..d06cae2de783 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4638,7 +4638,8 @@ static int mem_cgroup_count_precharge_pte_range(pmd_t *pmd, pte_t *pte; spinlock_t *ptl; - if (pmd_trans_huge_lock(pmd, vma, &ptl)) { + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { if (get_mctgt_type_thp(vma, addr, *pmd, NULL) == MC_TARGET_PAGE) mc.precharge += HPAGE_PMD_NR; spin_unlock(ptl); @@ -4826,7 +4827,8 @@ static int mem_cgroup_move_charge_pte_range(pmd_t *pmd, union mc_target target; struct page *page; - if (pmd_trans_huge_lock(pmd, vma, &ptl)) { + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { if (mc.precharge < HPAGE_PMD_NR) { spin_unlock(ptl); return 0; diff --git a/mm/mincore.c b/mm/mincore.c index 2a565ed8bb49..563f32045490 100644 --- a/mm/mincore.c +++ b/mm/mincore.c @@ -117,7 +117,8 @@ static int mincore_pte_range(pmd_t *pmd, unsigned long addr, unsigned long end, unsigned char *vec = walk->private; int nr = (end - addr) >> PAGE_SHIFT; - if (pmd_trans_huge_lock(pmd, vma, &ptl)) { + ptl = pmd_trans_huge_lock(pmd, vma); + if (ptl) { memset(vec, 1, nr); spin_unlock(ptl); goto out; diff --git a/mm/mlock.c b/mm/mlock.c index e1e2b1207bf2..96f001041928 100644 --- a/mm/mlock.c +++ b/mm/mlock.c @@ -175,7 +175,7 @@ static void __munlock_isolation_failed(struct page *page) */ unsigned int munlock_vma_page(struct page *page) { - unsigned int nr_pages; + int nr_pages; struct zone *zone = page_zone(page); /* For try_to_munlock() and to serialize with page migration */ diff --git a/mm/percpu.c b/mm/percpu.c index 8a943b97a053..998607adf6eb 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -305,16 +305,12 @@ static void *pcpu_mem_zalloc(size_t size) /** * pcpu_mem_free - free memory * @ptr: memory to free - * @size: size of the area * * Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc(). 
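 * (kvfree() below dispatches to kfree() or vfree() as appropriate, which is why the old @size hint can be dropped.)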
*/ -static void pcpu_mem_free(void *ptr, size_t size) +static void pcpu_mem_free(void *ptr) { - if (size <= PAGE_SIZE) - kfree(ptr); - else - vfree(ptr); + kvfree(ptr); } /** @@ -463,8 +459,8 @@ out_unlock: * pcpu_mem_free() might end up calling vfree() which uses * IRQ-unsafe lock and thus can't be called under pcpu_lock. */ - pcpu_mem_free(old, old_size); - pcpu_mem_free(new, new_size); + pcpu_mem_free(old); + pcpu_mem_free(new); return 0; } @@ -732,7 +728,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); if (!chunk->map) { - pcpu_mem_free(chunk, pcpu_chunk_struct_size); + pcpu_mem_free(chunk); return NULL; } @@ -753,8 +749,8 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; - pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); - pcpu_mem_free(chunk, pcpu_chunk_struct_size); + pcpu_mem_free(chunk->map); + pcpu_mem_free(chunk); } /** diff --git a/mm/shmem.c b/mm/shmem.c index fa2ceb2d2655..440e2a7e6c1c 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -701,8 +701,7 @@ static void shmem_evict_inode(struct inode *inode) list_del_init(&info->swaplist); mutex_unlock(&shmem_swaplist_mutex); } - } else - kfree(info->symlink); + } simple_xattrs_free(&info->xattrs); WARN_ON(inode->i_blocks); @@ -1902,7 +1901,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) if (whence != SEEK_DATA && whence != SEEK_HOLE) return generic_file_llseek_size(file, offset, whence, MAX_LFS_FILESIZE, i_size_read(inode)); - mutex_lock(&inode->i_mutex); + inode_lock(inode); /* We're holding i_mutex so we can access i_size directly */ if (offset < 0) @@ -1926,7 +1925,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) if (offset >= 0) offset = vfs_setpos(file, offset, MAX_LFS_FILESIZE); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return offset; } @@ -2091,7 +2090,7 @@ int shmem_add_seals(struct file *file, unsigned int seals) if (seals & ~(unsigned int)F_ALL_SEALS) return -EINVAL; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (info->seals & F_SEAL_SEAL) { error = -EPERM; @@ -2114,7 +2113,7 @@ int shmem_add_seals(struct file *file, unsigned int seals) error = 0; unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } EXPORT_SYMBOL_GPL(shmem_add_seals); @@ -2164,7 +2163,7 @@ static long shmem_fallocate(struct file *file, int mode, loff_t offset, if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE)) return -EOPNOTSUPP; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (mode & FALLOC_FL_PUNCH_HOLE) { struct address_space *mapping = file->f_mapping; @@ -2277,7 +2276,7 @@ undone: inode->i_private = NULL; spin_unlock(&inode->i_lock); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } @@ -2549,13 +2548,12 @@ static int shmem_symlink(struct inode *dir, struct dentry *dentry, const char *s info = SHMEM_I(inode); inode->i_size = len-1; if (len <= SHORT_SYMLINK_LEN) { - info->symlink = kmemdup(symname, len, GFP_KERNEL); - if (!info->symlink) { + inode->i_link = kmemdup(symname, len, GFP_KERNEL); + if (!inode->i_link) { iput(inode); return -ENOMEM; } inode->i_op = &shmem_short_symlink_operations; - inode->i_link = info->symlink; } else { inode_nohighmem(inode); error = shmem_getpage(inode, 0, &page, SGP_WRITE, NULL); @@ -3132,6 +3130,7 @@ static struct inode *shmem_alloc_inode(struct super_block *sb) static void shmem_destroy_callback(struct rcu_head *head) { struct inode *inode = 
container_of(head, struct inode, i_rcu); + kfree(inode->i_link); kmem_cache_free(shmem_inode_cachep, SHMEM_I(inode)); } diff --git a/mm/swapfile.c b/mm/swapfile.c index c43f654a7b64..d2c37365e2d6 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -1956,9 +1956,9 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile) set_blocksize(bdev, old_block_size); blkdev_put(bdev, FMODE_READ | FMODE_WRITE | FMODE_EXCL); } else { - mutex_lock(&inode->i_mutex); + inode_lock(inode); inode->i_flags &= ~S_SWAPFILE; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } filp_close(swap_file, NULL); @@ -2183,7 +2183,7 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) p->flags |= SWP_BLKDEV; } else if (S_ISREG(inode->i_mode)) { p->bdev = inode->i_sb->s_bdev; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (IS_SWAPFILE(inode)) return -EBUSY; } else @@ -2416,7 +2416,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) mapping = swap_file->f_mapping; inode = mapping->host; - /* If S_ISREG(inode->i_mode) will do mutex_lock(&inode->i_mutex); */ + /* If S_ISREG(inode->i_mode) will do inode_lock(inode); */ error = claim_swapfile(p, inode); if (unlikely(error)) goto bad_swap; @@ -2561,7 +2561,7 @@ bad_swap: vfree(cluster_info); if (swap_file) { if (inode && S_ISREG(inode->i_mode)) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); inode = NULL; } filp_close(swap_file, NULL); @@ -2574,7 +2574,7 @@ out: if (name) putname(name); if (inode && S_ISREG(inode->i_mode)) - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return error; } diff --git a/mm/truncate.c b/mm/truncate.c index 76e35ad97102..e3ee0e27cd17 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -9,6 +9,7 @@ #include <linux/kernel.h> #include <linux/backing-dev.h> +#include <linux/dax.h> #include <linux/gfp.h> #include <linux/mm.h> #include <linux/swap.h> @@ -34,31 +35,39 @@ static void clear_exceptional_entry(struct address_space *mapping, return; spin_lock_irq(&mapping->tree_lock); - /* - * Regular page slots are stabilized by the page lock even - * without the tree itself locked. These unlocked entries - * need verification under the tree lock. - */ - if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) - goto unlock; - if (*slot != entry) - goto unlock; - radix_tree_replace_slot(slot, NULL); - mapping->nrshadows--; - if (!node) - goto unlock; - workingset_node_shadows_dec(node); - /* - * Don't track node without shadow entries. - * - * Avoid acquiring the list_lru lock if already untracked. - * The list_empty() test is safe as node->private_list is - * protected by mapping->tree_lock. - */ - if (!workingset_node_shadows(node) && - !list_empty(&node->private_list)) - list_lru_del(&workingset_shadow_nodes, &node->private_list); - __radix_tree_delete_node(&mapping->page_tree, node); + + if (dax_mapping(mapping)) { + if (radix_tree_delete_item(&mapping->page_tree, index, entry)) + mapping->nrexceptional--; + } else { + /* + * Regular page slots are stabilized by the page lock even + * without the tree itself locked. These unlocked entries + * need verification under the tree lock. + */ + if (!__radix_tree_lookup(&mapping->page_tree, index, &node, + &slot)) + goto unlock; + if (*slot != entry) + goto unlock; + radix_tree_replace_slot(slot, NULL); + mapping->nrexceptional--; + if (!node) + goto unlock; + workingset_node_shadows_dec(node); + /* + * Don't track node without shadow entries. + * + * Avoid acquiring the list_lru lock if already untracked. 
+ * The list_empty() test is safe as node->private_list is + * protected by mapping->tree_lock. + */ + if (!workingset_node_shadows(node) && + !list_empty(&node->private_list)) + list_lru_del(&workingset_shadow_nodes, + &node->private_list); + __radix_tree_delete_node(&mapping->page_tree, node); + } unlock: spin_unlock_irq(&mapping->tree_lock); } @@ -228,7 +237,7 @@ void truncate_inode_pages_range(struct address_space *mapping, int i; cleancache_invalidate_inode(mapping); - if (mapping->nrpages == 0 && mapping->nrshadows == 0) + if (mapping->nrpages == 0 && mapping->nrexceptional == 0) return; /* Offsets within partial pages */ @@ -402,7 +411,7 @@ EXPORT_SYMBOL(truncate_inode_pages); */ void truncate_inode_pages_final(struct address_space *mapping) { - unsigned long nrshadows; + unsigned long nrexceptional; unsigned long nrpages; /* @@ -416,14 +425,14 @@ void truncate_inode_pages_final(struct address_space *mapping) /* * When reclaim installs eviction entries, it increases - * nrshadows first, then decreases nrpages. Make sure we see + * nrexceptional first, then decreases nrpages. Make sure we see * this in the right order or we might miss an entry. */ nrpages = mapping->nrpages; smp_rmb(); - nrshadows = mapping->nrshadows; + nrexceptional = mapping->nrexceptional; - if (nrpages || nrshadows) { + if (nrpages || nrexceptional) { /* * As truncation uses a lockless tree lookup, cycle * the tree lock to make sure any ongoing tree diff --git a/mm/vmscan.c b/mm/vmscan.c index bd620b65db52..eb3dd37ccd7c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -46,6 +46,7 @@ #include <linux/oom.h> #include <linux/prefetch.h> #include <linux/printk.h> +#include <linux/dax.h> #include <asm/tlbflush.h> #include <asm/div64.h> @@ -671,9 +672,15 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * inode reclaim needs to empty out the radix tree or * the nodes are lost. Don't plant shadows behind its * back. + * + * We also don't store shadows for DAX mappings because the + * only page cache pages found in these are zero pages + * covering holes, and because we don't want to mix DAX + * exceptional entries and shadow exceptional entries in the + * same page_tree. */ if (reclaimed && page_is_file_cache(page) && - !mapping_exiting(mapping)) + !mapping_exiting(mapping) && !dax_mapping(mapping)) shadow = workingset_eviction(mapping, page); __delete_from_page_cache(page, shadow, memcg); spin_unlock_irqrestore(&mapping->tree_lock, flags); diff --git a/mm/vmstat.c b/mm/vmstat.c index 64bd0aa13f75..40b2c74ddf16 100644 --- a/mm/vmstat.c +++ b/mm/vmstat.c @@ -1408,17 +1408,7 @@ static void vmstat_update(struct work_struct *w) * Defer the checking for differentials to the * shepherd thread on a different processor. */ - int r; - /* - * Shepherd work thread does not race since it never - * changes the bit if its zero but the cpu - * online / off line code may race if - * worker threads are still allowed during - * shutdown / startup. 
- */ - r = cpumask_test_and_set_cpu(smp_processor_id(), - cpu_stat_off); - VM_BUG_ON(r); + cpumask_set_cpu(smp_processor_id(), cpu_stat_off); } } diff --git a/mm/workingset.c b/mm/workingset.c index aa017133744b..61ead9e5549d 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -351,8 +351,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, node->slots[i] = NULL; BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT)); node->count -= 1U << RADIX_TREE_COUNT_SHIFT; - BUG_ON(!mapping->nrshadows); - mapping->nrshadows--; + BUG_ON(!mapping->nrexceptional); + mapping->nrexceptional--; } } BUG_ON(node->count); diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c index bced8c074c12..7bc2208b6cc4 100644 --- a/net/9p/trans_fd.c +++ b/net/9p/trans_fd.c @@ -108,9 +108,7 @@ struct p9_poll_wait { * @unsent_req_list: accounting for requests that haven't been sent * @req: current request being processed (if any) * @tmp_buf: temporary buffer to read in header - * @rsize: amount to read for current frame - * @rpos: read position in current frame - * @rbuf: current read buffer + * @rc: temporary fcall for reading current frame * @wpos: write position for current frame * @wsize: amount of data to write for current frame * @wbuf: current write buffer @@ -131,9 +129,7 @@ struct p9_conn { struct list_head unsent_req_list; struct p9_req_t *req; char tmp_buf[7]; - int rsize; - int rpos; - char *rbuf; + struct p9_fcall rc; int wpos; int wsize; char *wbuf; @@ -305,69 +301,77 @@ static void p9_read_work(struct work_struct *work) if (m->err < 0) return; - p9_debug(P9_DEBUG_TRANS, "start mux %p pos %d\n", m, m->rpos); + p9_debug(P9_DEBUG_TRANS, "start mux %p pos %zd\n", m, m->rc.offset); - if (!m->rbuf) { - m->rbuf = m->tmp_buf; - m->rpos = 0; - m->rsize = 7; /* start by reading header */ + if (!m->rc.sdata) { + m->rc.sdata = m->tmp_buf; + m->rc.offset = 0; + m->rc.capacity = 7; /* start by reading header */ } clear_bit(Rpending, &m->wsched); - p9_debug(P9_DEBUG_TRANS, "read mux %p pos %d size: %d = %d\n", - m, m->rpos, m->rsize, m->rsize-m->rpos); - err = p9_fd_read(m->client, m->rbuf + m->rpos, - m->rsize - m->rpos); + p9_debug(P9_DEBUG_TRANS, "read mux %p pos %zd size: %zd = %zd\n", + m, m->rc.offset, m->rc.capacity, + m->rc.capacity - m->rc.offset); + err = p9_fd_read(m->client, m->rc.sdata + m->rc.offset, + m->rc.capacity - m->rc.offset); p9_debug(P9_DEBUG_TRANS, "mux %p got %d bytes\n", m, err); - if (err == -EAGAIN) { + if (err == -EAGAIN) goto end_clear; - } if (err <= 0) goto error; - m->rpos += err; + m->rc.offset += err; - if ((!m->req) && (m->rpos == m->rsize)) { /* header read in */ - u16 tag; + /* header read in */ + if ((!m->req) && (m->rc.offset == m->rc.capacity)) { p9_debug(P9_DEBUG_TRANS, "got new header\n"); - n = le32_to_cpu(*(__le32 *) m->rbuf); /* read packet size */ - if (n >= m->client->msize) { + err = p9_parse_header(&m->rc, NULL, NULL, NULL, 0); + if (err) { + p9_debug(P9_DEBUG_ERROR, + "error parsing header: %d\n", err); + goto error; + } + + if (m->rc.size >= m->client->msize) { p9_debug(P9_DEBUG_ERROR, - "requested packet size too big: %d\n", n); + "requested packet size too big: %d\n", + m->rc.size); err = -EIO; goto error; } - tag = le16_to_cpu(*(__le16 *) (m->rbuf+5)); /* read tag */ p9_debug(P9_DEBUG_TRANS, - "mux %p pkt: size: %d bytes tag: %d\n", m, n, tag); + "mux %p pkt: size: %d bytes tag: %d\n", + m, m->rc.size, m->rc.tag); - m->req = p9_tag_lookup(m->client, tag); + m->req = p9_tag_lookup(m->client, m->rc.tag); if (!m->req || (m->req->status != REQ_STATUS_SENT)) 
{ p9_debug(P9_DEBUG_ERROR, "Unexpected packet tag %d\n", - tag); + m->rc.tag); err = -EIO; goto error; } if (m->req->rc == NULL) { - m->req->rc = kmalloc(sizeof(struct p9_fcall) + - m->client->msize, GFP_NOFS); - if (!m->req->rc) { - m->req = NULL; - err = -ENOMEM; - goto error; - } + p9_debug(P9_DEBUG_ERROR, + "No recv fcall for tag %d (req %p), disconnecting!\n", + m->rc.tag, m->req); + m->req = NULL; + err = -EIO; + goto error; } - m->rbuf = (char *)m->req->rc + sizeof(struct p9_fcall); - memcpy(m->rbuf, m->tmp_buf, m->rsize); - m->rsize = n; + m->rc.sdata = (char *)m->req->rc + sizeof(struct p9_fcall); + memcpy(m->rc.sdata, m->tmp_buf, m->rc.capacity); + m->rc.capacity = m->rc.size; } - /* not an else because some packets (like clunk) have no payload */ - if ((m->req) && (m->rpos == m->rsize)) { /* packet is read in */ + /* packet is read in + * not an else because some packets (like clunk) have no payload + */ + if ((m->req) && (m->rc.offset == m->rc.capacity)) { p9_debug(P9_DEBUG_TRANS, "got new packet\n"); spin_lock(&m->client->lock); if (m->req->status != REQ_STATUS_ERROR) @@ -375,9 +379,9 @@ static void p9_read_work(struct work_struct *work) list_del(&m->req->req_list); spin_unlock(&m->client->lock); p9_client_cb(m->client, m->req, status); - m->rbuf = NULL; - m->rpos = 0; - m->rsize = 0; + m->rc.sdata = NULL; + m->rc.offset = 0; + m->rc.capacity = 0; m->req = NULL; } diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c index 199bc76202d2..4acb1d5417aa 100644 --- a/net/9p/trans_virtio.c +++ b/net/9p/trans_virtio.c @@ -658,7 +658,7 @@ p9_virtio_create(struct p9_client *client, const char *devname, char *args) mutex_unlock(&virtio_9p_lock); if (!found) { - pr_err("no channels available\n"); + pr_err("no channels available for device %s\n", devname); return ret; } diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c index 10d87753ed87..9e43a315e662 100644 --- a/net/ceph/auth_x.c +++ b/net/ceph/auth_x.c @@ -152,7 +152,6 @@ static int process_one_ticket(struct ceph_auth_client *ac, void *ticket_buf = NULL; void *tp, *tpend; void **ptp; - struct ceph_timespec new_validity; struct ceph_crypto_key new_session_key; struct ceph_buffer *new_ticket_blob; unsigned long new_expires, new_renew_after; @@ -193,8 +192,8 @@ static int process_one_ticket(struct ceph_auth_client *ac, if (ret) goto out; - ceph_decode_copy(&dp, &new_validity, sizeof(new_validity)); - ceph_decode_timespec(&validity, &new_validity); + ceph_decode_timespec(&validity, dp); + dp += sizeof(struct ceph_timespec); new_expires = get_seconds() + validity.tv_sec; new_renew_after = new_expires - (validity.tv_sec / 4); dout(" expires=%lu renew_after=%lu\n", new_expires, @@ -233,10 +232,10 @@ static int process_one_ticket(struct ceph_auth_client *ac, ceph_buffer_put(th->ticket_blob); th->session_key = new_session_key; th->ticket_blob = new_ticket_blob; - th->validity = new_validity; th->secret_id = new_secret_id; th->expires = new_expires; th->renew_after = new_renew_after; + th->have_key = true; dout(" got ticket service %d (%s) secret_id %lld len %d\n", type, ceph_entity_type_name(type), th->secret_id, (int)th->ticket_blob->vec.iov_len); @@ -384,6 +383,24 @@ bad: return -ERANGE; } +static bool need_key(struct ceph_x_ticket_handler *th) +{ + if (!th->have_key) + return true; + + return get_seconds() >= th->renew_after; +} + +static bool have_key(struct ceph_x_ticket_handler *th) +{ + if (th->have_key) { + if (get_seconds() >= th->expires) + th->have_key = false; + } + + return th->have_key; +} + static void 
ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed) { int want = ac->want_keys; @@ -402,20 +419,18 @@ static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed) continue; th = get_ticket_handler(ac, service); - if (IS_ERR(th)) { *pneed |= service; continue; } - if (get_seconds() >= th->renew_after) + if (need_key(th)) *pneed |= service; - if (get_seconds() >= th->expires) + if (!have_key(th)) xi->have_keys &= ~service; } } - static int ceph_x_build_request(struct ceph_auth_client *ac, void *buf, void *end) { @@ -667,14 +682,26 @@ static void ceph_x_destroy(struct ceph_auth_client *ac) ac->private = NULL; } -static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, - int peer_type) +static void invalidate_ticket(struct ceph_auth_client *ac, int peer_type) { struct ceph_x_ticket_handler *th; th = get_ticket_handler(ac, peer_type); if (!IS_ERR(th)) - memset(&th->validity, 0, sizeof(th->validity)); + th->have_key = false; +} + +static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac, + int peer_type) +{ + /* + * We are to invalidate a service ticket in the hopes of + * getting a new, hopefully more valid, one. But, we won't get + * it unless our AUTH ticket is good, so invalidate AUTH ticket + * as well, just in case. + */ + invalidate_ticket(ac, peer_type); + invalidate_ticket(ac, CEPH_ENTITY_TYPE_AUTH); } static int calcu_signature(struct ceph_x_authorizer *au, diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h index e8b7c6917d47..40b1a3cf7397 100644 --- a/net/ceph/auth_x.h +++ b/net/ceph/auth_x.h @@ -16,7 +16,7 @@ struct ceph_x_ticket_handler { unsigned int service; struct ceph_crypto_key session_key; - struct ceph_timespec validity; + bool have_key; u64 secret_id; struct ceph_buffer *ticket_blob; diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 9981039ef4ff..9cfedf565f5b 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -23,9 +23,6 @@ #include <linux/ceph/pagelist.h> #include <linux/export.h> -#define list_entry_next(pos, member) \ - list_entry(pos->member.next, typeof(*pos), member) - /* * Ceph uses the messenger to exchange ceph_msg messages with other * hosts in the system. The messenger provides ordered and reliable @@ -672,6 +669,8 @@ static void reset_connection(struct ceph_connection *con) } con->in_seq = 0; con->in_seq_acked = 0; + + con->out_skip = 0; } /* @@ -771,6 +770,8 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt) static void con_out_kvec_reset(struct ceph_connection *con) { + BUG_ON(con->out_skip); + con->out_kvec_left = 0; con->out_kvec_bytes = 0; con->out_kvec_cur = &con->out_kvec[0]; @@ -779,9 +780,9 @@ static void con_out_kvec_reset(struct ceph_connection *con) static void con_out_kvec_add(struct ceph_connection *con, size_t size, void *data) { - int index; + int index = con->out_kvec_left; - index = con->out_kvec_left; + BUG_ON(con->out_skip); BUG_ON(index >= ARRAY_SIZE(con->out_kvec)); con->out_kvec[index].iov_len = size; @@ -790,6 +791,27 @@ static void con_out_kvec_add(struct ceph_connection *con, con->out_kvec_bytes += size; } +/* + * Chop off a kvec from the end. Return residual number of bytes for + * that kvec, i.e. how many bytes would have been written if the kvec + * hadn't been nuked. 
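+ * (This helper is used by ceph_msg_revoke() further down to turn already-queued kvecs into a byte count for out_skip.)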
+ */ +static int con_out_kvec_skip(struct ceph_connection *con) +{ + int off = con->out_kvec_cur - con->out_kvec; + int skip = 0; + + if (con->out_kvec_bytes > 0) { + skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len; + BUG_ON(con->out_kvec_bytes < skip); + BUG_ON(!con->out_kvec_left); + con->out_kvec_bytes -= skip; + con->out_kvec_left--; + } + + return skip; +} + #ifdef CONFIG_BLOCK /* @@ -1042,7 +1064,7 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor, /* Move on to the next page */ BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head)); - cursor->page = list_entry_next(cursor->page, lru); + cursor->page = list_next_entry(cursor->page, lru); cursor->last_piece = cursor->resid <= PAGE_SIZE; return true; @@ -1166,7 +1188,7 @@ static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor, if (!cursor->resid && cursor->total_resid) { WARN_ON(!cursor->last_piece); BUG_ON(list_is_last(&cursor->data->links, cursor->data_head)); - cursor->data = list_entry_next(cursor->data, links); + cursor->data = list_next_entry(cursor->data, links); __ceph_msg_data_cursor_init(cursor); new_piece = true; } @@ -1197,7 +1219,6 @@ static void prepare_write_message_footer(struct ceph_connection *con) m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; dout("prepare_write_message_footer %p\n", con); - con->out_kvec_is_msg = true; con->out_kvec[v].iov_base = &m->footer; if (con->peer_features & CEPH_FEATURE_MSG_AUTH) { if (con->ops->sign_message) @@ -1225,7 +1246,6 @@ static void prepare_write_message(struct ceph_connection *con) u32 crc; con_out_kvec_reset(con); - con->out_kvec_is_msg = true; con->out_msg_done = false; /* Sneak an ack in there first? If we can get it into the same @@ -1265,18 +1285,19 @@ static void prepare_write_message(struct ceph_connection *con) /* tag + hdr + front + middle */ con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); - con_out_kvec_add(con, sizeof (m->hdr), &m->hdr); + con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr); con_out_kvec_add(con, m->front.iov_len, m->front.iov_base); if (m->middle) con_out_kvec_add(con, m->middle->vec.iov_len, m->middle->vec.iov_base); - /* fill in crc (except data pages), footer */ + /* fill in hdr crc and finalize hdr */ crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc)); con->out_msg->hdr.crc = cpu_to_le32(crc); - con->out_msg->footer.flags = 0; + memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr)); + /* fill in front and middle crc, footer */ crc = crc32c(0, m->front.iov_base, m->front.iov_len); con->out_msg->footer.front_crc = cpu_to_le32(crc); if (m->middle) { @@ -1288,6 +1309,7 @@ static void prepare_write_message(struct ceph_connection *con) dout("%s front_crc %u middle_crc %u\n", __func__, le32_to_cpu(con->out_msg->footer.front_crc), le32_to_cpu(con->out_msg->footer.middle_crc)); + con->out_msg->footer.flags = 0; /* is there a data payload? */ con->out_msg->footer.data_crc = 0; @@ -1492,7 +1514,6 @@ static int write_partial_kvec(struct ceph_connection *con) } } con->out_kvec_left = 0; - con->out_kvec_is_msg = false; ret = 1; out: dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con, @@ -1584,6 +1605,7 @@ static int write_partial_skip(struct ceph_connection *con) { int ret; + dout("%s %p %d left\n", __func__, con, con->out_skip); while (con->out_skip > 0) { size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE); @@ -2506,13 +2528,13 @@ more: more_kvec: /* kvec data queued? 
*/ - if (con->out_skip) { - ret = write_partial_skip(con); + if (con->out_kvec_left) { + ret = write_partial_kvec(con); if (ret <= 0) goto out; } - if (con->out_kvec_left) { - ret = write_partial_kvec(con); + if (con->out_skip) { + ret = write_partial_skip(con); if (ret <= 0) goto out; } @@ -2805,13 +2827,17 @@ static bool con_backoff(struct ceph_connection *con) static void con_fault_finish(struct ceph_connection *con) { + dout("%s %p\n", __func__, con); + /* * in case we faulted due to authentication, invalidate our * current tickets so that we can get new ones. */ - if (con->auth_retry && con->ops->invalidate_authorizer) { - dout("calling invalidate_authorizer()\n"); - con->ops->invalidate_authorizer(con); + if (con->auth_retry) { + dout("auth_retry %d, invalidating\n", con->auth_retry); + if (con->ops->invalidate_authorizer) + con->ops->invalidate_authorizer(con); + con->auth_retry = 0; } if (con->ops->fault) @@ -3050,16 +3076,31 @@ void ceph_msg_revoke(struct ceph_msg *msg) ceph_msg_put(msg); } if (con->out_msg == msg) { - dout("%s %p msg %p - was sending\n", __func__, con, msg); - con->out_msg = NULL; - if (con->out_kvec_is_msg) { - con->out_skip = con->out_kvec_bytes; - con->out_kvec_is_msg = false; + BUG_ON(con->out_skip); + /* footer */ + if (con->out_msg_done) { + con->out_skip += con_out_kvec_skip(con); + } else { + BUG_ON(!msg->data_length); + if (con->peer_features & CEPH_FEATURE_MSG_AUTH) + con->out_skip += sizeof(msg->footer); + else + con->out_skip += sizeof(msg->old_footer); } + /* data, middle, front */ + if (msg->data_length) + con->out_skip += msg->cursor.total_resid; + if (msg->middle) + con->out_skip += con_out_kvec_skip(con); + con->out_skip += con_out_kvec_skip(con); + + dout("%s %p msg %p - was sending, will write %d skip %d\n", + __func__, con, msg, con->out_kvec_bytes, con->out_skip); msg->hdr.seq = 0; - + con->out_msg = NULL; ceph_msg_put(msg); } + mutex_unlock(&con->mutex); } @@ -3361,9 +3402,7 @@ static void ceph_msg_free(struct ceph_msg *m) static void ceph_msg_release(struct kref *kref) { struct ceph_msg *m = container_of(kref, struct ceph_msg, kref); - LIST_HEAD(data); - struct list_head *links; - struct list_head *next; + struct ceph_msg_data *data, *next; dout("%s %p\n", __func__, m); WARN_ON(!list_empty(&m->list_head)); @@ -3376,12 +3415,8 @@ static void ceph_msg_release(struct kref *kref) m->middle = NULL; } - list_splice_init(&m->data, &data); - list_for_each_safe(links, next, &data) { - struct ceph_msg_data *data; - - data = list_entry(links, struct ceph_msg_data, links); - list_del_init(links); + list_for_each_entry_safe(data, next, &m->data, links) { + list_del_init(&data->links); ceph_msg_data_destroy(data); } m->data_length = 0; diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c index edda01626a45..de85dddc3dc0 100644 --- a/net/ceph/mon_client.c +++ b/net/ceph/mon_client.c @@ -364,10 +364,6 @@ static bool have_debugfs_info(struct ceph_mon_client *monc) return monc->client->have_fsid && monc->auth->global_id > 0; } -/* - * The monitor responds with mount ack indicate mount success. The - * included client ticket allows the client to talk to MDSs and OSDs. 
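/*
 * Aside (illustrative, not part of the patch): the ceph_msg_revoke() rework
 * above has to keep the byte stream framed -- once part of a message is on
 * the wire, the remaining front/middle/data/footer bytes must still be
 * emitted (as zeros) before the next message. A sketch of that accounting;
 * field names are stand-ins (the real footer size depends on whether the
 * peer speaks CEPH_FEATURE_MSG_AUTH).
 */
#include <stdbool.h>
#include <stddef.h>

struct revoke_state {
	size_t kvec_residual;  /* unsent bytes still queued as kvecs    */
	size_t data_residual;  /* unsent data-payload bytes             */
	size_t footer_len;     /* footer size negotiated with the peer  */
	bool   footer_queued;  /* footer already sits in the kvec array */
};

static size_t bytes_to_skip(const struct revoke_state *st)
{
	size_t skip = st->kvec_residual + st->data_residual;

	if (!st->footer_queued)
		skip += st->footer_len;  /* peer still expects a footer */
	return skip;
}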
- */ static void ceph_monc_handle_map(struct ceph_mon_client *monc, struct ceph_msg *msg) { diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 744e5936c10d..7aea0ccb6be6 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -289,10 +289,8 @@ static void __node_free_rcu(struct rcu_head *head) if (!n->tn_bits) kmem_cache_free(trie_leaf_kmem, n); - else if (n->tn_bits <= TNODE_KMALLOC_MAX) - kfree(n); else - vfree(n); + kvfree(n); } #define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu) diff --git a/net/rds/ib.c b/net/rds/ib.c index f222885ac0c7..9481d55ff6cb 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -122,44 +122,34 @@ void rds_ib_dev_put(struct rds_ib_device *rds_ibdev) static void rds_ib_add_one(struct ib_device *device) { struct rds_ib_device *rds_ibdev; - struct ib_device_attr *dev_attr; /* Only handle IB (no iWARP) devices */ if (device->node_type != RDMA_NODE_IB_CA) return; - dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); - if (!dev_attr) - return; - - if (ib_query_device(device, dev_attr)) { - rdsdebug("Query device failed for %s\n", device->name); - goto free_attr; - } - rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL, ibdev_to_node(device)); if (!rds_ibdev) - goto free_attr; + return; spin_lock_init(&rds_ibdev->spinlock); atomic_set(&rds_ibdev->refcount, 1); INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free); - rds_ibdev->max_wrs = dev_attr->max_qp_wr; - rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE); + rds_ibdev->max_wrs = device->attrs.max_qp_wr; + rds_ibdev->max_sge = min(device->attrs.max_sge, RDS_IB_MAX_SGE); - rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32; - rds_ibdev->max_1m_fmrs = dev_attr->max_mr ? - min_t(unsigned int, (dev_attr->max_mr / 2), + rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32; + rds_ibdev->max_1m_fmrs = device->attrs.max_mr ? + min_t(unsigned int, (device->attrs.max_mr / 2), rds_ib_fmr_1m_pool_size) : rds_ib_fmr_1m_pool_size; - rds_ibdev->max_8k_fmrs = dev_attr->max_mr ? - min_t(unsigned int, ((dev_attr->max_mr / 2) * RDS_MR_8K_SCALE), + rds_ibdev->max_8k_fmrs = device->attrs.max_mr ? 
+ min_t(unsigned int, ((device->attrs.max_mr / 2) * RDS_MR_8K_SCALE), rds_ib_fmr_8k_pool_size) : rds_ib_fmr_8k_pool_size; - rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom; - rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom; + rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom; + rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom; rds_ibdev->dev = device; rds_ibdev->pd = ib_alloc_pd(device); @@ -183,7 +173,7 @@ static void rds_ib_add_one(struct ib_device *device) } rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_fmrs = %d, max_8k_fmrs = %d\n", - dev_attr->max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge, + device->attrs.max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge, rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_fmrs, rds_ibdev->max_8k_fmrs); @@ -202,8 +192,6 @@ static void rds_ib_add_one(struct ib_device *device) put_dev: rds_ib_dev_put(rds_ibdev); -free_attr: - kfree(dev_attr); } /* diff --git a/net/rds/iw.c b/net/rds/iw.c index 576f1825fc55..f4a9fff829e0 100644 --- a/net/rds/iw.c +++ b/net/rds/iw.c @@ -60,30 +60,20 @@ LIST_HEAD(iw_nodev_conns); static void rds_iw_add_one(struct ib_device *device) { struct rds_iw_device *rds_iwdev; - struct ib_device_attr *dev_attr; /* Only handle iwarp devices */ if (device->node_type != RDMA_NODE_RNIC) return; - dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL); - if (!dev_attr) - return; - - if (ib_query_device(device, dev_attr)) { - rdsdebug("Query device failed for %s\n", device->name); - goto free_attr; - } - rds_iwdev = kmalloc(sizeof *rds_iwdev, GFP_KERNEL); if (!rds_iwdev) - goto free_attr; + return; spin_lock_init(&rds_iwdev->spinlock); - rds_iwdev->dma_local_lkey = !!(dev_attr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY); - rds_iwdev->max_wrs = dev_attr->max_qp_wr; - rds_iwdev->max_sge = min(dev_attr->max_sge, RDS_IW_MAX_SGE); + rds_iwdev->dma_local_lkey = !!(device->attrs.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY); + rds_iwdev->max_wrs = device->attrs.max_qp_wr; + rds_iwdev->max_sge = min(device->attrs.max_sge, RDS_IW_MAX_SGE); rds_iwdev->dev = device; rds_iwdev->pd = ib_alloc_pd(device); @@ -111,8 +101,7 @@ static void rds_iw_add_one(struct ib_device *device) list_add_tail(&rds_iwdev->list, &rds_iw_devices); ib_set_client_data(device, &rds_iw_client, rds_iwdev); - - goto free_attr; + return; err_mr: if (rds_iwdev->mr) @@ -121,8 +110,6 @@ err_pd: ib_dealloc_pd(rds_iwdev->pd); free_dev: kfree(rds_iwdev); -free_attr: - kfree(dev_attr); } static void rds_iw_remove_one(struct ib_device *device, void *client_data) diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c index 5e4f815c2b34..2b32fd602669 100644 --- a/net/sunrpc/cache.c +++ b/net/sunrpc/cache.c @@ -771,7 +771,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, if (count == 0) return 0; - mutex_lock(&inode->i_mutex); /* protect against multiple concurrent + inode_lock(inode); /* protect against multiple concurrent * readers on this file */ again: spin_lock(&queue_lock); @@ -784,7 +784,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, } if (rp->q.list.next == &cd->queue) { spin_unlock(&queue_lock); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); WARN_ON_ONCE(rp->offset); return 0; } @@ -838,7 +838,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count, } if (err == -EAGAIN) goto again; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return err ? 
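/*
 * Aside (illustrative, not part of the patch): the rds_ib and rds_iw hunks
 * here -- and the xprtrdma hunks further down -- all apply one conversion:
 * device attributes are now cached in ib_device->attrs at registration
 * time, so ULPs read them directly instead of kmalloc()ing an
 * ib_device_attr and calling ib_query_device(). A stand-alone analog of
 * the resulting shape, with stub types instead of the RDMA core's:
 */
struct dev_attrs { int max_qp_wr; int max_sge; };
struct ib_dev    { struct dev_attrs attrs; };
struct ulp_dev   { int max_wrs; int max_sge; };

#define ULP_MAX_SGE 8

static void ulp_add_one(const struct ib_dev *device, struct ulp_dev *ud)
{
	/* Direct reads: no allocation, no query call, no error path. */
	ud->max_wrs = device->attrs.max_qp_wr;
	ud->max_sge = device->attrs.max_sge < ULP_MAX_SGE ?
		      device->attrs.max_sge : ULP_MAX_SGE;
}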
err : count; } @@ -909,9 +909,9 @@ static ssize_t cache_write(struct file *filp, const char __user *buf, if (!cd->cache_parse) goto out; - mutex_lock(&inode->i_mutex); + inode_lock(inode); ret = cache_downcall(mapping, buf, count, cd); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); out: return ret; } diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c index 14f45bf0410c..31789ef3e614 100644 --- a/net/sunrpc/rpc_pipe.c +++ b/net/sunrpc/rpc_pipe.c @@ -172,7 +172,7 @@ rpc_close_pipes(struct inode *inode) int need_release; LIST_HEAD(free_list); - mutex_lock(&inode->i_mutex); + inode_lock(inode); spin_lock(&pipe->lock); need_release = pipe->nreaders != 0 || pipe->nwriters != 0; pipe->nreaders = 0; @@ -188,7 +188,7 @@ rpc_close_pipes(struct inode *inode) cancel_delayed_work_sync(&pipe->queue_timeout); rpc_inode_setowner(inode, NULL); RPC_I(inode)->pipe = NULL; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } static struct inode * @@ -221,7 +221,7 @@ rpc_pipe_open(struct inode *inode, struct file *filp) int first_open; int res = -ENXIO; - mutex_lock(&inode->i_mutex); + inode_lock(inode); pipe = RPC_I(inode)->pipe; if (pipe == NULL) goto out; @@ -237,7 +237,7 @@ rpc_pipe_open(struct inode *inode, struct file *filp) pipe->nwriters++; res = 0; out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return res; } @@ -248,7 +248,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp) struct rpc_pipe_msg *msg; int last_close; - mutex_lock(&inode->i_mutex); + inode_lock(inode); pipe = RPC_I(inode)->pipe; if (pipe == NULL) goto out; @@ -278,7 +278,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp) if (last_close && pipe->ops->release_pipe) pipe->ops->release_pipe(inode); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return 0; } @@ -290,7 +290,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset) struct rpc_pipe_msg *msg; int res = 0; - mutex_lock(&inode->i_mutex); + inode_lock(inode); pipe = RPC_I(inode)->pipe; if (pipe == NULL) { res = -EPIPE; @@ -322,7 +322,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset) pipe->ops->destroy_msg(msg); } out_unlock: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return res; } @@ -332,11 +332,11 @@ rpc_pipe_write(struct file *filp, const char __user *buf, size_t len, loff_t *of struct inode *inode = file_inode(filp); int res; - mutex_lock(&inode->i_mutex); + inode_lock(inode); res = -EPIPE; if (RPC_I(inode)->pipe != NULL) res = RPC_I(inode)->pipe->ops->downcall(filp, buf, len); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return res; } @@ -349,12 +349,12 @@ rpc_pipe_poll(struct file *filp, struct poll_table_struct *wait) poll_wait(filp, &rpci->waitq, wait); - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (rpci->pipe == NULL) mask |= POLLERR | POLLHUP; else if (filp->private_data || !list_empty(&rpci->pipe->pipe)) mask |= POLLIN | POLLRDNORM; - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return mask; } @@ -367,10 +367,10 @@ rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) switch (cmd) { case FIONREAD: - mutex_lock(&inode->i_mutex); + inode_lock(inode); pipe = RPC_I(inode)->pipe; if (pipe == NULL) { - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); return -EPIPE; } spin_lock(&pipe->lock); @@ -381,7 +381,7 @@ rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) len += msg->len - msg->copied; } spin_unlock(&pipe->lock); - mutex_unlock(&inode->i_mutex); + 
inode_unlock(inode); return put_user(len, (int __user *)arg); default: return -EINVAL; @@ -617,9 +617,9 @@ int rpc_rmdir(struct dentry *dentry) parent = dget_parent(dentry); dir = d_inode(parent); - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); error = __rpc_rmdir(dir, dentry); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); dput(parent); return error; } @@ -701,9 +701,9 @@ static void rpc_depopulate(struct dentry *parent, { struct inode *dir = d_inode(parent); - mutex_lock_nested(&dir->i_mutex, I_MUTEX_CHILD); + inode_lock_nested(dir, I_MUTEX_CHILD); __rpc_depopulate(parent, files, start, eof); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); } static int rpc_populate(struct dentry *parent, @@ -715,7 +715,7 @@ static int rpc_populate(struct dentry *parent, struct dentry *dentry; int i, err; - mutex_lock(&dir->i_mutex); + inode_lock(dir); for (i = start; i < eof; i++) { dentry = __rpc_lookup_create_exclusive(parent, files[i].name); err = PTR_ERR(dentry); @@ -739,11 +739,11 @@ static int rpc_populate(struct dentry *parent, if (err != 0) goto out_bad; } - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return 0; out_bad: __rpc_depopulate(parent, files, start, eof); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); printk(KERN_WARNING "%s: %s failed to populate directory %pd\n", __FILE__, __func__, parent); return err; @@ -757,7 +757,7 @@ static struct dentry *rpc_mkdir_populate(struct dentry *parent, struct inode *dir = d_inode(parent); int error; - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); dentry = __rpc_lookup_create_exclusive(parent, name); if (IS_ERR(dentry)) goto out; @@ -770,7 +770,7 @@ static struct dentry *rpc_mkdir_populate(struct dentry *parent, goto err_rmdir; } out: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return dentry; err_rmdir: __rpc_rmdir(dir, dentry); @@ -788,11 +788,11 @@ static int rpc_rmdir_depopulate(struct dentry *dentry, parent = dget_parent(dentry); dir = d_inode(parent); - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); if (depopulate != NULL) depopulate(dentry); error = __rpc_rmdir(dir, dentry); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); dput(parent); return error; } @@ -828,7 +828,7 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name, if (pipe->ops->downcall == NULL) umode &= ~S_IWUGO; - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); dentry = __rpc_lookup_create_exclusive(parent, name); if (IS_ERR(dentry)) goto out; @@ -837,7 +837,7 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name, if (err) goto out_err; out: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return dentry; out_err: dentry = ERR_PTR(err); @@ -865,9 +865,9 @@ rpc_unlink(struct dentry *dentry) parent = dget_parent(dentry); dir = d_inode(parent); - mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT); + inode_lock_nested(dir, I_MUTEX_PARENT); error = __rpc_rmpipe(dir, dentry); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); dput(parent); return error; } diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c index 2e98f4a243e5..37edea6fa92d 100644 --- a/net/sunrpc/xprt.c +++ b/net/sunrpc/xprt.c @@ -1425,3 +1425,4 @@ void xprt_put(struct rpc_xprt *xprt) if (atomic_dec_and_test(&xprt->count)) xprt_destroy(xprt); } +EXPORT_SYMBOL_GPL(xprt_put); diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile index 
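/*
 * Aside (illustrative, not part of the patch): the
 * mutex_lock(&inode->i_mutex) -> inode_lock(inode) conversions in these
 * sunrpc hunks (and in the security/ hunks further down) are mechanical.
 * At the time of this series the wrappers were thin aliases over i_mutex,
 * roughly as below -- see include/linux/fs.h for the authoritative
 * definitions; the indirection exists so the locking primitive can change
 * later without touching every caller.
 */
static inline void inode_lock(struct inode *inode)
{
	mutex_lock(&inode->i_mutex);
}

static inline void inode_unlock(struct inode *inode)
{
	mutex_unlock(&inode->i_mutex);
}

static inline void inode_lock_nested(struct inode *inode, unsigned subclass)
{
	mutex_lock_nested(&inode->i_mutex, subclass);
}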
33f99d3004f2..dc9f3b513a05 100644 --- a/net/sunrpc/xprtrdma/Makefile +++ b/net/sunrpc/xprtrdma/Makefile @@ -2,7 +2,7 @@ obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o rpcrdma-y := transport.o rpc_rdma.o verbs.o \ fmr_ops.o frwr_ops.o physical_ops.o \ - svc_rdma.o svc_rdma_transport.o \ + svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \ svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \ module.o rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c index c6836844bd0e..e16567389e28 100644 --- a/net/sunrpc/xprtrdma/frwr_ops.c +++ b/net/sunrpc/xprtrdma/frwr_ops.c @@ -190,12 +190,11 @@ static int frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, struct rpcrdma_create_data_internal *cdata) { - struct ib_device_attr *devattr = &ia->ri_devattr; int depth, delta; ia->ri_max_frmr_depth = min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS, - devattr->max_fast_reg_page_list_len); + ia->ri_device->attrs.max_fast_reg_page_list_len); dprintk("RPC: %s: device's max FR page list len = %u\n", __func__, ia->ri_max_frmr_depth); @@ -222,8 +221,8 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep, } ep->rep_attr.cap.max_send_wr *= depth; - if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) { - cdata->max_requests = devattr->max_qp_wr / depth; + if (ep->rep_attr.cap.max_send_wr > ia->ri_device->attrs.max_qp_wr) { + cdata->max_requests = ia->ri_device->attrs.max_qp_wr / depth; if (!cdata->max_requests) return -EINVAL; ep->rep_attr.cap.max_send_wr = cdata->max_requests * diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c index 1b7051bdbdc8..c846ca9f1eba 100644 --- a/net/sunrpc/xprtrdma/svc_rdma.c +++ b/net/sunrpc/xprtrdma/svc_rdma.c @@ -55,6 +55,7 @@ unsigned int svcrdma_ord = RPCRDMA_ORD; static unsigned int min_ord = 1; static unsigned int max_ord = 4096; unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS; +unsigned int svcrdma_max_bc_requests = RPCRDMA_MAX_BC_REQUESTS; static unsigned int min_max_requests = 4; static unsigned int max_max_requests = 16384; unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE; @@ -71,10 +72,6 @@ atomic_t rdma_stat_rq_prod; atomic_t rdma_stat_sq_poll; atomic_t rdma_stat_sq_prod; -/* Temporary NFS request map and context caches */ -struct kmem_cache *svc_rdma_map_cachep; -struct kmem_cache *svc_rdma_ctxt_cachep; - struct workqueue_struct *svc_rdma_wq; /* @@ -243,17 +240,16 @@ void svc_rdma_cleanup(void) svc_unreg_xprt_class(&svc_rdma_bc_class); #endif svc_unreg_xprt_class(&svc_rdma_class); - kmem_cache_destroy(svc_rdma_map_cachep); - kmem_cache_destroy(svc_rdma_ctxt_cachep); } int svc_rdma_init(void) { dprintk("SVCRDMA Module Init, register RPC RDMA transport\n"); dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord); - dprintk("\tmax_requests : %d\n", svcrdma_max_requests); - dprintk("\tsq_depth : %d\n", + dprintk("\tmax_requests : %u\n", svcrdma_max_requests); + dprintk("\tsq_depth : %u\n", svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT); + dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests); dprintk("\tmax_inline : %d\n", svcrdma_max_req_size); svc_rdma_wq = alloc_workqueue("svc_rdma", 0, 0); @@ -264,39 +260,10 @@ int svc_rdma_init(void) svcrdma_table_header = register_sysctl_table(svcrdma_root_table); - /* Create the temporary map cache */ - svc_rdma_map_cachep = kmem_cache_create("svc_rdma_map_cache", - sizeof(struct svc_rdma_req_map), - 0, - SLAB_HWCACHE_ALIGN, - NULL); - if (!svc_rdma_map_cachep) { - printk(KERN_INFO "Could not allocate 
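/*
 * Aside (illustrative, not part of the patch): the frwr_op_open() hunk
 * above sizes the send queue by noting that each RPC may consume 'depth'
 * work requests, and the product must fit the device's max_qp_wr; when it
 * does not, the request count is scaled down instead. Restated
 * stand-alone:
 */
static int size_send_queue(unsigned int *max_requests, unsigned int depth,
			   unsigned int max_qp_wr, unsigned int *max_send_wr)
{
	*max_send_wr = *max_requests * depth;
	if (*max_send_wr > max_qp_wr) {
		*max_requests = max_qp_wr / depth;
		if (*max_requests == 0)
			return -1;  /* device cannot fit even one request */
		*max_send_wr = *max_requests * depth;
	}
	return 0;
}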
map cache.\n"); - goto err0; - } - - /* Create the temporary context cache */ - svc_rdma_ctxt_cachep = - kmem_cache_create("svc_rdma_ctxt_cache", - sizeof(struct svc_rdma_op_ctxt), - 0, - SLAB_HWCACHE_ALIGN, - NULL); - if (!svc_rdma_ctxt_cachep) { - printk(KERN_INFO "Could not allocate WR ctxt cache.\n"); - goto err1; - } - /* Register RDMA with the SVC transport switch */ svc_reg_xprt_class(&svc_rdma_class); #if defined(CONFIG_SUNRPC_BACKCHANNEL) svc_reg_xprt_class(&svc_rdma_bc_class); #endif return 0; - err1: - kmem_cache_destroy(svc_rdma_map_cachep); - err0: - unregister_sysctl_table(svcrdma_table_header); - destroy_workqueue(svc_rdma_wq); - return -ENOMEM; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c new file mode 100644 index 000000000000..65a7c232a345 --- /dev/null +++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c @@ -0,0 +1,371 @@ +/* + * Copyright (c) 2015 Oracle. All rights reserved. + * + * Support for backward direction RPCs on RPC/RDMA (server-side). + */ + +#include <linux/sunrpc/svc_rdma.h> +#include "xprt_rdma.h" + +#define RPCDBG_FACILITY RPCDBG_SVCXPRT + +#undef SVCRDMA_BACKCHANNEL_DEBUG + +int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp, + struct xdr_buf *rcvbuf) +{ + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct kvec *dst, *src = &rcvbuf->head[0]; + struct rpc_rqst *req; + unsigned long cwnd; + u32 credits; + size_t len; + __be32 xid; + __be32 *p; + int ret; + + p = (__be32 *)src->iov_base; + len = src->iov_len; + xid = rmsgp->rm_xid; + +#ifdef SVCRDMA_BACKCHANNEL_DEBUG + pr_info("%s: xid=%08x, length=%zu\n", + __func__, be32_to_cpu(xid), len); + pr_info("%s: RPC/RDMA: %*ph\n", + __func__, (int)RPCRDMA_HDRLEN_MIN, rmsgp); + pr_info("%s: RPC: %*ph\n", + __func__, (int)len, p); +#endif + + ret = -EAGAIN; + if (src->iov_len < 24) + goto out_shortreply; + + spin_lock_bh(&xprt->transport_lock); + req = xprt_lookup_rqst(xprt, xid); + if (!req) + goto out_notfound; + + dst = &req->rq_private_buf.head[0]; + memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf)); + if (dst->iov_len < len) + goto out_unlock; + memcpy(dst->iov_base, p, len); + + credits = be32_to_cpu(rmsgp->rm_credit); + if (credits == 0) + credits = 1; /* don't deadlock */ + else if (credits > r_xprt->rx_buf.rb_bc_max_requests) + credits = r_xprt->rx_buf.rb_bc_max_requests; + + cwnd = xprt->cwnd; + xprt->cwnd = credits << RPC_CWNDSHIFT; + if (xprt->cwnd > cwnd) + xprt_release_rqst_cong(req->rq_task); + + ret = 0; + xprt_complete_rqst(req->rq_task, rcvbuf->len); + rcvbuf->len = 0; + +out_unlock: + spin_unlock_bh(&xprt->transport_lock); +out: + return ret; + +out_shortreply: + dprintk("svcrdma: short bc reply: xprt=%p, len=%zu\n", + xprt, src->iov_len); + goto out; + +out_notfound: + dprintk("svcrdma: unrecognized bc reply: xprt=%p, xid=%08x\n", + xprt, be32_to_cpu(xid)); + + goto out_unlock; +} + +/* Send a backwards direction RPC call. + * + * Caller holds the connection's mutex and has already marshaled + * the RPC/RDMA request. + * + * This is similar to svc_rdma_reply, but takes an rpc_rqst + * instead, does not support chunks, and avoids blocking memory + * allocation. + * + * XXX: There is still an opportunity to block in svc_rdma_send() + * if there are no SQ entries to post the Send. This may occur if + * the adapter has a small maximum SQ depth. 
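/*
 * Aside (illustrative, not part of the patch): in
 * svc_rdma_handle_bc_reply() above, the credit value granted by the peer
 * is sanitized before it becomes a congestion window -- never zero (which
 * would deadlock the client) and never more than what this side
 * provisioned. Just that clamp, restated:
 */
static unsigned int clamp_credits(unsigned int granted,
				  unsigned int provisioned)
{
	if (granted == 0)
		return 1;            /* never let the window collapse */
	if (granted > provisioned)
		return provisioned;  /* trust no more than we allocated */
	return granted;
}
/* The kernel code then sets cwnd = clamped credits << RPC_CWNDSHIFT. */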
+ */ +static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma, + struct rpc_rqst *rqst) +{ + struct xdr_buf *sndbuf = &rqst->rq_snd_buf; + struct svc_rdma_op_ctxt *ctxt; + struct svc_rdma_req_map *vec; + struct ib_send_wr send_wr; + int ret; + + vec = svc_rdma_get_req_map(rdma); + ret = svc_rdma_map_xdr(rdma, sndbuf, vec); + if (ret) + goto out_err; + + /* Post a recv buffer to handle the reply for this request. */ + ret = svc_rdma_post_recv(rdma, GFP_NOIO); + if (ret) { + pr_err("svcrdma: Failed to post bc receive buffer, err=%d.\n", + ret); + pr_err("svcrdma: closing transport %p.\n", rdma); + set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags); + ret = -ENOTCONN; + goto out_err; + } + + ctxt = svc_rdma_get_context(rdma); + ctxt->pages[0] = virt_to_page(rqst->rq_buffer); + ctxt->count = 1; + + ctxt->wr_op = IB_WR_SEND; + ctxt->direction = DMA_TO_DEVICE; + ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey; + ctxt->sge[0].length = sndbuf->len; + ctxt->sge[0].addr = + ib_dma_map_page(rdma->sc_cm_id->device, ctxt->pages[0], 0, + sndbuf->len, DMA_TO_DEVICE); + if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) { + ret = -EIO; + goto out_unmap; + } + atomic_inc(&rdma->sc_dma_used); + + memset(&send_wr, 0, sizeof(send_wr)); + send_wr.wr_id = (unsigned long)ctxt; + send_wr.sg_list = ctxt->sge; + send_wr.num_sge = 1; + send_wr.opcode = IB_WR_SEND; + send_wr.send_flags = IB_SEND_SIGNALED; + + ret = svc_rdma_send(rdma, &send_wr); + if (ret) { + ret = -EIO; + goto out_unmap; + } + +out_err: + svc_rdma_put_req_map(rdma, vec); + dprintk("svcrdma: %s returns %d\n", __func__, ret); + return ret; + +out_unmap: + svc_rdma_unmap_dma(ctxt); + svc_rdma_put_context(ctxt, 1); + goto out_err; +} + +/* Server-side transport endpoint wants a whole page for its send + * buffer. The client RPC code constructs the RPC header in this + * buffer before it invokes ->send_request. + * + * Returns NULL if there was a temporary allocation failure. + */ +static void * +xprt_rdma_bc_allocate(struct rpc_task *task, size_t size) +{ + struct rpc_rqst *rqst = task->tk_rqstp; + struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt; + struct svcxprt_rdma *rdma; + struct page *page; + + rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); + + /* Prevent an infinite loop: try to make this case work */ + if (size > PAGE_SIZE) + WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n", + size); + + page = alloc_page(RPCRDMA_DEF_GFP); + if (!page) + return NULL; + + return page_address(page); +} + +static void +xprt_rdma_bc_free(void *buffer) +{ + /* No-op: ctxt and page have already been freed. */ +} + +static int +rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst) +{ + struct rpc_xprt *xprt = rqst->rq_xprt; + struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); + struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)rqst->rq_buffer; + int rc; + + /* Space in the send buffer for an RPC/RDMA header is reserved + * via xprt->tsh_size. 
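/*
 * Aside (illustrative, not part of the patch): note the ordering in
 * svc_rdma_bc_sendto() above -- a receive buffer is posted *before* the
 * Send for the backchannel call. On an RC QP an incoming message with no
 * posted receive triggers RNR retries, so the reply's landing zone must
 * exist before the call leaves. Stub-typed shape of that ordering:
 */
struct conn;
struct request;
int post_recv(struct conn *c);
int post_send(struct conn *c, struct request *r);
void mark_close(struct conn *c);

static int bc_call(struct conn *c, struct request *req)
{
	int err = post_recv(c);      /* reply landing zone first */

	if (err) {
		mark_close(c);       /* as above: close the transport */
		return err;
	}
	return post_send(c, req);    /* then the call itself */
}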
+ */ + headerp->rm_xid = rqst->rq_xid; + headerp->rm_vers = rpcrdma_version; + headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests); + headerp->rm_type = rdma_msg; + headerp->rm_body.rm_chunks[0] = xdr_zero; + headerp->rm_body.rm_chunks[1] = xdr_zero; + headerp->rm_body.rm_chunks[2] = xdr_zero; + +#ifdef SVCRDMA_BACKCHANNEL_DEBUG + pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer); +#endif + + rc = svc_rdma_bc_sendto(rdma, rqst); + if (rc) + goto drop_connection; + return rc; + +drop_connection: + dprintk("svcrdma: failed to send bc call\n"); + xprt_disconnect_done(xprt); + return -ENOTCONN; +} + +/* Send an RPC call on the passive end of a transport + * connection. + */ +static int +xprt_rdma_bc_send_request(struct rpc_task *task) +{ + struct rpc_rqst *rqst = task->tk_rqstp; + struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt; + struct svcxprt_rdma *rdma; + int ret; + + dprintk("svcrdma: sending bc call with xid: %08x\n", + be32_to_cpu(rqst->rq_xid)); + + if (!mutex_trylock(&sxprt->xpt_mutex)) { + rpc_sleep_on(&sxprt->xpt_bc_pending, task, NULL); + if (!mutex_trylock(&sxprt->xpt_mutex)) + return -EAGAIN; + rpc_wake_up_queued_task(&sxprt->xpt_bc_pending, task); + } + + ret = -ENOTCONN; + rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt); + if (!test_bit(XPT_DEAD, &sxprt->xpt_flags)) + ret = rpcrdma_bc_send_request(rdma, rqst); + + mutex_unlock(&sxprt->xpt_mutex); + + if (ret < 0) + return ret; + return 0; +} + +static void +xprt_rdma_bc_close(struct rpc_xprt *xprt) +{ + dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); +} + +static void +xprt_rdma_bc_put(struct rpc_xprt *xprt) +{ + dprintk("svcrdma: %s: xprt %p\n", __func__, xprt); + + xprt_free(xprt); + module_put(THIS_MODULE); +} + +static struct rpc_xprt_ops xprt_rdma_bc_procs = { + .reserve_xprt = xprt_reserve_xprt_cong, + .release_xprt = xprt_release_xprt_cong, + .alloc_slot = xprt_alloc_slot, + .release_request = xprt_release_rqst_cong, + .buf_alloc = xprt_rdma_bc_allocate, + .buf_free = xprt_rdma_bc_free, + .send_request = xprt_rdma_bc_send_request, + .set_retrans_timeout = xprt_set_retrans_timeout_def, + .close = xprt_rdma_bc_close, + .destroy = xprt_rdma_bc_put, + .print_stats = xprt_rdma_print_stats +}; + +static const struct rpc_timeout xprt_rdma_bc_timeout = { + .to_initval = 60 * HZ, + .to_maxval = 60 * HZ, +}; + +/* It shouldn't matter if the number of backchannel session slots + * doesn't match the number of RPC/RDMA credits. That just means + * one or the other will have extra slots that aren't used. 
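/*
 * Aside (illustrative, not part of the patch): an RPC task may not block
 * on the transport mutex, so xprt_rdma_bc_send_request() above (1) tries
 * the lock, (2) parks itself on the transport's waitqueue on failure,
 * then (3) retries once in case the holder released in the meantime --
 * and must dequeue itself again if the retry wins. Stub-typed sketch;
 * -1 stands in for -EAGAIN:
 */
struct waitq;
struct task;
struct mutex;
int  mutex_trylock_stub(struct mutex *m);
void queue_task(struct waitq *q, struct task *t);
void dequeue_task(struct waitq *q, struct task *t);

static int lock_for_send(struct mutex *m, struct waitq *q, struct task *t)
{
	if (!mutex_trylock_stub(m)) {
		queue_task(q, t);          /* woken when holder unlocks  */
		if (!mutex_trylock_stub(m))
			return -1;         /* stay parked; retried later */
		dequeue_task(q, t);        /* won the race after all     */
	}
	return 0;                          /* caller sends, then unlocks */
}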
+ */ +static struct rpc_xprt * +xprt_setup_rdma_bc(struct xprt_create *args) +{ + struct rpc_xprt *xprt; + struct rpcrdma_xprt *new_xprt; + + if (args->addrlen > sizeof(xprt->addr)) { + dprintk("RPC: %s: address too large\n", __func__); + return ERR_PTR(-EBADF); + } + + xprt = xprt_alloc(args->net, sizeof(*new_xprt), + RPCRDMA_MAX_BC_REQUESTS, + RPCRDMA_MAX_BC_REQUESTS); + if (!xprt) { + dprintk("RPC: %s: couldn't allocate rpc_xprt\n", + __func__); + return ERR_PTR(-ENOMEM); + } + + xprt->timeout = &xprt_rdma_bc_timeout; + xprt_set_bound(xprt); + xprt_set_connected(xprt); + xprt->bind_timeout = RPCRDMA_BIND_TO; + xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO; + xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO; + + xprt->prot = XPRT_TRANSPORT_BC_RDMA; + xprt->tsh_size = RPCRDMA_HDRLEN_MIN / sizeof(__be32); + xprt->ops = &xprt_rdma_bc_procs; + + memcpy(&xprt->addr, args->dstaddr, args->addrlen); + xprt->addrlen = args->addrlen; + xprt_rdma_format_addresses(xprt, (struct sockaddr *)&xprt->addr); + xprt->resvport = 0; + + xprt->max_payload = xprt_rdma_max_inline_read; + + new_xprt = rpcx_to_rdmax(xprt); + new_xprt->rx_buf.rb_bc_max_requests = xprt->max_reqs; + + xprt_get(xprt); + args->bc_xprt->xpt_bc_xprt = xprt; + xprt->bc_xprt = args->bc_xprt; + + if (!try_module_get(THIS_MODULE)) + goto out_fail; + + /* Final put for backchannel xprt is in __svc_rdma_free */ + xprt_get(xprt); + return xprt; + +out_fail: + xprt_rdma_free_addresses(xprt); + args->bc_xprt->xpt_bc_xprt = NULL; + xprt_put(xprt); + xprt_free(xprt); + return ERR_PTR(-EINVAL); +} + +struct xprt_class xprt_rdma_bc = { + .list = LIST_HEAD_INIT(xprt_rdma_bc.list), + .name = "rdma backchannel", + .owner = THIS_MODULE, + .ident = XPRT_TRANSPORT_BC_RDMA, + .setup = xprt_setup_rdma_bc, +}; diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c index ff4f01e527ec..c8b8a8b4181e 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c +++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c @@ -144,6 +144,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no]; head->arg.page_len += len; + head->arg.len += len; if (!pg_off) head->count++; @@ -160,8 +161,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt, goto err; atomic_inc(&xprt->sc_dma_used); - /* The lkey here is either a local dma lkey or a dma_mr lkey */ - ctxt->sge[pno].lkey = xprt->sc_dma_lkey; + ctxt->sge[pno].lkey = xprt->sc_pd->local_dma_lkey; ctxt->sge[pno].length = len; ctxt->count++; @@ -567,6 +567,38 @@ static int rdma_read_complete(struct svc_rqst *rqstp, return ret; } +/* By convention, backchannel calls arrive via rdma_msg type + * messages, and never populate the chunk lists. This makes + * the RPC/RDMA header small and fixed in size, so it is + * straightforward to check the RPC header's direction field. + */ +static bool +svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, struct rpcrdma_msg *rmsgp) +{ + __be32 *p = (__be32 *)rmsgp; + + if (!xprt->xpt_bc_xprt) + return false; + + if (rmsgp->rm_type != rdma_msg) + return false; + if (rmsgp->rm_body.rm_chunks[0] != xdr_zero) + return false; + if (rmsgp->rm_body.rm_chunks[1] != xdr_zero) + return false; + if (rmsgp->rm_body.rm_chunks[2] != xdr_zero) + return false; + + /* sanity */ + if (p[7] != rmsgp->rm_xid) + return false; + /* call direction */ + if (p[8] == cpu_to_be32(RPC_CALL)) + return false; + + return true; +} + /* * Set up the rqstp thread context to point to the RQ buffer. 
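/*
 * Aside (illustrative, not part of the patch): with all three chunk lists
 * empty, the RPC/RDMA header is fixed-size, so the embedded RPC message's
 * XID and direction words sit at known 32-bit offsets (7 and 8) -- which
 * is exactly what svc_rdma_is_backchannel_reply() exploits. A stand-alone
 * restatement; big-endian wire words are treated as opaque values, which
 * is safe because only equality and zero tests are made.
 */
#include <stdbool.h>
#include <stdint.h>

#define XDR_ZERO 0u
#define RPC_CALL 0u   /* direction word: 0 = call, 1 = reply */

static bool is_bc_reply(const uint32_t *p, uint32_t rdma_msg_type)
{
	if (p[3] != rdma_msg_type)                    /* rm_type           */
		return false;
	if (p[4] != XDR_ZERO || p[5] != XDR_ZERO ||
	    p[6] != XDR_ZERO)                         /* no chunk lists    */
		return false;
	if (p[7] != p[0])                             /* RPC xid == rm_xid */
		return false;
	return p[8] != RPC_CALL;                      /* a reply, not call */
}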
If * necessary, pull additional data from the client with an RDMA_READ @@ -632,6 +664,15 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) goto close_out; } + if (svc_rdma_is_backchannel_reply(xprt, rmsgp)) { + ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, rmsgp, + &rqstp->rq_arg); + svc_rdma_put_context(ctxt, 0); + if (ret) + goto repost; + return ret; + } + /* Read read-list data. */ ret = rdma_read_chunks(rdma_xprt, rmsgp, rqstp, ctxt); if (ret > 0) { @@ -668,4 +709,15 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp) set_bit(XPT_CLOSE, &xprt->xpt_flags); defer: return 0; + +repost: + ret = svc_rdma_post_recv(rdma_xprt, GFP_KERNEL); + if (ret) { + pr_err("svcrdma: could not post a receive buffer, err=%d.\n", + ret); + pr_err("svcrdma: closing transport %p.\n", rdma_xprt); + set_bit(XPT_CLOSE, &rdma_xprt->sc_xprt.xpt_flags); + ret = -ENOTCONN; + } + return ret; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c index 969a1ab75fc3..df57f3ce6cd2 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c +++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c @@ -50,9 +50,9 @@ #define RPCDBG_FACILITY RPCDBG_SVCXPRT -static int map_xdr(struct svcxprt_rdma *xprt, - struct xdr_buf *xdr, - struct svc_rdma_req_map *vec) +int svc_rdma_map_xdr(struct svcxprt_rdma *xprt, + struct xdr_buf *xdr, + struct svc_rdma_req_map *vec) { int sge_no; u32 sge_bytes; @@ -62,7 +62,7 @@ static int map_xdr(struct svcxprt_rdma *xprt, if (xdr->len != (xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)) { - pr_err("svcrdma: map_xdr: XDR buffer length error\n"); + pr_err("svcrdma: %s: XDR buffer length error\n", __func__); return -EIO; } @@ -97,9 +97,9 @@ static int map_xdr(struct svcxprt_rdma *xprt, sge_no++; } - dprintk("svcrdma: map_xdr: sge_no %d page_no %d " + dprintk("svcrdma: %s: sge_no %d page_no %d " "page_base %u page_len %u head_len %zu tail_len %zu\n", - sge_no, page_no, xdr->page_base, xdr->page_len, + __func__, sge_no, page_no, xdr->page_base, xdr->page_len, xdr->head[0].iov_len, xdr->tail[0].iov_len); vec->count = sge_no; @@ -265,7 +265,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp, sge[sge_no].addr)) goto err; atomic_inc(&xprt->sc_dma_used); - sge[sge_no].lkey = xprt->sc_dma_lkey; + sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey; ctxt->count++; sge_off = 0; sge_no++; @@ -465,7 +465,7 @@ static int send_reply(struct svcxprt_rdma *rdma, int ret; /* Post a recv buffer to handle another request. */ - ret = svc_rdma_post_recv(rdma); + ret = svc_rdma_post_recv(rdma, GFP_KERNEL); if (ret) { printk(KERN_INFO "svcrdma: could not post a receive buffer, err=%d." 
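/*
 * Aside (illustrative, not part of the patch): before mapping,
 * svc_rdma_map_xdr() above checks that the XDR buffer's total length
 * equals the sum of its three segments. Restated stand-alone:
 */
#include <stddef.h>

static int xdr_len_consistent(size_t total, size_t head, size_t pagelen,
			      size_t tail)
{
	return total == head + pagelen + tail;  /* else -EIO in the kernel */
}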
@@ -480,7 +480,7 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->count = 1; /* Prepare the SGE for the RPCRDMA Header */ - ctxt->sge[0].lkey = rdma->sc_dma_lkey; + ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey; ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp); ctxt->sge[0].addr = ib_dma_map_page(rdma->sc_cm_id->device, page, 0, @@ -504,7 +504,7 @@ static int send_reply(struct svcxprt_rdma *rdma, ctxt->sge[sge_no].addr)) goto err; atomic_inc(&rdma->sc_dma_used); - ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey; + ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey; ctxt->sge[sge_no].length = sge_bytes; } if (byte_count != 0) { @@ -591,14 +591,17 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) /* Build an req vec for the XDR */ ctxt = svc_rdma_get_context(rdma); ctxt->direction = DMA_TO_DEVICE; - vec = svc_rdma_get_req_map(); - ret = map_xdr(rdma, &rqstp->rq_res, vec); + vec = svc_rdma_get_req_map(rdma); + ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec); if (ret) goto err0; inline_bytes = rqstp->rq_res.len; /* Create the RDMA response header */ - res_page = alloc_page(GFP_KERNEL | __GFP_NOFAIL); + ret = -ENOMEM; + res_page = alloc_page(GFP_KERNEL); + if (!res_page) + goto err0; rdma_resp = page_address(res_page); reply_ary = svc_rdma_get_reply_array(rdma_argp); if (reply_ary) @@ -630,14 +633,14 @@ int svc_rdma_sendto(struct svc_rqst *rqstp) ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec, inline_bytes); - svc_rdma_put_req_map(vec); + svc_rdma_put_req_map(rdma, vec); dprintk("svcrdma: send_reply returns %d\n", ret); return ret; err1: put_page(res_page); err0: - svc_rdma_put_req_map(vec); + svc_rdma_put_req_map(rdma, vec); svc_rdma_put_context(ctxt, 0); return ret; } diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c index b348b4adef29..5763825d09bf 100644 --- a/net/sunrpc/xprtrdma/svc_rdma_transport.c +++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c @@ -153,18 +153,76 @@ static void svc_rdma_bc_free(struct svc_xprt *xprt) } #endif /* CONFIG_SUNRPC_BACKCHANNEL */ -struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) +static struct svc_rdma_op_ctxt *alloc_ctxt(struct svcxprt_rdma *xprt, + gfp_t flags) { struct svc_rdma_op_ctxt *ctxt; - ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep, - GFP_KERNEL | __GFP_NOFAIL); - ctxt->xprt = xprt; - INIT_LIST_HEAD(&ctxt->dto_q); + ctxt = kmalloc(sizeof(*ctxt), flags); + if (ctxt) { + ctxt->xprt = xprt; + INIT_LIST_HEAD(&ctxt->free); + INIT_LIST_HEAD(&ctxt->dto_q); + } + return ctxt; +} + +static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt) +{ + unsigned int i; + + /* Each RPC/RDMA credit can consume a number of send + * and receive WQEs. One ctxt is allocated for each. 
+ */ + i = xprt->sc_sq_depth + xprt->sc_rq_depth; + + while (i--) { + struct svc_rdma_op_ctxt *ctxt; + + ctxt = alloc_ctxt(xprt, GFP_KERNEL); + if (!ctxt) { + dprintk("svcrdma: No memory for RDMA ctxt\n"); + return false; + } + list_add(&ctxt->free, &xprt->sc_ctxts); + } + return true; +} + +struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt) +{ + struct svc_rdma_op_ctxt *ctxt = NULL; + + spin_lock_bh(&xprt->sc_ctxt_lock); + xprt->sc_ctxt_used++; + if (list_empty(&xprt->sc_ctxts)) + goto out_empty; + + ctxt = list_first_entry(&xprt->sc_ctxts, + struct svc_rdma_op_ctxt, free); + list_del_init(&ctxt->free); + spin_unlock_bh(&xprt->sc_ctxt_lock); + +out: ctxt->count = 0; ctxt->frmr = NULL; - atomic_inc(&xprt->sc_ctxt_used); return ctxt; + +out_empty: + /* Either pre-allocation missed the mark, or send + * queue accounting is broken. + */ + spin_unlock_bh(&xprt->sc_ctxt_lock); + + ctxt = alloc_ctxt(xprt, GFP_NOIO); + if (ctxt) + goto out; + + spin_lock_bh(&xprt->sc_ctxt_lock); + xprt->sc_ctxt_used--; + spin_unlock_bh(&xprt->sc_ctxt_lock); + WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n"); + return NULL; } void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) @@ -174,11 +232,11 @@ void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) { /* * Unmap the DMA addr in the SGE if the lkey matches - * the sc_dma_lkey, otherwise, ignore it since it is + * the local_dma_lkey, otherwise, ignore it since it is * an FRMR lkey and will be unmapped later when the * last WR that uses it completes. */ - if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) { + if (ctxt->sge[i].lkey == xprt->sc_pd->local_dma_lkey) { atomic_dec(&xprt->sc_dma_used); ib_dma_unmap_page(xprt->sc_cm_id->device, ctxt->sge[i].addr, @@ -190,35 +248,108 @@ void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt) void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages) { - struct svcxprt_rdma *xprt; + struct svcxprt_rdma *xprt = ctxt->xprt; int i; - xprt = ctxt->xprt; if (free_pages) for (i = 0; i < ctxt->count; i++) put_page(ctxt->pages[i]); - kmem_cache_free(svc_rdma_ctxt_cachep, ctxt); - atomic_dec(&xprt->sc_ctxt_used); + spin_lock_bh(&xprt->sc_ctxt_lock); + xprt->sc_ctxt_used--; + list_add(&ctxt->free, &xprt->sc_ctxts); + spin_unlock_bh(&xprt->sc_ctxt_lock); } -/* - * Temporary NFS req mappings are shared across all transport - * instances. These are short lived and should be bounded by the number - * of concurrent server threads * depth of the SQ. - */ -struct svc_rdma_req_map *svc_rdma_get_req_map(void) +static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt) +{ + while (!list_empty(&xprt->sc_ctxts)) { + struct svc_rdma_op_ctxt *ctxt; + + ctxt = list_first_entry(&xprt->sc_ctxts, + struct svc_rdma_op_ctxt, free); + list_del(&ctxt->free); + kfree(ctxt); + } +} + +static struct svc_rdma_req_map *alloc_req_map(gfp_t flags) { struct svc_rdma_req_map *map; - map = kmem_cache_alloc(svc_rdma_map_cachep, - GFP_KERNEL | __GFP_NOFAIL); + + map = kmalloc(sizeof(*map), flags); + if (map) + INIT_LIST_HEAD(&map->free); + return map; +} + +static bool svc_rdma_prealloc_maps(struct svcxprt_rdma *xprt) +{ + unsigned int i; + + /* One for each receive buffer on this connection. 
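/*
 * Aside (illustrative, not part of the patch): the ctxt hunks above
 * replace a global kmem_cache (with __GFP_NOFAIL allocations) by a
 * per-connection free list sized for the worst case at accept time, plus
 * an emergency allocation if the accounting was wrong; the request-map
 * pool that follows uses the same pattern. A userspace analog:
 */
#include <stdlib.h>

struct ctxt { struct ctxt *next; /* payload elided */ };
struct pool { struct ctxt *free; unsigned int used; };

static int pool_prealloc(struct pool *p, unsigned int n)
{
	while (n--) {
		struct ctxt *c = calloc(1, sizeof(*c));

		if (!c)
			return -1;
		c->next = p->free;
		p->free = c;
	}
	return 0;
}

static struct ctxt *pool_get(struct pool *p)
{
	struct ctxt *c = p->free;

	if (c) {
		p->free = c->next;
		p->used++;
		return c;
	}
	/* Pre-allocation missed the mark; fall back to a fresh allocation
	 * (the kernel uses GFP_NOIO here and warns once). */
	c = calloc(1, sizeof(*c));
	if (c)
		p->used++;
	return c;
}

static void pool_put(struct pool *p, struct ctxt *c)
{
	c->next = p->free;
	p->free = c;
	p->used--;
}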
*/ + i = xprt->sc_max_requests; + + while (i--) { + struct svc_rdma_req_map *map; + + map = alloc_req_map(GFP_KERNEL); + if (!map) { + dprintk("svcrdma: No memory for request map\n"); + return false; + } + list_add(&map->free, &xprt->sc_maps); + } + return true; +} + +struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *xprt) +{ + struct svc_rdma_req_map *map = NULL; + + spin_lock(&xprt->sc_map_lock); + if (list_empty(&xprt->sc_maps)) + goto out_empty; + + map = list_first_entry(&xprt->sc_maps, + struct svc_rdma_req_map, free); + list_del_init(&map->free); + spin_unlock(&xprt->sc_map_lock); + +out: map->count = 0; return map; + +out_empty: + spin_unlock(&xprt->sc_map_lock); + + /* Pre-allocation amount was incorrect */ + map = alloc_req_map(GFP_NOIO); + if (map) + goto out; + + WARN_ONCE(1, "svcrdma: empty request map list?\n"); + return NULL; +} + +void svc_rdma_put_req_map(struct svcxprt_rdma *xprt, + struct svc_rdma_req_map *map) +{ + spin_lock(&xprt->sc_map_lock); + list_add(&map->free, &xprt->sc_maps); + spin_unlock(&xprt->sc_map_lock); } -void svc_rdma_put_req_map(struct svc_rdma_req_map *map) +static void svc_rdma_destroy_maps(struct svcxprt_rdma *xprt) { - kmem_cache_free(svc_rdma_map_cachep, map); + while (!list_empty(&xprt->sc_maps)) { + struct svc_rdma_req_map *map; + + map = list_first_entry(&xprt->sc_maps, + struct svc_rdma_req_map, free); + list_del(&map->free); + kfree(map); + } } /* ib_cq event handler */ @@ -386,46 +517,44 @@ static void rq_cq_reap(struct svcxprt_rdma *xprt) static void process_context(struct svcxprt_rdma *xprt, struct svc_rdma_op_ctxt *ctxt) { + struct svc_rdma_op_ctxt *read_hdr; + int free_pages = 0; + svc_rdma_unmap_dma(ctxt); switch (ctxt->wr_op) { case IB_WR_SEND: - if (ctxt->frmr) - pr_err("svcrdma: SEND: ctxt->frmr != NULL\n"); - svc_rdma_put_context(ctxt, 1); + free_pages = 1; break; case IB_WR_RDMA_WRITE: - if (ctxt->frmr) - pr_err("svcrdma: WRITE: ctxt->frmr != NULL\n"); - svc_rdma_put_context(ctxt, 0); break; case IB_WR_RDMA_READ: case IB_WR_RDMA_READ_WITH_INV: svc_rdma_put_frmr(xprt, ctxt->frmr); - if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) { - struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr; - if (read_hdr) { - spin_lock_bh(&xprt->sc_rq_dto_lock); - set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); - list_add_tail(&read_hdr->dto_q, - &xprt->sc_read_complete_q); - spin_unlock_bh(&xprt->sc_rq_dto_lock); - } else { - pr_err("svcrdma: ctxt->read_hdr == NULL\n"); - } - svc_xprt_enqueue(&xprt->sc_xprt); - } + + if (!test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) + break; + + read_hdr = ctxt->read_hdr; svc_rdma_put_context(ctxt, 0); - break; + + spin_lock_bh(&xprt->sc_rq_dto_lock); + set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags); + list_add_tail(&read_hdr->dto_q, + &xprt->sc_read_complete_q); + spin_unlock_bh(&xprt->sc_rq_dto_lock); + svc_xprt_enqueue(&xprt->sc_xprt); + return; default: - printk(KERN_ERR "svcrdma: unexpected completion type, " - "opcode=%d\n", - ctxt->wr_op); + dprintk("svcrdma: unexpected completion opcode=%d\n", + ctxt->wr_op); break; } + + svc_rdma_put_context(ctxt, free_pages); } /* @@ -523,19 +652,15 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q); INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q); INIT_LIST_HEAD(&cma_xprt->sc_frmr_q); + INIT_LIST_HEAD(&cma_xprt->sc_ctxts); + INIT_LIST_HEAD(&cma_xprt->sc_maps); init_waitqueue_head(&cma_xprt->sc_send_wait); spin_lock_init(&cma_xprt->sc_lock); spin_lock_init(&cma_xprt->sc_rq_dto_lock); 
spin_lock_init(&cma_xprt->sc_frmr_q_lock); - - cma_xprt->sc_ord = svcrdma_ord; - - cma_xprt->sc_max_req_size = svcrdma_max_req_size; - cma_xprt->sc_max_requests = svcrdma_max_requests; - cma_xprt->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT; - atomic_set(&cma_xprt->sc_sq_count, 0); - atomic_set(&cma_xprt->sc_ctxt_used, 0); + spin_lock_init(&cma_xprt->sc_ctxt_lock); + spin_lock_init(&cma_xprt->sc_map_lock); if (listener) set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags); @@ -543,7 +668,7 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv, return cma_xprt; } -int svc_rdma_post_recv(struct svcxprt_rdma *xprt) +int svc_rdma_post_recv(struct svcxprt_rdma *xprt, gfp_t flags) { struct ib_recv_wr recv_wr, *bad_recv_wr; struct svc_rdma_op_ctxt *ctxt; @@ -561,7 +686,9 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) pr_err("svcrdma: Too many sges (%d)\n", sge_no); goto err_put_ctxt; } - page = alloc_page(GFP_KERNEL | __GFP_NOFAIL); + page = alloc_page(flags); + if (!page) + goto err_put_ctxt; ctxt->pages[sge_no] = page; pa = ib_dma_map_page(xprt->sc_cm_id->device, page, 0, PAGE_SIZE, @@ -571,7 +698,7 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt) atomic_inc(&xprt->sc_dma_used); ctxt->sge[sge_no].addr = pa; ctxt->sge[sge_no].length = PAGE_SIZE; - ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey; + ctxt->sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey; ctxt->count = sge_no + 1; buflen += PAGE_SIZE; } @@ -886,11 +1013,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) struct rdma_conn_param conn_param; struct ib_cq_init_attr cq_attr = {}; struct ib_qp_init_attr qp_attr; - struct ib_device_attr devattr; - int uninitialized_var(dma_mr_acc); - int need_dma_mr = 0; - int ret; - int i; + struct ib_device *dev; + unsigned int i; + int ret = 0; listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt); clear_bit(XPT_CONN, &xprt->xpt_flags); @@ -910,37 +1035,42 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n", newxprt, newxprt->sc_cm_id); - ret = ib_query_device(newxprt->sc_cm_id->device, &devattr); - if (ret) { - dprintk("svcrdma: could not query device attributes on " - "device %p, rc=%d\n", newxprt->sc_cm_id->device, ret); - goto errout; - } + dev = newxprt->sc_cm_id->device; /* Qualify the transport resource defaults with the * capabilities of this particular device */ - newxprt->sc_max_sge = min((size_t)devattr.max_sge, + newxprt->sc_max_sge = min((size_t)dev->attrs.max_sge, (size_t)RPCSVC_MAXPAGES); - newxprt->sc_max_sge_rd = min_t(size_t, devattr.max_sge_rd, + newxprt->sc_max_sge_rd = min_t(size_t, dev->attrs.max_sge_rd, RPCSVC_MAXPAGES); - newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr, - (size_t)svcrdma_max_requests); - newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests; + newxprt->sc_max_req_size = svcrdma_max_req_size; + newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr, + svcrdma_max_requests); + newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr, + svcrdma_max_bc_requests); + newxprt->sc_rq_depth = newxprt->sc_max_requests + + newxprt->sc_max_bc_requests; + newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth; + + if (!svc_rdma_prealloc_ctxts(newxprt)) + goto errout; + if (!svc_rdma_prealloc_maps(newxprt)) + goto errout; /* * Limit ORD based on client limit, local device limit, and * configured svcrdma limit. 
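/*
 * Aside (illustrative, not part of the patch): the svc_rdma_accept()
 * sizing above now provisions the receive queue for forward *and*
 * backchannel requests, with the send queue a fixed multiple of that, and
 * clamps each figure to the device's max_qp_wr. Restated stand-alone:
 */
static void size_queues(unsigned int max_qp_wr,
			unsigned int fwd, unsigned int bc,
			unsigned int sq_mult,
			unsigned int *rq_depth, unsigned int *sq_depth)
{
	if (fwd > max_qp_wr)
		fwd = max_qp_wr;
	if (bc > max_qp_wr)
		bc = max_qp_wr;
	*rq_depth = fwd + bc;            /* one recv WR per request  */
	*sq_depth = sq_mult * *rq_depth; /* RPCRDMA_SQ_DEPTH_MULT    */
}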
*/ - newxprt->sc_ord = min_t(size_t, devattr.max_qp_rd_atom, newxprt->sc_ord); + newxprt->sc_ord = min_t(size_t, dev->attrs.max_qp_rd_atom, newxprt->sc_ord); newxprt->sc_ord = min_t(size_t, svcrdma_ord, newxprt->sc_ord); - newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device); + newxprt->sc_pd = ib_alloc_pd(dev); if (IS_ERR(newxprt->sc_pd)) { dprintk("svcrdma: error creating PD for connect request\n"); goto errout; } cq_attr.cqe = newxprt->sc_sq_depth; - newxprt->sc_sq_cq = ib_create_cq(newxprt->sc_cm_id->device, + newxprt->sc_sq_cq = ib_create_cq(dev, sq_comp_handler, cq_event_handler, newxprt, @@ -949,8 +1079,8 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) dprintk("svcrdma: error creating SQ CQ for connect request\n"); goto errout; } - cq_attr.cqe = newxprt->sc_max_requests; - newxprt->sc_rq_cq = ib_create_cq(newxprt->sc_cm_id->device, + cq_attr.cqe = newxprt->sc_rq_depth; + newxprt->sc_rq_cq = ib_create_cq(dev, rq_comp_handler, cq_event_handler, newxprt, @@ -964,7 +1094,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) qp_attr.event_handler = qp_event_handler; qp_attr.qp_context = &newxprt->sc_xprt; qp_attr.cap.max_send_wr = newxprt->sc_sq_depth; - qp_attr.cap.max_recv_wr = newxprt->sc_max_requests; + qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth; qp_attr.cap.max_send_sge = newxprt->sc_max_sge; qp_attr.cap.max_recv_sge = newxprt->sc_max_sge; qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR; @@ -978,7 +1108,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) " cap.max_send_sge = %d\n" " cap.max_recv_sge = %d\n", newxprt->sc_cm_id, newxprt->sc_pd, - newxprt->sc_cm_id->device, newxprt->sc_pd->device, + dev, newxprt->sc_pd->device, qp_attr.cap.max_send_wr, qp_attr.cap.max_recv_wr, qp_attr.cap.max_send_sge, @@ -1014,9 +1144,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) * of an RDMA_READ. IB does not. 
*/ newxprt->sc_reader = rdma_read_chunk_lcl; - if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { + if (dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) { newxprt->sc_frmr_pg_list_len = - devattr.max_fast_reg_page_list_len; + dev->attrs.max_fast_reg_page_list_len; newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG; newxprt->sc_reader = rdma_read_chunk_frmr; } @@ -1024,44 +1154,16 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt) /* * Determine if a DMA MR is required and if so, what privs are required */ - if (!rdma_protocol_iwarp(newxprt->sc_cm_id->device, - newxprt->sc_cm_id->port_num) && - !rdma_ib_or_roce(newxprt->sc_cm_id->device, - newxprt->sc_cm_id->port_num)) + if (!rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num) && + !rdma_ib_or_roce(dev, newxprt->sc_cm_id->port_num)) goto errout; - if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG) || - !(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) { - need_dma_mr = 1; - dma_mr_acc = IB_ACCESS_LOCAL_WRITE; - if (rdma_protocol_iwarp(newxprt->sc_cm_id->device, - newxprt->sc_cm_id->port_num) && - !(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG)) - dma_mr_acc |= IB_ACCESS_REMOTE_WRITE; - } - - if (rdma_protocol_iwarp(newxprt->sc_cm_id->device, - newxprt->sc_cm_id->port_num)) + if (rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num)) newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV; - /* Create the DMA MR if needed, otherwise, use the DMA LKEY */ - if (need_dma_mr) { - /* Register all of physical memory */ - newxprt->sc_phys_mr = - ib_get_dma_mr(newxprt->sc_pd, dma_mr_acc); - if (IS_ERR(newxprt->sc_phys_mr)) { - dprintk("svcrdma: Failed to create DMA MR ret=%d\n", - ret); - goto errout; - } - newxprt->sc_dma_lkey = newxprt->sc_phys_mr->lkey; - } else - newxprt->sc_dma_lkey = - newxprt->sc_cm_id->device->local_dma_lkey; - /* Post receive buffers */ - for (i = 0; i < newxprt->sc_max_requests; i++) { - ret = svc_rdma_post_recv(newxprt); + for (i = 0; i < newxprt->sc_rq_depth; i++) { + ret = svc_rdma_post_recv(newxprt, GFP_KERNEL); if (ret) { dprintk("svcrdma: failure posting receive buffers\n"); goto errout; @@ -1160,12 +1262,14 @@ static void __svc_rdma_free(struct work_struct *work) { struct svcxprt_rdma *rdma = container_of(work, struct svcxprt_rdma, sc_work); - dprintk("svcrdma: svc_rdma_free(%p)\n", rdma); + struct svc_xprt *xprt = &rdma->sc_xprt; + + dprintk("svcrdma: %s(%p)\n", __func__, rdma); /* We should only be called from kref_put */ - if (atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0) + if (atomic_read(&xprt->xpt_ref.refcount) != 0) pr_err("svcrdma: sc_xprt still in use? (%d)\n", - atomic_read(&rdma->sc_xprt.xpt_ref.refcount)); + atomic_read(&xprt->xpt_ref.refcount)); /* * Destroy queued, but not processed read completions. Note @@ -1193,15 +1297,22 @@ static void __svc_rdma_free(struct work_struct *work) } /* Warn if we leaked a resource or under-referenced */ - if (atomic_read(&rdma->sc_ctxt_used) != 0) + if (rdma->sc_ctxt_used != 0) pr_err("svcrdma: ctxt still in use? (%d)\n", - atomic_read(&rdma->sc_ctxt_used)); + rdma->sc_ctxt_used); if (atomic_read(&rdma->sc_dma_used) != 0) pr_err("svcrdma: dma still in use? 
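/*
 * Aside (illustrative, not part of the patch): the hunks above delete the
 * all-of-physical-memory DMA MR. Since ib_alloc_pd() provides a
 * local_dma_lkey on every PD, each DMA-mapped SGE is keyed the same way
 * and the per-xprt sc_dma_lkey/sc_phys_mr bookkeeping disappears. The
 * resulting pattern, with stub types:
 */
struct sge { unsigned long long addr; unsigned int length; unsigned int lkey; };
struct pd  { unsigned int local_dma_lkey; };

static void fill_sge(struct sge *sge, const struct pd *pd,
		     unsigned long long dma_addr, unsigned int len)
{
	sge->addr   = dma_addr;
	sge->length = len;
	sge->lkey   = pd->local_dma_lkey;  /* was: per-xprt sc_dma_lkey */
}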
(%d)\n", atomic_read(&rdma->sc_dma_used)); - /* De-allocate fastreg mr */ + /* Final put of backchannel client transport */ + if (xprt->xpt_bc_xprt) { + xprt_put(xprt->xpt_bc_xprt); + xprt->xpt_bc_xprt = NULL; + } + rdma_dealloc_frmr_q(rdma); + svc_rdma_destroy_ctxts(rdma); + svc_rdma_destroy_maps(rdma); /* Destroy the QP if present (not a listener) */ if (rdma->sc_qp && !IS_ERR(rdma->sc_qp)) @@ -1213,9 +1324,6 @@ static void __svc_rdma_free(struct work_struct *work) if (rdma->sc_rq_cq && !IS_ERR(rdma->sc_rq_cq)) ib_destroy_cq(rdma->sc_rq_cq); - if (rdma->sc_phys_mr && !IS_ERR(rdma->sc_phys_mr)) - ib_dereg_mr(rdma->sc_phys_mr); - if (rdma->sc_pd && !IS_ERR(rdma->sc_pd)) ib_dealloc_pd(rdma->sc_pd); @@ -1321,7 +1429,9 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, int length; int ret; - p = alloc_page(GFP_KERNEL | __GFP_NOFAIL); + p = alloc_page(GFP_KERNEL); + if (!p) + return; va = page_address(p); /* XDR encode error */ @@ -1341,7 +1451,7 @@ void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp, return; } atomic_inc(&xprt->sc_dma_used); - ctxt->sge[0].lkey = xprt->sc_dma_lkey; + ctxt->sge[0].lkey = xprt->sc_pd->local_dma_lkey; ctxt->sge[0].length = length; /* Prepare SEND WR */ diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c index 740bddcf3488..b1b009f10ea3 100644 --- a/net/sunrpc/xprtrdma/transport.c +++ b/net/sunrpc/xprtrdma/transport.c @@ -63,7 +63,7 @@ */ static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE; -static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; +unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE; static unsigned int xprt_rdma_inline_write_padding; static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR; @@ -143,12 +143,7 @@ static struct ctl_table sunrpc_table[] = { #endif -#define RPCRDMA_BIND_TO (60U * HZ) -#define RPCRDMA_INIT_REEST_TO (5U * HZ) -#define RPCRDMA_MAX_REEST_TO (30U * HZ) -#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ) - -static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */ +static struct rpc_xprt_ops xprt_rdma_procs; /*forward reference */ static void xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap) @@ -174,7 +169,7 @@ xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap) xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA6; } -static void +void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap) { char buf[128]; @@ -203,7 +198,7 @@ xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap) xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma"; } -static void +void xprt_rdma_free_addresses(struct rpc_xprt *xprt) { unsigned int i; @@ -499,7 +494,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size) if (req == NULL) return NULL; - flags = GFP_NOIO | __GFP_NOWARN; + flags = RPCRDMA_DEF_GFP; if (RPC_IS_SWAPPER(task)) flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN; @@ -642,7 +637,7 @@ drop_connection: return -ENOTCONN; /* implies disconnect */ } -static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) +void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq) { struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt); long idle_time = 0; @@ -743,6 +738,11 @@ void xprt_rdma_cleanup(void) rpcrdma_destroy_wq(); frwr_destroy_recovery_wq(); + + rc = xprt_unregister_transport(&xprt_rdma_bc); + if (rc) + dprintk("RPC: %s: 
xprt_unregister(bc) returned %i\n", + __func__, rc); } int xprt_rdma_init(void) @@ -766,6 +766,14 @@ int xprt_rdma_init(void) return rc; } + rc = xprt_register_transport(&xprt_rdma_bc); + if (rc) { + xprt_unregister_transport(&xprt_rdma); + rpcrdma_destroy_wq(); + frwr_destroy_recovery_wq(); + return rc; + } + dprintk("RPCRDMA Module Init, register RPC RDMA transport\n"); dprintk("Defaults:\n"); diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c index 732c71ce5dca..878f1bfb1db9 100644 --- a/net/sunrpc/xprtrdma/verbs.c +++ b/net/sunrpc/xprtrdma/verbs.c @@ -462,7 +462,6 @@ int rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) { struct rpcrdma_ia *ia = &xprt->rx_ia; - struct ib_device_attr *devattr = &ia->ri_devattr; int rc; ia->ri_dma_mr = NULL; @@ -482,16 +481,10 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg) goto out2; } - rc = ib_query_device(ia->ri_device, devattr); - if (rc) { - dprintk("RPC: %s: ib_query_device failed %d\n", - __func__, rc); - goto out3; - } - if (memreg == RPCRDMA_FRMR) { - if (!(devattr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) || - (devattr->max_fast_reg_page_list_len == 0)) { + if (!(ia->ri_device->attrs.device_cap_flags & + IB_DEVICE_MEM_MGT_EXTENSIONS) || + (ia->ri_device->attrs.max_fast_reg_page_list_len == 0)) { dprintk("RPC: %s: FRMR registration " "not supported by HCA\n", __func__); memreg = RPCRDMA_MTHCAFMR; @@ -566,24 +559,23 @@ int rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, struct rpcrdma_create_data_internal *cdata) { - struct ib_device_attr *devattr = &ia->ri_devattr; struct ib_cq *sendcq, *recvcq; struct ib_cq_init_attr cq_attr = {}; unsigned int max_qp_wr; int rc, err; - if (devattr->max_sge < RPCRDMA_MAX_IOVS) { + if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_IOVS) { dprintk("RPC: %s: insufficient sge's available\n", __func__); return -ENOMEM; } - if (devattr->max_qp_wr <= RPCRDMA_BACKWARD_WRS) { + if (ia->ri_device->attrs.max_qp_wr <= RPCRDMA_BACKWARD_WRS) { dprintk("RPC: %s: insufficient wqe's available\n", __func__); return -ENOMEM; } - max_qp_wr = devattr->max_qp_wr - RPCRDMA_BACKWARD_WRS; + max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS; /* check provider's send/recv wr limits */ if (cdata->max_requests > max_qp_wr) @@ -668,11 +660,11 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia, /* Client offers RDMA Read but does not initiate */ ep->rep_remote_cma.initiator_depth = 0; - if (devattr->max_qp_rd_atom > 32) /* arbitrary but <= 255 */ + if (ia->ri_device->attrs.max_qp_rd_atom > 32) /* arbitrary but <= 255 */ ep->rep_remote_cma.responder_resources = 32; else ep->rep_remote_cma.responder_resources = - devattr->max_qp_rd_atom; + ia->ri_device->attrs.max_qp_rd_atom; ep->rep_remote_cma.retry_count = 7; ep->rep_remote_cma.flow_control = 0; diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h index 728101ddc44b..38fe11b09875 100644 --- a/net/sunrpc/xprtrdma/xprt_rdma.h +++ b/net/sunrpc/xprtrdma/xprt_rdma.h @@ -55,6 +55,11 @@ #define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */ #define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */ +#define RPCRDMA_BIND_TO (60U * HZ) +#define RPCRDMA_INIT_REEST_TO (5U * HZ) +#define RPCRDMA_MAX_REEST_TO (30U * HZ) +#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ) + /* * Interface Adapter -- one per transport instance */ @@ -68,7 +73,6 @@ struct rpcrdma_ia { struct completion ri_done; int ri_async_rc; unsigned int ri_max_frmr_depth; - 
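/*
 * Aside (illustrative, not part of the patch): xprt_rdma_init() and
 * xprt_rdma_cleanup() above now register and unregister the backchannel
 * transport class as a pair, unwinding the forward transport (and, in the
 * real code, the workqueues) if the second registration fails. Stub-typed
 * shape of that pairing:
 */
struct xprt_class_stub;
int  register_transport(struct xprt_class_stub *c);
void unregister_transport(struct xprt_class_stub *c);

static int init_transports(struct xprt_class_stub *fwd,
			   struct xprt_class_stub *bc)
{
	int rc = register_transport(fwd);

	if (rc)
		return rc;
	rc = register_transport(bc);
	if (rc)
		unregister_transport(fwd);  /* unwind on failure */
	return rc;
}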
struct ib_device_attr ri_devattr; struct ib_qp_attr ri_qp_attr; struct ib_qp_init_attr ri_qp_init_attr; }; @@ -142,6 +146,8 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb) return (struct rpcrdma_msg *)rb->rg_base; } +#define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN) + /* * struct rpcrdma_rep -- this structure encapsulates state required to recv * and complete a reply, asychronously. It needs several pieces of @@ -309,6 +315,8 @@ struct rpcrdma_buffer { u32 rb_bc_srv_max_requests; spinlock_t rb_reqslock; /* protect rb_allreqs */ struct list_head rb_allreqs; + + u32 rb_bc_max_requests; }; #define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia) @@ -516,6 +524,10 @@ int rpcrdma_marshal_req(struct rpc_rqst *); /* RPC/RDMA module init - xprtrdma/transport.c */ +extern unsigned int xprt_rdma_max_inline_read; +void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap); +void xprt_rdma_free_addresses(struct rpc_xprt *xprt); +void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq); int xprt_rdma_init(void); void xprt_rdma_cleanup(void); @@ -531,11 +543,6 @@ void xprt_rdma_bc_free_rqst(struct rpc_rqst *); void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int); #endif /* CONFIG_SUNRPC_BACKCHANNEL */ -/* Temporary NFS request map cache. Created in svc_rdma.c */ -extern struct kmem_cache *svc_rdma_map_cachep; -/* WR context cache. Created in svc_rdma.c */ -extern struct kmem_cache *svc_rdma_ctxt_cachep; -/* Workqueue created in svc_rdma.c */ -extern struct workqueue_struct *svc_rdma_wq; +extern struct xprt_class xprt_rdma_bc; #endif /* _LINUX_SUNRPC_XPRT_RDMA_H */ diff --git a/security/inode.c b/security/inode.c index 16622aef9bde..28414b0207ce 100644 --- a/security/inode.c +++ b/security/inode.c @@ -99,7 +99,7 @@ struct dentry *securityfs_create_file(const char *name, umode_t mode, dir = d_inode(parent); - mutex_lock(&dir->i_mutex); + inode_lock(dir); dentry = lookup_one_len(name, parent, strlen(name)); if (IS_ERR(dentry)) goto out; @@ -129,14 +129,14 @@ struct dentry *securityfs_create_file(const char *name, umode_t mode, } d_instantiate(dentry, inode); dget(dentry); - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); return dentry; out1: dput(dentry); dentry = ERR_PTR(error); out: - mutex_unlock(&dir->i_mutex); + inode_unlock(dir); simple_release_fs(&mount, &mount_count); return dentry; } @@ -195,7 +195,7 @@ void securityfs_remove(struct dentry *dentry) if (!parent || d_really_is_negative(parent)) return; - mutex_lock(&d_inode(parent)->i_mutex); + inode_lock(d_inode(parent)); if (simple_positive(dentry)) { if (d_is_dir(dentry)) simple_rmdir(d_inode(parent), dentry); @@ -203,7 +203,7 @@ void securityfs_remove(struct dentry *dentry) simple_unlink(d_inode(parent), dentry); dput(dentry); } - mutex_unlock(&d_inode(parent)->i_mutex); + inode_unlock(d_inode(parent)); simple_release_fs(&mount, &mount_count); } EXPORT_SYMBOL_GPL(securityfs_remove); diff --git a/security/integrity/ima/ima_main.c b/security/integrity/ima/ima_main.c index c21f09bf8b99..9d96551d0196 100644 --- a/security/integrity/ima/ima_main.c +++ b/security/integrity/ima/ima_main.c @@ -121,7 +121,7 @@ static void ima_check_last_writer(struct integrity_iint_cache *iint, if (!(mode & FMODE_WRITE)) return; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (atomic_read(&inode->i_writecount) == 1) { if ((iint->version != inode->i_version) || (iint->flags & IMA_NEW_FILE)) { @@ -130,7 +130,7 @@ static void ima_check_last_writer(struct integrity_iint_cache *iint, ima_update_xattr(iint, 
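
The securityfs hunks above (and the IMA/SELinux ones that follow) convert open-coded mutex_lock(&inode->i_mutex) calls to inode_lock()/inode_unlock(). In this kernel series the wrapper is, as far as the conversion suggests, a thin static inline over i_mutex; the point is that call sites name the operation rather than the lock field, so the lock's representation can change later without touching every caller. A userspace sketch of that accessor idea, with a toy struct inode built on POSIX mutexes:

    /* Accessor wrappers hide the lock representation from callers. */
    #include <pthread.h>
    #include <stdio.h>

    struct inode {
        pthread_mutex_t i_mutex;   /* representation detail */
        long i_size;
    };

    static inline void inode_lock(struct inode *inode)
    {
        pthread_mutex_lock(&inode->i_mutex);
    }

    static inline void inode_unlock(struct inode *inode)
    {
        pthread_mutex_unlock(&inode->i_mutex);
    }

    int main(void)
    {
        struct inode ino = { .i_mutex = PTHREAD_MUTEX_INITIALIZER };

        inode_lock(&ino);          /* call sites never mention i_mutex */
        ino.i_size = 4096;
        inode_unlock(&ino);
        printf("size=%ld\n", ino.i_size);
        return 0;
    }
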
file); } } - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } /** @@ -186,7 +186,7 @@ static int process_measurement(struct file *file, int mask, int function, if (action & IMA_FILE_APPRAISE) function = FILE_CHECK; - mutex_lock(&inode->i_mutex); + inode_lock(inode); if (action) { iint = integrity_inode_get(inode); @@ -250,7 +250,7 @@ out_free: if (pathbuf) __putname(pathbuf); out: - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); if ((rc && must_appraise) && (ima_appraise & IMA_APPRAISE_ENFORCE)) return -EACCES; return 0; diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c index 732c1c77dccd..1b1fd27de632 100644 --- a/security/selinux/selinuxfs.c +++ b/security/selinux/selinuxfs.c @@ -380,9 +380,9 @@ static int sel_open_policy(struct inode *inode, struct file *filp) goto err; if (i_size_read(inode) != security_policydb_len()) { - mutex_lock(&inode->i_mutex); + inode_lock(inode); i_size_write(inode, security_policydb_len()); - mutex_unlock(&inode->i_mutex); + inode_unlock(inode); } rc = security_read_policy(&plm->data, &plm->len); diff --git a/sound/core/control.c b/sound/core/control.c index 196a6fe100ca..a85d45595d02 100644 --- a/sound/core/control.c +++ b/sound/core/control.c @@ -1405,6 +1405,8 @@ static int snd_ctl_tlv_ioctl(struct snd_ctl_file *file, return -EFAULT; if (tlv.length < sizeof(unsigned int) * 2) return -EINVAL; + if (!tlv.numid) + return -EINVAL; down_read(&card->controls_rwsem); kctl = snd_ctl_find_numid(card, tlv.numid); if (kctl == NULL) { diff --git a/sound/core/hrtimer.c b/sound/core/hrtimer.c index f845ecf7e172..656d9a9032dc 100644 --- a/sound/core/hrtimer.c +++ b/sound/core/hrtimer.c @@ -90,7 +90,7 @@ static int snd_hrtimer_start(struct snd_timer *t) struct snd_hrtimer *stime = t->private_data; atomic_set(&stime->running, 0); - hrtimer_cancel(&stime->hrt); + hrtimer_try_to_cancel(&stime->hrt); hrtimer_start(&stime->hrt, ns_to_ktime(t->sticks * resolution), HRTIMER_MODE_REL); atomic_set(&stime->running, 1); @@ -101,6 +101,7 @@ static int snd_hrtimer_stop(struct snd_timer *t) { struct snd_hrtimer *stime = t->private_data; atomic_set(&stime->running, 0); + hrtimer_try_to_cancel(&stime->hrt); return 0; } diff --git a/sound/core/pcm_compat.c b/sound/core/pcm_compat.c index b48b434444ed..9630e9f72b7b 100644 --- a/sound/core/pcm_compat.c +++ b/sound/core/pcm_compat.c @@ -255,10 +255,15 @@ static int snd_pcm_ioctl_hw_params_compat(struct snd_pcm_substream *substream, if (! 
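
The snd_ctl_tlv_ioctl() hunk above rejects a user-supplied numid of 0 before taking the controls lock and searching; 0 appears to denote an unassigned id in this interface, so validating early avoids a pointless (and previously mishandled) lookup. A toy sketch of the validate-before-search pattern; the table and lookup below are stand-ins, not ALSA structures.

    /* Early validation of a user-supplied id before table lookup. */
    #include <stdio.h>
    #include <errno.h>

    struct kctl { unsigned int numid; const char *name; };

    static const struct kctl table[] = {
        { 1, "Master" }, { 2, "PCM" },
    };

    static int find_ctl(unsigned int numid, const struct kctl **out)
    {
        size_t i;

        if (!numid)                /* 0: never a valid assigned id */
            return -EINVAL;
        for (i = 0; i < sizeof(table) / sizeof(table[0]); i++) {
            if (table[i].numid == numid) {
                *out = &table[i];
                return 0;
            }
        }
        return -ENOENT;
    }

    int main(void)
    {
        const struct kctl *k;

        printf("numid 0 -> %d\n", find_ctl(0, &k));   /* -EINVAL, no search */
        printf("numid 2 -> %d\n", find_ctl(2, &k));   /* found */
        return 0;
    }
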
(runtime = substream->runtime)) return -ENOTTY; - /* only fifo_size is different, so just copy all */ - data = memdup_user(data32, sizeof(*data32)); - if (IS_ERR(data)) - return PTR_ERR(data); + data = kmalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + /* only fifo_size (RO from userspace) is different, so just copy all */ + if (copy_from_user(data, data32, sizeof(*data32))) { + err = -EFAULT; + goto error; + } if (refine) err = snd_pcm_hw_refine(substream, data); diff --git a/sound/core/seq/seq_compat.c b/sound/core/seq/seq_compat.c index 81f7c109dc46..65175902a68a 100644 --- a/sound/core/seq/seq_compat.c +++ b/sound/core/seq/seq_compat.c @@ -49,11 +49,12 @@ static int snd_seq_call_port_info_ioctl(struct snd_seq_client *client, unsigned struct snd_seq_port_info *data; mm_segment_t fs; - data = memdup_user(data32, sizeof(*data32)); - if (IS_ERR(data)) - return PTR_ERR(data); + data = kmalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; - if (get_user(data->flags, &data32->flags) || + if (copy_from_user(data, data32, sizeof(*data32)) || + get_user(data->flags, &data32->flags) || get_user(data->time_queue, &data32->time_queue)) goto error; data->kernel = NULL; diff --git a/sound/core/timer.c b/sound/core/timer.c index cb25aded5349..af1f68f7e315 100644 --- a/sound/core/timer.c +++ b/sound/core/timer.c @@ -65,6 +65,7 @@ struct snd_timer_user { int qtail; int qused; int queue_size; + bool disconnected; struct snd_timer_read *queue; struct snd_timer_tread *tqueue; spinlock_t qlock; @@ -290,6 +291,9 @@ int snd_timer_open(struct snd_timer_instance **ti, mutex_unlock(®ister_mutex); return -ENOMEM; } + /* take a card refcount for safe disconnection */ + if (timer->card) + get_device(&timer->card->card_dev); timeri->slave_class = tid->dev_sclass; timeri->slave_id = slave_id; if (list_empty(&timer->open_list_head) && timer->hw.open) @@ -359,6 +363,9 @@ int snd_timer_close(struct snd_timer_instance *timeri) } spin_unlock(&timer->lock); spin_unlock_irq(&slave_active_lock); + /* release a card refcount for safe disconnection */ + if (timer->card) + put_device(&timer->card->card_dev); mutex_unlock(®ister_mutex); } out: @@ -474,6 +481,8 @@ int snd_timer_start(struct snd_timer_instance *timeri, unsigned int ticks) timer = timeri->timer; if (timer == NULL) return -EINVAL; + if (timer->card && timer->card->shutdown) + return -ENODEV; spin_lock_irqsave(&timer->lock, flags); timeri->ticks = timeri->cticks = ticks; timeri->pticks = 0; @@ -505,6 +514,10 @@ static int _snd_timer_stop(struct snd_timer_instance *timeri, int event) spin_lock_irqsave(&timer->lock, flags); list_del_init(&timeri->ack_list); list_del_init(&timeri->active_list); + if (timer->card && timer->card->shutdown) { + spin_unlock_irqrestore(&timer->lock, flags); + return 0; + } if ((timeri->flags & SNDRV_TIMER_IFLG_RUNNING) && !(--timer->running)) { timer->hw.stop(timer); @@ -565,6 +578,8 @@ int snd_timer_continue(struct snd_timer_instance *timeri) timer = timeri->timer; if (! 
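
The snd_timer_open()/snd_timer_close() hunks above pin the owning card with get_device() for the lifetime of the handle, so a disconnect cannot free the card while a timer instance still references it. A minimal refcount sketch of the pin-the-parent pattern; the types mimic, but are not, the ALSA structures.

    /* Pin a parent object across the lifetime of a child handle. */
    #include <stdatomic.h>
    #include <stdio.h>
    #include <stdlib.h>

    struct card {
        atomic_int refs;
    };

    static struct card *card_get(struct card *c)
    {
        atomic_fetch_add(&c->refs, 1);
        return c;
    }

    static void card_put(struct card *c)
    {
        if (atomic_fetch_sub(&c->refs, 1) == 1) {
            puts("last ref dropped, freeing card");
            free(c);
        }
    }

    struct timer_handle {
        struct card *card;         /* pinned while the handle is open */
    };

    int main(void)
    {
        struct card *c = calloc(1, sizeof(*c));
        struct timer_handle h;

        atomic_store(&c->refs, 1); /* creator's reference */
        h.card = card_get(c);      /* open: pin the card */
        card_put(c);               /* disconnect drops the creator's ref... */
        card_put(h.card);          /* ...but the card lives until close */
        return 0;
    }
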
timer) return -EINVAL; + if (timer->card && timer->card->shutdown) + return -ENODEV; spin_lock_irqsave(&timer->lock, flags); if (!timeri->cticks) timeri->cticks = 1; @@ -628,6 +643,9 @@ static void snd_timer_tasklet(unsigned long arg) unsigned long resolution, ticks; unsigned long flags; + if (timer->card && timer->card->shutdown) + return; + spin_lock_irqsave(&timer->lock, flags); /* now process all callbacks */ while (!list_empty(&timer->sack_list_head)) { @@ -668,6 +686,9 @@ void snd_timer_interrupt(struct snd_timer * timer, unsigned long ticks_left) if (timer == NULL) return; + if (timer->card && timer->card->shutdown) + return; + spin_lock_irqsave(&timer->lock, flags); /* remember the current resolution */ @@ -881,8 +902,15 @@ static int snd_timer_dev_register(struct snd_device *dev) static int snd_timer_dev_disconnect(struct snd_device *device) { struct snd_timer *timer = device->device_data; + struct snd_timer_instance *ti; + mutex_lock(®ister_mutex); list_del_init(&timer->device_list); + /* wake up pending sleepers */ + list_for_each_entry(ti, &timer->open_list_head, open_list) { + if (ti->disconnect) + ti->disconnect(ti); + } mutex_unlock(®ister_mutex); return 0; } @@ -893,6 +921,8 @@ void snd_timer_notify(struct snd_timer *timer, int event, struct timespec *tstam unsigned long resolution = 0; struct snd_timer_instance *ti, *ts; + if (timer->card && timer->card->shutdown) + return; if (! (timer->hw.flags & SNDRV_TIMER_HW_SLAVE)) return; if (snd_BUG_ON(event < SNDRV_TIMER_EVENT_MSTART || @@ -1051,6 +1081,8 @@ static void snd_timer_proc_read(struct snd_info_entry *entry, mutex_lock(®ister_mutex); list_for_each_entry(timer, &snd_timer_list, device_list) { + if (timer->card && timer->card->shutdown) + continue; switch (timer->tmr_class) { case SNDRV_TIMER_CLASS_GLOBAL: snd_iprintf(buffer, "G%i: ", timer->tmr_device); @@ -1185,6 +1217,14 @@ static void snd_timer_user_ccallback(struct snd_timer_instance *timeri, wake_up(&tu->qchange_sleep); } +static void snd_timer_user_disconnect(struct snd_timer_instance *timeri) +{ + struct snd_timer_user *tu = timeri->callback_data; + + tu->disconnected = true; + wake_up(&tu->qchange_sleep); +} + static void snd_timer_user_tinterrupt(struct snd_timer_instance *timeri, unsigned long resolution, unsigned long ticks) @@ -1558,6 +1598,7 @@ static int snd_timer_user_tselect(struct file *file, ? 
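
The snd_timer_start()/_snd_timer_stop()/interrupt hunks above all gain the same guard: every entry point re-checks a shutdown flag set at disconnect time and fails fast with -ENODEV rather than touching hardware that is gone. A compact sketch of that fail-fast guard, with toy types in place of the kernel's:

    /* Fail-fast shutdown guard at every entry point. */
    #include <stdbool.h>
    #include <stdio.h>
    #include <errno.h>

    struct card  { bool shutdown; };
    struct timer { struct card *card; int running; };

    static int timer_start(struct timer *t)
    {
        if (t->card && t->card->shutdown)
            return -ENODEV;        /* device is gone: don't poke hardware */
        t->running = 1;
        return 0;
    }

    static void card_disconnect(struct card *c)
    {
        c->shutdown = true;        /* all later entry points bail out */
    }

    int main(void)
    {
        struct card c = { .shutdown = false };
        struct timer t = { .card = &c };

        printf("start before disconnect: %d\n", timer_start(&t));
        card_disconnect(&c);
        printf("start after disconnect:  %d\n", timer_start(&t));
        return 0;
    }
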
snd_timer_user_tinterrupt : snd_timer_user_interrupt; tu->timeri->ccallback = snd_timer_user_ccallback; tu->timeri->callback_data = (void *)tu; + tu->timeri->disconnect = snd_timer_user_disconnect; } __err: @@ -1876,6 +1917,10 @@ static ssize_t snd_timer_user_read(struct file *file, char __user *buffer, remove_wait_queue(&tu->qchange_sleep, &wait); + if (tu->disconnected) { + err = -ENODEV; + break; + } if (signal_pending(current)) { err = -ERESTARTSYS; break; @@ -1925,6 +1970,8 @@ static unsigned int snd_timer_user_poll(struct file *file, poll_table * wait) mask = 0; if (tu->qused) mask |= POLLIN | POLLRDNORM; + if (tu->disconnected) + mask |= POLLERR; return mask; } diff --git a/sound/hda/hdac_i915.c b/sound/hda/hdac_i915.c index c50177fb469f..f6854dbd7d8d 100644 --- a/sound/hda/hdac_i915.c +++ b/sound/hda/hdac_i915.c @@ -306,7 +306,7 @@ out_master_del: out_err: kfree(acomp); bus->audio_component = NULL; - dev_err(dev, "failed to add i915 component master (%d)\n", ret); + dev_info(dev, "failed to add i915 component master (%d)\n", ret); return ret; } diff --git a/sound/pci/hda/hda_bind.c b/sound/pci/hda/hda_bind.c index 70671ad65d24..6efadbfb3fe3 100644 --- a/sound/pci/hda/hda_bind.c +++ b/sound/pci/hda/hda_bind.c @@ -174,14 +174,40 @@ static inline bool codec_probed(struct hda_codec *codec) return device_attach(hda_codec_dev(codec)) > 0 && codec->preset; } -/* try to auto-load and bind the codec module */ -static void codec_bind_module(struct hda_codec *codec) +/* try to auto-load codec module */ +static void request_codec_module(struct hda_codec *codec) { #ifdef MODULE char modalias[32]; + const char *mod = NULL; + + switch (codec->probe_id) { + case HDA_CODEC_ID_GENERIC_HDMI: +#if IS_MODULE(CONFIG_SND_HDA_CODEC_HDMI) + mod = "snd-hda-codec-hdmi"; +#endif + break; + case HDA_CODEC_ID_GENERIC: +#if IS_MODULE(CONFIG_SND_HDA_GENERIC) + mod = "snd-hda-codec-generic"; +#endif + break; + default: + snd_hdac_codec_modalias(&codec->core, modalias, sizeof(modalias)); + mod = modalias; + break; + } + + if (mod) + request_module(mod); +#endif /* MODULE */ +} - snd_hdac_codec_modalias(&codec->core, modalias, sizeof(modalias)); - request_module(modalias); +/* try to auto-load and bind the codec module */ +static void codec_bind_module(struct hda_codec *codec) +{ +#ifdef MODULE + request_codec_module(codec); if (codec_probed(codec)) return; #endif @@ -218,17 +244,13 @@ static int codec_bind_generic(struct hda_codec *codec) if (is_likely_hdmi_codec(codec)) { codec->probe_id = HDA_CODEC_ID_GENERIC_HDMI; -#if IS_MODULE(CONFIG_SND_HDA_CODEC_HDMI) - request_module("snd-hda-codec-hdmi"); -#endif + request_codec_module(codec); if (codec_probed(codec)) return 0; } codec->probe_id = HDA_CODEC_ID_GENERIC; -#if IS_MODULE(CONFIG_SND_HDA_GENERIC) - request_module("snd-hda-codec-generic"); -#endif + request_codec_module(codec); if (codec_probed(codec)) return 0; return -ENODEV; diff --git a/sound/pci/hda/hda_intel.c b/sound/pci/hda/hda_intel.c index c0bef11afa7e..256e6cda218f 100644 --- a/sound/pci/hda/hda_intel.c +++ b/sound/pci/hda/hda_intel.c @@ -2078,9 +2078,11 @@ static int azx_probe_continue(struct azx *chip) * for other chips, still continue probing as other * codecs can be on the same link. 
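
The snd_timer_user_read()/poll() hunks above make blocked readers wake and error out with -ENODEV (and poll report POLLERR) when the device disappears, via the disconnect callback that broadcasts on the same wait queue the readers sleep on. A sketch of that wake-on-disconnect loop, using POSIX condition variables in place of the kernel wait-queue machinery:

    /* Reader sleeps for data but is also woken by disconnect. */
    #include <pthread.h>
    #include <stdbool.h>
    #include <stdio.h>
    #include <errno.h>
    #include <unistd.h>

    static pthread_mutex_t lock   = PTHREAD_MUTEX_INITIALIZER;
    static pthread_cond_t  change = PTHREAD_COND_INITIALIZER;
    static int  queued;
    static bool disconnected;

    static int timer_read(void)
    {
        int ret;

        pthread_mutex_lock(&lock);
        while (!queued && !disconnected)
            pthread_cond_wait(&change, &lock);
        if (disconnected)
            ret = -ENODEV;         /* mirrors the read() hunk above */
        else
            ret = queued--;
        pthread_mutex_unlock(&lock);
        return ret;
    }

    static void *disconnect(void *arg)
    {
        (void)arg;
        sleep(1);
        pthread_mutex_lock(&lock);
        disconnected = true;
        pthread_cond_broadcast(&change);   /* wake pending sleepers */
        pthread_mutex_unlock(&lock);
        return NULL;
    }

    int main(void)
    {
        pthread_t t;

        pthread_create(&t, NULL, disconnect, NULL);
        printf("read returned %d\n", timer_read());   /* -ENODEV */
        pthread_join(t, NULL);
        return 0;
    }
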
*/ - if (CONTROLLER_IN_GPU(pci)) + if (CONTROLLER_IN_GPU(pci)) { + dev_err(chip->card->dev, + "HSW/BDW HD-audio HDMI/DP requires binding with gfx driver\n"); goto out_free; - else + } else goto skip_i915; } @@ -2149,9 +2151,17 @@ i915_power_fail: static void azx_remove(struct pci_dev *pci) { struct snd_card *card = pci_get_drvdata(pci); + struct azx *chip; + struct hda_intel *hda; + + if (card) { + /* flush the pending probing work */ + chip = card->private_data; + hda = container_of(chip, struct hda_intel, chip); + flush_work(&hda->probe_work); - if (card) snd_card_free(card); + } } static void azx_shutdown(struct pci_dev *pci) diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c index 8143c0e24a27..33753244f48f 100644 --- a/sound/pci/hda/patch_realtek.c +++ b/sound/pci/hda/patch_realtek.c @@ -6566,6 +6566,7 @@ static const struct snd_pci_quirk alc662_fixup_tbl[] = { SND_PCI_QUIRK(0x1028, 0x069f, "Dell", ALC668_FIXUP_DELL_MIC_NO_PRESENCE), SND_PCI_QUIRK(0x103c, 0x1632, "HP RP5800", ALC662_FIXUP_HP_RP5800), SND_PCI_QUIRK(0x1043, 0x11cd, "Asus N550", ALC662_FIXUP_BASS_1A), + SND_PCI_QUIRK(0x1043, 0x13df, "Asus N550JX", ALC662_FIXUP_BASS_1A), SND_PCI_QUIRK(0x1043, 0x1477, "ASUS N56VZ", ALC662_FIXUP_BASS_MODE4_CHMAP), SND_PCI_QUIRK(0x1043, 0x15a7, "ASUS UX51VZH", ALC662_FIXUP_BASS_16), SND_PCI_QUIRK(0x1043, 0x1b73, "ASUS N55SF", ALC662_FIXUP_BASS_16), diff --git a/sound/spi/at73c213.c b/sound/spi/at73c213.c index 39522367897c..fac7e6eb9529 100644 --- a/sound/spi/at73c213.c +++ b/sound/spi/at73c213.c @@ -221,6 +221,8 @@ static int snd_at73c213_pcm_open(struct snd_pcm_substream *substream) runtime->hw = snd_at73c213_playback_hw; chip->substream = substream; + clk_enable(chip->ssc->clk); + return 0; } @@ -228,6 +230,7 @@ static int snd_at73c213_pcm_close(struct snd_pcm_substream *substream) { struct snd_at73c213 *chip = snd_pcm_substream_chip(substream); chip->substream = NULL; + clk_disable(chip->ssc->clk); return 0; } @@ -897,6 +900,8 @@ static int snd_at73c213_dev_init(struct snd_card *card, chip->card = card; chip->irq = -1; + clk_enable(chip->ssc->clk); + retval = request_irq(irq, snd_at73c213_interrupt, 0, "at73c213", chip); if (retval) { dev_dbg(&chip->spi->dev, "unable to request irq %d\n", irq); @@ -935,6 +940,8 @@ out_irq: free_irq(chip->irq, chip); chip->irq = -1; out: + clk_disable(chip->ssc->clk); + return retval; } @@ -1012,7 +1019,9 @@ static int snd_at73c213_remove(struct spi_device *spi) int retval; /* Stop playback. */ + clk_enable(chip->ssc->clk); ssc_writel(chip->ssc->regs, CR, SSC_BIT(CR_TXDIS)); + clk_disable(chip->ssc->clk); /* Mute sound. 
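
The azx_remove() hunk above flushes the pending probe work before snd_card_free(), closing the race between deferred probing and device removal: the asynchronous work must finish (or be flushed) before the objects it touches are freed. A sketch of that flush-before-free ordering, with a plain thread standing in for the kernel workqueue:

    /* Flush deferred work before tearing down what it uses. */
    #include <pthread.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    struct chip {
        pthread_t probe_work;
        int probed;
    };

    static void *probe_work_fn(void *arg)
    {
        struct chip *c = arg;

        sleep(1);                  /* slow, deferred probing */
        c->probed = 1;
        return NULL;
    }

    int main(void)
    {
        struct chip *c = calloc(1, sizeof(*c));

        pthread_create(&c->probe_work, NULL, probe_work_fn, c);

        /* remove(): flush the pending probe work first ... */
        pthread_join(c->probe_work, NULL);
        printf("probed=%d, now safe to free\n", c->probed);

        /* ... and only then tear the object down. */
        free(c);
        return 0;
    }
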
*/ retval = snd_at73c213_write_reg(chip, DAC_LMPG, 0x3f); @@ -1080,6 +1089,7 @@ static int snd_at73c213_suspend(struct device *dev) struct snd_at73c213 *chip = card->private_data; ssc_writel(chip->ssc->regs, CR, SSC_BIT(CR_TXDIS)); + clk_disable(chip->ssc->clk); clk_disable(chip->board->dac_clk); return 0; @@ -1091,6 +1101,7 @@ static int snd_at73c213_resume(struct device *dev) struct snd_at73c213 *chip = card->private_data; clk_enable(chip->board->dac_clk); + clk_enable(chip->ssc->clk); ssc_writel(chip->ssc->regs, CR, SSC_BIT(CR_TXEN)); return 0; diff --git a/tools/lib/traceevent/event-parse.c b/tools/lib/traceevent/event-parse.c index ea69ce35e902..c3bd294a63d1 100644 --- a/tools/lib/traceevent/event-parse.c +++ b/tools/lib/traceevent/event-parse.c @@ -3746,7 +3746,7 @@ static const struct flag flags[] = { { "NET_TX_SOFTIRQ", 2 }, { "NET_RX_SOFTIRQ", 3 }, { "BLOCK_SOFTIRQ", 4 }, - { "BLOCK_IOPOLL_SOFTIRQ", 5 }, + { "IRQ_POLL_SOFTIRQ", 5 }, { "TASKLET_SOFTIRQ", 6 }, { "SCHED_SOFTIRQ", 7 }, { "HRTIMER_SOFTIRQ", 8 }, diff --git a/tools/perf/util/trace-event-parse.c b/tools/perf/util/trace-event-parse.c index 8ff7d620d942..33b52eaa39db 100644 --- a/tools/perf/util/trace-event-parse.c +++ b/tools/perf/util/trace-event-parse.c @@ -209,7 +209,7 @@ static const struct flag flags[] = { { "NET_TX_SOFTIRQ", 2 }, { "NET_RX_SOFTIRQ", 3 }, { "BLOCK_SOFTIRQ", 4 }, - { "BLOCK_IOPOLL_SOFTIRQ", 5 }, + { "IRQ_POLL_SOFTIRQ", 5 }, { "TASKLET_SOFTIRQ", 6 }, { "SCHED_SOFTIRQ", 7 }, { "HRTIMER_SOFTIRQ", 8 }, |
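
The traceevent and perf hunks above exist because those userspace decoders mirror the kernel's softirq enum by index: when the kernel renames vector 5 from BLOCK_IOPOLL_SOFTIRQ to IRQ_POLL_SOFTIRQ, the tables must be updated verbatim or traces decode with a stale name. A sketch of such an index-keyed name table and lookup (vectors 0 and 1 filled in from the kernel enum, not from the diff):

    /* Softirq vector-to-name table, kept in sync with the kernel enum. */
    #include <stdio.h>

    static const char * const softirq_names[] = {
        "HI_SOFTIRQ",        /* 0 */
        "TIMER_SOFTIRQ",     /* 1 */
        "NET_TX_SOFTIRQ",    /* 2 */
        "NET_RX_SOFTIRQ",    /* 3 */
        "BLOCK_SOFTIRQ",     /* 4 */
        "IRQ_POLL_SOFTIRQ",  /* 5: renamed from BLOCK_IOPOLL_SOFTIRQ */
        "TASKLET_SOFTIRQ",   /* 6 */
        "SCHED_SOFTIRQ",     /* 7 */
        "HRTIMER_SOFTIRQ",   /* 8 */
    };

    static const char *softirq_name(unsigned int vec)
    {
        if (vec >= sizeof(softirq_names) / sizeof(softirq_names[0]))
            return "UNKNOWN";
        return softirq_names[vec];
    }

    int main(void)
    {
        printf("vec 5 decodes as %s\n", softirq_name(5));
        return 0;
    }
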