-rw-r--r--  Documentation/ABI/testing/sysfs-fs-f2fs | 5
-rw-r--r--  Documentation/devicetree/bindings/net/marvell,mvusb.yaml | 29
-rw-r--r--  Documentation/filesystems/f2fs.rst | 4
-rw-r--r--  Documentation/openrisc/openrisc_port.rst | 4
-rw-r--r--  arch/openrisc/Kconfig | 1
-rw-r--r--  arch/openrisc/configs/or1ksim_defconfig | 1
-rw-r--r--  arch/openrisc/configs/simple_smp_defconfig | 1
-rw-r--r--  arch/openrisc/include/uapi/asm/unistd.h | 1
-rw-r--r--  arch/openrisc/kernel/process.c | 18
-rw-r--r--  arch/openrisc/kernel/smp.c | 3
-rw-r--r--  arch/openrisc/kernel/traps.c | 7
-rw-r--r--  arch/parisc/include/asm/spinlock.h | 160
-rw-r--r--  arch/parisc/include/asm/spinlock_types.h | 14
-rw-r--r--  arch/parisc/kernel/alternative.c | 37
-rw-r--r--  arch/parisc/kernel/irq.c | 22
-rw-r--r--  arch/parisc/kernel/syscall.S | 2
-rw-r--r--  arch/parisc/kernel/syscalls/syscalltbl.sh | 4
-rw-r--r--  arch/sparc/include/asm/dma-mapping.h | 15
-rw-r--r--  arch/sparc/kernel/ioport.c | 3
-rw-r--r--  arch/sparc/kernel/of_device_common.c | 1
-rw-r--r--  arch/sparc/mm/io-unit.c | 9
-rw-r--r--  arch/sparc/mm/iommu.c | 15
-rw-r--r--  arch/sparc/mm/mm_32.h | 3
-rw-r--r--  arch/sparc/mm/srmmu.c | 4
-rw-r--r--  arch/um/Kconfig | 9
-rw-r--r--  arch/um/configs/i386_defconfig | 2
-rw-r--r--  arch/um/configs/x86_64_defconfig | 2
-rw-r--r--  arch/um/drivers/Kconfig | 3
-rw-r--r--  arch/um/drivers/net_kern.c | 13
-rw-r--r--  arch/um/drivers/ubd_kern.c | 12
-rw-r--r--  arch/um/drivers/vector_kern.c | 5
-rw-r--r--  arch/um/drivers/vector_user.c | 15
-rw-r--r--  arch/um/drivers/vhost_user.h | 12
-rw-r--r--  arch/um/drivers/virtio_uml.c | 153
-rw-r--r--  arch/um/include/asm/Kbuild | 1
-rw-r--r--  arch/um/include/asm/delay.h | 30
-rw-r--r--  arch/um/include/linux/time-internal.h | 84
-rw-r--r--  arch/um/include/shared/os.h | 1
-rw-r--r--  arch/um/include/shared/timer-internal.h | 76
-rw-r--r--  arch/um/kernel/kmsg_dump.c | 9
-rw-r--r--  arch/um/kernel/process.c | 39
-rw-r--r--  arch/um/kernel/skas/syscall.c | 5
-rw-r--r--  arch/um/kernel/time.c | 538
-rw-r--r--  arch/um/kernel/uml.lds.S | 2
-rw-r--r--  arch/um/os-Linux/file.c | 31
-rw-r--r--  arch/um/os-Linux/time.c | 1
-rw-r--r--  arch/um/os-Linux/umid.c | 5
-rw-r--r--  arch/x86/um/asm/processor.h | 12
-rw-r--r--  drivers/crypto/chelsio/chcr_ktls.c | 1
-rw-r--r--  drivers/ide/ide-scan-pci.c | 8
-rw-r--r--  drivers/mtd/ubi/fastmap-wl.c | 15
-rw-r--r--  drivers/mtd/ubi/ubi-media.h | 2
-rw-r--r--  drivers/mtd/ubi/wl.c | 3
-rw-r--r--  drivers/net/can/slcan.c | 4
-rw-r--r--  drivers/net/dsa/bcm_sf2.c | 9
-rw-r--r--  drivers/net/dsa/mt7530.c | 3
-rw-r--r--  drivers/net/ethernet/aquantia/atlantic/macsec/macsec_api.c | 2
-rw-r--r--  drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c | 3
-rw-r--r--  drivers/net/ethernet/cavium/common/cavium_ptp.h | 2
-rw-r--r--  drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c | 5
-rw-r--r--  drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.c | 23
-rw-r--r--  drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.h | 1
-rw-r--r--  drivers/net/ethernet/faraday/ftgmac100.c | 2
-rw-r--r--  drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c | 18
-rw-r--r--  drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c | 2
-rw-r--r--  drivers/net/ethernet/qlogic/qed/qed_l2.c | 2
-rw-r--r--  drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 31
-rw-r--r--  drivers/net/ethernet/realtek/r8169_main.c | 29
-rw-r--r--  drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c | 2
-rw-r--r--  drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c | 11
-rw-r--r--  drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 17
-rw-r--r--  drivers/net/macsec.c | 3
-rw-r--r--  drivers/net/phy/at803x.c | 4
-rw-r--r--  drivers/net/phy/micrel.c | 7
-rw-r--r--  drivers/net/tun.c | 10
-rw-r--r--  drivers/net/usb/pegasus.c | 38
-rw-r--r--  drivers/net/wimax/i2400m/driver.c | 7
-rw-r--r--  drivers/parisc/eisa.c | 8
-rw-r--r--  drivers/pcmcia/cs_internal.h | 2
-rw-r--r--  drivers/pcmcia/omap_cf.c | 2
-rw-r--r--  drivers/pcmcia/rsrc_nonstatic.c | 6
-rw-r--r--  drivers/pcmcia/sa1100_simpad.c | 6
-rw-r--r--  drivers/pcmcia/soc_common.h | 2
-rw-r--r--  drivers/pcmcia/yenta_socket.c | 10
-rw-r--r--  fs/f2fs/Kconfig | 9
-rw-r--r--  fs/f2fs/checkpoint.c | 42
-rw-r--r--  fs/f2fs/compress.c | 317
-rw-r--r--  fs/f2fs/data.c | 141
-rw-r--r--  fs/f2fs/debug.c | 3
-rw-r--r--  fs/f2fs/dir.c | 16
-rw-r--r--  fs/f2fs/f2fs.h | 206
-rw-r--r--  fs/f2fs/file.c | 91
-rw-r--r--  fs/f2fs/gc.c | 51
-rw-r--r--  fs/f2fs/inode.c | 29
-rw-r--r--  fs/f2fs/namei.c | 12
-rw-r--r--  fs/f2fs/node.c | 33
-rw-r--r--  fs/f2fs/recovery.c | 12
-rw-r--r--  fs/f2fs/segment.c | 54
-rw-r--r--  fs/f2fs/segment.h | 2
-rw-r--r--  fs/f2fs/shrinker.c | 2
-rw-r--r--  fs/f2fs/super.c | 89
-rw-r--r--  fs/f2fs/sysfs.c | 50
-rw-r--r--  fs/f2fs/xattr.c | 67
-rw-r--r--  fs/f2fs/xattr.h | 9
-rw-r--r--  fs/hostfs/hostfs_kern.c | 12
-rw-r--r--  fs/nfs/blocklayout/blocklayout.c | 2
-rw-r--r--  fs/nfs/callback.h | 4
-rw-r--r--  fs/nfs/callback_proc.c | 69
-rw-r--r--  fs/nfs/delegation.c | 319
-rw-r--r--  fs/nfs/dir.c | 79
-rw-r--r--  fs/nfs/direct.c | 197
-rw-r--r--  fs/nfs/filelayout/filelayout.c | 165
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.c | 229
-rw-r--r--  fs/nfs/flexfilelayout/flexfilelayout.h | 2
-rw-r--r--  fs/nfs/fs_context.c | 9
-rw-r--r--  fs/nfs/inode.c | 28
-rw-r--r--  fs/nfs/internal.h | 36
-rw-r--r--  fs/nfs/namespace.c | 67
-rw-r--r--  fs/nfs/nfs4_fs.h | 4
-rw-r--r--  fs/nfs/nfs4file.c | 3
-rw-r--r--  fs/nfs/nfs4namespace.c | 2
-rw-r--r--  fs/nfs/nfs4proc.c | 19
-rw-r--r--  fs/nfs/nfs4state.c | 24
-rw-r--r--  fs/nfs/nfs4trace.h | 8
-rw-r--r--  fs/nfs/nfsroot.c | 2
-rw-r--r--  fs/nfs/nfstrace.h | 1
-rw-r--r--  fs/nfs/pagelist.c | 367
-rw-r--r--  fs/nfs/pnfs.c | 241
-rw-r--r--  fs/nfs/pnfs.h | 143
-rw-r--r--  fs/nfs/pnfs_nfs.c | 514
-rw-r--r--  fs/nfs/read.c | 2
-rw-r--r--  fs/nfs/super.c | 35
-rw-r--r--  fs/nfs/unlink.c | 4
-rw-r--r--  fs/nfs/write.c | 276
-rw-r--r--  fs/ubifs/io.c | 16
-rw-r--r--  fs/ubifs/journal.c | 1
-rw-r--r--  fs/ubifs/orphan.c | 13
-rw-r--r--  include/linux/f2fs_fs.h | 1
-rw-r--r--  include/linux/nfs_fs.h | 1
-rw-r--r--  include/linux/nfs_page.h | 5
-rw-r--r--  include/linux/nfs_xdr.h | 32
-rw-r--r--  include/linux/skbuff.h | 38
-rw-r--r--  include/linux/sunrpc/sched.h | 1
-rw-r--r--  include/linux/sunrpc/xdr.h | 1
-rw-r--r--  include/trace/events/f2fs.h | 3
-rw-r--r--  include/trace/events/rpcrdma.h | 153
-rw-r--r--  include/uapi/linux/um_timetravel.h | 128
-rw-r--r--  net/core/neighbour.c | 10
-rw-r--r--  net/core/sock.c | 2
-rw-r--r--  net/dsa/slave.c | 2
-rw-r--r--  net/ipv6/addrconf.c | 11
-rw-r--r--  net/ipv6/ndisc.c | 4
-rw-r--r--  net/ipv6/rpl.c | 6
-rw-r--r--  net/ipv6/rpl_iptunnel.c | 2
-rw-r--r--  net/mptcp/options.c | 2
-rw-r--r--  net/mptcp/pm.c | 2
-rw-r--r--  net/mptcp/pm_netlink.c | 2
-rw-r--r--  net/mptcp/protocol.c | 109
-rw-r--r--  net/mptcp/protocol.h | 2
-rw-r--r--  net/mptcp/subflow.c | 3
-rw-r--r--  net/mptcp/token.c | 9
-rw-r--r--  net/openvswitch/flow_table.c | 10
-rw-r--r--  net/sched/cls_tcindex.c | 45
-rw-r--r--  net/sunrpc/auth_gss/auth_gss.c | 94
-rw-r--r--  net/sunrpc/clnt.c | 8
-rw-r--r--  net/sunrpc/sched.c | 22
-rw-r--r--  net/sunrpc/xdr.c | 55
-rw-r--r--  net/sunrpc/xprtrdma/backchannel.c | 8
-rw-r--r--  net/sunrpc/xprtrdma/frwr_ops.c | 154
-rw-r--r--  net/sunrpc/xprtrdma/rpc_rdma.c | 32
-rw-r--r--  net/sunrpc/xprtrdma/transport.c | 72
-rw-r--r--  net/sunrpc/xprtrdma/verbs.c | 683
-rw-r--r--  net/sunrpc/xprtsock.c | 2
174 files changed, 4671 insertions, 2983 deletions
diff --git a/Documentation/ABI/testing/sysfs-fs-f2fs b/Documentation/ABI/testing/sysfs-fs-f2fs
index 1a6cd5397129..bd8a0d19abe6 100644
--- a/Documentation/ABI/testing/sysfs-fs-f2fs
+++ b/Documentation/ABI/testing/sysfs-fs-f2fs
@@ -318,3 +318,8 @@ Date: September 2019
Contact: "Hridya Valsaraju" <hridya@google.com>
Description: Average number of valid blocks.
Available when CONFIG_F2FS_STAT_FS=y.
+
+What: /sys/fs/f2fs/<disk>/mounted_time_sec
+Date: February 2020
+Contact: "Jaegeuk Kim" <jaegeuk@kernel.org>
+Description: Show the mounted time in secs of this partition.
diff --git a/Documentation/devicetree/bindings/net/marvell,mvusb.yaml b/Documentation/devicetree/bindings/net/marvell,mvusb.yaml
index 9458f6659be1..68573762294b 100644
--- a/Documentation/devicetree/bindings/net/marvell,mvusb.yaml
+++ b/Documentation/devicetree/bindings/net/marvell,mvusb.yaml
@@ -38,28 +38,27 @@ required:
examples:
- |
/* USB host controller */
- &usb1 {
- mvusb: mdio@1 {
+ usb {
+ #address-cells = <1>;
+ #size-cells = <0>;
+
+ mdio@1 {
compatible = "usb1286,1fa4";
reg = <1>;
#address-cells = <1>;
#size-cells = <0>;
- };
- };
- /* MV88E6390X devboard */
- &mvusb {
- switch@0 {
- compatible = "marvell,mv88e6190";
- status = "ok";
- reg = <0x0>;
+ switch@0 {
+ compatible = "marvell,mv88e6190";
+ reg = <0x0>;
- ports {
- /* Port definitions */
- };
+ ports {
+ /* Port definitions */
+ };
- mdio {
- /* PHY definitions */
+ mdio {
+ /* PHY definitions */
+ };
};
};
};
diff --git a/Documentation/filesystems/f2fs.rst b/Documentation/filesystems/f2fs.rst
index d681203728d7..87d794bc75a4 100644
--- a/Documentation/filesystems/f2fs.rst
+++ b/Documentation/filesystems/f2fs.rst
@@ -243,8 +243,8 @@ checkpoint=%s[:%u[%]] Set to "disable" to turn off checkpointing. Set to "enabl
hide up to all remaining free space. The actual space that
would be unusable can be viewed at /sys/fs/f2fs/<disk>/unusable
This space is reclaimed once checkpoint=enable.
-compress_algorithm=%s Control compress algorithm, currently f2fs supports "lzo"
- and "lz4" algorithm.
+compress_algorithm=%s Control compress algorithm, currently f2fs supports "lzo",
+ "lz4" and "zstd" algorithm.
compress_log_size=%u Support configuring compress cluster size, the size will
be 4KB * (1 << %u), 16KB is minimum size, also it's
default size.
diff --git a/Documentation/openrisc/openrisc_port.rst b/Documentation/openrisc/openrisc_port.rst
index a18747a8d191..4b2c437942a0 100644
--- a/Documentation/openrisc/openrisc_port.rst
+++ b/Documentation/openrisc/openrisc_port.rst
@@ -37,8 +37,8 @@ or Stafford's toolchain build and release scripts.
Build the Linux kernel as usual::
- make ARCH=openrisc defconfig
- make ARCH=openrisc
+ make ARCH=openrisc CROSS_COMPILE="or1k-linux-" defconfig
+ make ARCH=openrisc CROSS_COMPILE="or1k-linux-"
3) Running on FPGA (optional)
diff --git a/arch/openrisc/Kconfig b/arch/openrisc/Kconfig
index 7e94fe37cb2f..8588996165ae 100644
--- a/arch/openrisc/Kconfig
+++ b/arch/openrisc/Kconfig
@@ -16,6 +16,7 @@ config OPENRISC
select HANDLE_DOMAIN_IRQ
select GPIOLIB
select HAVE_ARCH_TRACEHOOK
+ select HAVE_COPY_THREAD_TLS
select SPARSE_IRQ
select GENERIC_IRQ_CHIP
select GENERIC_IRQ_PROBE
diff --git a/arch/openrisc/configs/or1ksim_defconfig b/arch/openrisc/configs/or1ksim_defconfig
index d8ff4f8ffb88..75f2da324d0e 100644
--- a/arch/openrisc/configs/or1ksim_defconfig
+++ b/arch/openrisc/configs/or1ksim_defconfig
@@ -1,4 +1,3 @@
-CONFIG_CROSS_COMPILE="or1k-linux-"
CONFIG_NO_HZ=y
CONFIG_LOG_BUF_SHIFT=14
CONFIG_BLK_DEV_INITRD=y
diff --git a/arch/openrisc/configs/simple_smp_defconfig b/arch/openrisc/configs/simple_smp_defconfig
index 64278992df9c..ff49d868e040 100644
--- a/arch/openrisc/configs/simple_smp_defconfig
+++ b/arch/openrisc/configs/simple_smp_defconfig
@@ -1,4 +1,3 @@
-CONFIG_CROSS_COMPILE="or1k-linux-"
CONFIG_LOCALVERSION="-simple-smp"
CONFIG_NO_HZ=y
CONFIG_LOG_BUF_SHIFT=14
diff --git a/arch/openrisc/include/uapi/asm/unistd.h b/arch/openrisc/include/uapi/asm/unistd.h
index 566f8c4f8047..fae34c60fa88 100644
--- a/arch/openrisc/include/uapi/asm/unistd.h
+++ b/arch/openrisc/include/uapi/asm/unistd.h
@@ -24,6 +24,7 @@
#define __ARCH_WANT_SET_GET_RLIMIT
#define __ARCH_WANT_SYS_FORK
#define __ARCH_WANT_SYS_CLONE
+#define __ARCH_WANT_SYS_CLONE3
#define __ARCH_WANT_TIME32_SYSCALLS
#include <asm-generic/unistd.h>
diff --git a/arch/openrisc/kernel/process.c b/arch/openrisc/kernel/process.c
index b06f84f6676f..6bcdca424e11 100644
--- a/arch/openrisc/kernel/process.c
+++ b/arch/openrisc/kernel/process.c
@@ -117,12 +117,12 @@ void release_thread(struct task_struct *dead_task)
extern asmlinkage void ret_from_fork(void);
/*
- * copy_thread
+ * copy_thread_tls
* @clone_flags: flags
* @usp: user stack pointer or fn for kernel thread
* @arg: arg to fn for kernel thread; always NULL for userspace thread
* @p: the newly created task
- * @regs: CPU context to copy for userspace thread; always NULL for kthread
+ * @tls: the Thread Local Storage pointer for the new process
*
* At the top of a newly initialized kernel stack are two stacked pt_reg
* structures. The first (topmost) is the userspace context of the thread.
@@ -148,8 +148,8 @@ extern asmlinkage void ret_from_fork(void);
*/
int
-copy_thread(unsigned long clone_flags, unsigned long usp,
- unsigned long arg, struct task_struct *p)
+copy_thread_tls(unsigned long clone_flags, unsigned long usp,
+ unsigned long arg, struct task_struct *p, unsigned long tls)
{
struct pt_regs *userregs;
struct pt_regs *kregs;
@@ -179,16 +179,10 @@ copy_thread(unsigned long clone_flags, unsigned long usp,
userregs->sp = usp;
/*
- * For CLONE_SETTLS set "tp" (r10) to the TLS pointer passed to sys_clone.
- *
- * The kernel entry is:
- * int clone (long flags, void *child_stack, int *parent_tid,
- * int *child_tid, struct void *tls)
- *
- * This makes the source r7 in the kernel registers.
+ * For CLONE_SETTLS set "tp" (r10) to the TLS pointer.
*/
if (clone_flags & CLONE_SETTLS)
- userregs->gpr[10] = userregs->gpr[7];
+ userregs->gpr[10] = tls;
userregs->gpr[11] = 0; /* Result from fork() */
diff --git a/arch/openrisc/kernel/smp.c b/arch/openrisc/kernel/smp.c
index 7d518ee8bddc..bd1e660bbc89 100644
--- a/arch/openrisc/kernel/smp.c
+++ b/arch/openrisc/kernel/smp.c
@@ -14,6 +14,7 @@
#include <linux/smp.h>
#include <linux/cpu.h>
#include <linux/sched.h>
+#include <linux/sched/mm.h>
#include <linux/irq.h>
#include <asm/cpuinfo.h>
#include <asm/mmu_context.h>
@@ -113,7 +114,7 @@ asmlinkage __init void secondary_start_kernel(void)
* All kernel threads share the same mm context; grab a
* reference and switch to it.
*/
- atomic_inc(&mm->mm_count);
+ mmgrab(mm);
current->active_mm = mm;
cpumask_set_cpu(cpu, mm_cpumask(mm));
diff --git a/arch/openrisc/kernel/traps.c b/arch/openrisc/kernel/traps.c
index 932a8ec2b520..c11aa2e17ce0 100644
--- a/arch/openrisc/kernel/traps.c
+++ b/arch/openrisc/kernel/traps.c
@@ -55,13 +55,6 @@ void show_stack(struct task_struct *task, unsigned long *esp)
unwind_stack(NULL, esp, print_trace);
}
-void show_trace_task(struct task_struct *tsk)
-{
- /*
- * TODO: SysRq-T trace dump...
- */
-}
-
void show_registers(struct pt_regs *regs)
{
int i;
diff --git a/arch/parisc/include/asm/spinlock.h b/arch/parisc/include/asm/spinlock.h
index 197d2247e4db..70fecb8dc4e2 100644
--- a/arch/parisc/include/asm/spinlock.h
+++ b/arch/parisc/include/asm/spinlock.h
@@ -10,25 +10,34 @@
static inline int arch_spin_is_locked(arch_spinlock_t *x)
{
volatile unsigned int *a = __ldcw_align(x);
+ smp_mb();
return *a == 0;
}
-#define arch_spin_lock(lock) arch_spin_lock_flags(lock, 0)
+static inline void arch_spin_lock(arch_spinlock_t *x)
+{
+ volatile unsigned int *a;
+
+ a = __ldcw_align(x);
+ while (__ldcw(a) == 0)
+ while (*a == 0)
+ cpu_relax();
+}
static inline void arch_spin_lock_flags(arch_spinlock_t *x,
unsigned long flags)
{
volatile unsigned int *a;
+ unsigned long flags_dis;
a = __ldcw_align(x);
- while (__ldcw(a) == 0)
+ while (__ldcw(a) == 0) {
+ local_save_flags(flags_dis);
+ local_irq_restore(flags);
while (*a == 0)
- if (flags & PSW_SM_I) {
- local_irq_enable();
- cpu_relax();
- local_irq_disable();
- } else
- cpu_relax();
+ cpu_relax();
+ local_irq_restore(flags_dis);
+ }
}
#define arch_spin_lock_flags arch_spin_lock_flags
@@ -58,116 +67,93 @@ static inline int arch_spin_trylock(arch_spinlock_t *x)
/*
* Read-write spinlocks, allowing multiple readers but only one writer.
- * Linux rwlocks are unfair to writers; they can be starved for an indefinite
- * time by readers. With care, they can also be taken in interrupt context.
+ * Unfair locking as Writers could be starved indefinitely by Reader(s)
*
- * In the PA-RISC implementation, we have a spinlock and a counter.
- * Readers use the lock to serialise their access to the counter (which
- * records how many readers currently hold the lock).
- * Writers hold the spinlock, preventing any readers or other writers from
- * grabbing the rwlock.
+ * The spinlock itself is contained in @counter and access to it is
+ * serialized with @lock_mutex.
*/
-/* Note that we have to ensure interrupts are disabled in case we're
- * interrupted by some other code that wants to grab the same read lock */
-static __inline__ void arch_read_lock(arch_rwlock_t *rw)
+/* 1 - lock taken successfully */
+static inline int arch_read_trylock(arch_rwlock_t *rw)
{
+ int ret = 0;
unsigned long flags;
- local_irq_save(flags);
- arch_spin_lock_flags(&rw->lock, flags);
- rw->counter++;
- arch_spin_unlock(&rw->lock);
- local_irq_restore(flags);
-}
-/* Note that we have to ensure interrupts are disabled in case we're
- * interrupted by some other code that wants to grab the same read lock */
-static __inline__ void arch_read_unlock(arch_rwlock_t *rw)
-{
- unsigned long flags;
local_irq_save(flags);
- arch_spin_lock_flags(&rw->lock, flags);
- rw->counter--;
- arch_spin_unlock(&rw->lock);
+ arch_spin_lock(&(rw->lock_mutex));
+
+ /*
+ * zero means writer holds the lock exclusively, deny Reader.
+ * Otherwise grant lock to first/subseq reader
+ */
+ if (rw->counter > 0) {
+ rw->counter--;
+ ret = 1;
+ }
+
+ arch_spin_unlock(&(rw->lock_mutex));
local_irq_restore(flags);
+
+ return ret;
}
-/* Note that we have to ensure interrupts are disabled in case we're
- * interrupted by some other code that wants to grab the same read lock */
-static __inline__ int arch_read_trylock(arch_rwlock_t *rw)
+/* 1 - lock taken successfully */
+static inline int arch_write_trylock(arch_rwlock_t *rw)
{
+ int ret = 0;
unsigned long flags;
- retry:
+
local_irq_save(flags);
- if (arch_spin_trylock(&rw->lock)) {
- rw->counter++;
- arch_spin_unlock(&rw->lock);
- local_irq_restore(flags);
- return 1;
+ arch_spin_lock(&(rw->lock_mutex));
+
+ /*
+ * If reader(s) hold lock (lock < __ARCH_RW_LOCK_UNLOCKED__),
+ * deny writer. Otherwise if unlocked grant to writer
+ * Hence the claim that Linux rwlocks are unfair to writers.
+ * (can be starved for an indefinite time by readers).
+ */
+ if (rw->counter == __ARCH_RW_LOCK_UNLOCKED__) {
+ rw->counter = 0;
+ ret = 1;
}
-
+ arch_spin_unlock(&(rw->lock_mutex));
local_irq_restore(flags);
- /* If write-locked, we fail to acquire the lock */
- if (rw->counter < 0)
- return 0;
- /* Wait until we have a realistic chance at the lock */
- while (arch_spin_is_locked(&rw->lock) && rw->counter >= 0)
+ return ret;
+}
+
+static inline void arch_read_lock(arch_rwlock_t *rw)
+{
+ while (!arch_read_trylock(rw))
cpu_relax();
+}
- goto retry;
+static inline void arch_write_lock(arch_rwlock_t *rw)
+{
+ while (!arch_write_trylock(rw))
+ cpu_relax();
}
-/* Note that we have to ensure interrupts are disabled in case we're
- * interrupted by some other code that wants to read_trylock() this lock */
-static __inline__ void arch_write_lock(arch_rwlock_t *rw)
+static inline void arch_read_unlock(arch_rwlock_t *rw)
{
unsigned long flags;
-retry:
- local_irq_save(flags);
- arch_spin_lock_flags(&rw->lock, flags);
- if (rw->counter != 0) {
- arch_spin_unlock(&rw->lock);
- local_irq_restore(flags);
-
- while (rw->counter != 0)
- cpu_relax();
-
- goto retry;
- }
-
- rw->counter = -1; /* mark as write-locked */
- mb();
+ local_irq_save(flags);
+ arch_spin_lock(&(rw->lock_mutex));
+ rw->counter++;
+ arch_spin_unlock(&(rw->lock_mutex));
local_irq_restore(flags);
}
-static __inline__ void arch_write_unlock(arch_rwlock_t *rw)
-{
- rw->counter = 0;
- arch_spin_unlock(&rw->lock);
-}
-
-/* Note that we have to ensure interrupts are disabled in case we're
- * interrupted by some other code that wants to read_trylock() this lock */
-static __inline__ int arch_write_trylock(arch_rwlock_t *rw)
+static inline void arch_write_unlock(arch_rwlock_t *rw)
{
unsigned long flags;
- int result = 0;
local_irq_save(flags);
- if (arch_spin_trylock(&rw->lock)) {
- if (rw->counter == 0) {
- rw->counter = -1;
- result = 1;
- } else {
- /* Read-locked. Oh well. */
- arch_spin_unlock(&rw->lock);
- }
- }
+ arch_spin_lock(&(rw->lock_mutex));
+ rw->counter = __ARCH_RW_LOCK_UNLOCKED__;
+ arch_spin_unlock(&(rw->lock_mutex));
local_irq_restore(flags);
-
- return result;
}
#endif /* __ASM_SPINLOCK_H */
diff --git a/arch/parisc/include/asm/spinlock_types.h b/arch/parisc/include/asm/spinlock_types.h
index 42979c5704dc..ca39ee350c3f 100644
--- a/arch/parisc/include/asm/spinlock_types.h
+++ b/arch/parisc/include/asm/spinlock_types.h
@@ -12,11 +12,19 @@ typedef struct {
#endif
} arch_spinlock_t;
+
+/* counter:
+ * Unlocked : 0x0100_0000
+ * Read lock(s) : 0x00FF_FFFF to 0x01 (Multiple Readers decrement it)
+ * Write lock : 0x0, but only if prior value is "unlocked" 0x0100_0000
+ */
typedef struct {
- arch_spinlock_t lock;
- volatile int counter;
+ arch_spinlock_t lock_mutex;
+ volatile unsigned int counter;
} arch_rwlock_t;
-#define __ARCH_RW_LOCK_UNLOCKED { __ARCH_SPIN_LOCK_UNLOCKED, 0 }
+#define __ARCH_RW_LOCK_UNLOCKED__ 0x01000000
+#define __ARCH_RW_LOCK_UNLOCKED { .lock_mutex = __ARCH_SPIN_LOCK_UNLOCKED, \
+ .counter = __ARCH_RW_LOCK_UNLOCKED__ }
#endif
diff --git a/arch/parisc/kernel/alternative.c b/arch/parisc/kernel/alternative.c
index 3c66d5c4d90d..fa28c4c9f972 100644
--- a/arch/parisc/kernel/alternative.c
+++ b/arch/parisc/kernel/alternative.c
@@ -25,6 +25,22 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
struct alt_instr *entry;
int index = 0, applied = 0;
int num_cpus = num_online_cpus();
+ u32 cond_check;
+
+ cond_check = ALT_COND_ALWAYS |
+ ((num_cpus == 1) ? ALT_COND_NO_SMP : 0) |
+ ((cache_info.dc_size == 0) ? ALT_COND_NO_DCACHE : 0) |
+ ((cache_info.ic_size == 0) ? ALT_COND_NO_ICACHE : 0) |
+ (running_on_qemu ? ALT_COND_RUN_ON_QEMU : 0) |
+ ((split_tlb == 0) ? ALT_COND_NO_SPLIT_TLB : 0) |
+ /*
+ * If the PDC_MODEL capabilities has Non-coherent IO-PDIR bit
+ * set (bit #61, big endian), we have to flush and sync every
+ * time IO-PDIR is changed in Ike/Astro.
+ */
+ (((boot_cpu_data.cpu_type > pcxw_) &&
+ ((boot_cpu_data.pdc.capabilities & PDC_MODEL_IOPDIR_FDC) == 0))
+ ? ALT_COND_NO_IOC_FDC : 0);
for (entry = start; entry < end; entry++, index++) {
@@ -38,29 +54,14 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
WARN_ON(!cond);
- if (cond != ALT_COND_ALWAYS && no_alternatives)
+ if ((cond & ALT_COND_ALWAYS) == 0 && no_alternatives)
continue;
pr_debug("Check %d: Cond 0x%x, Replace %02d instructions @ 0x%px with 0x%08x\n",
index, cond, len, from, replacement);
- if ((cond & ALT_COND_NO_SMP) && (num_cpus != 1))
- continue;
- if ((cond & ALT_COND_NO_DCACHE) && (cache_info.dc_size != 0))
- continue;
- if ((cond & ALT_COND_NO_ICACHE) && (cache_info.ic_size != 0))
- continue;
- if ((cond & ALT_COND_RUN_ON_QEMU) && !running_on_qemu)
- continue;
-
- /*
- * If the PDC_MODEL capabilities has Non-coherent IO-PDIR bit
- * set (bit #61, big endian), we have to flush and sync every
- * time IO-PDIR is changed in Ike/Astro.
- */
- if ((cond & ALT_COND_NO_IOC_FDC) &&
- ((boot_cpu_data.cpu_type <= pcxw_) ||
- (boot_cpu_data.pdc.capabilities & PDC_MODEL_IOPDIR_FDC)))
+ /* Bounce out if none of the conditions are true. */
+ if ((cond & cond_check) == 0)
continue;
/* Want to replace pdtlb by a pdtlb,l instruction? */
diff --git a/arch/parisc/kernel/irq.c b/arch/parisc/kernel/irq.c
index e5fcfb70cc7c..e76c86619949 100644
--- a/arch/parisc/kernel/irq.c
+++ b/arch/parisc/kernel/irq.c
@@ -560,33 +560,23 @@ void do_cpu_irq_mask(struct pt_regs *regs)
goto out;
}
-static struct irqaction timer_action = {
- .handler = timer_interrupt,
- .name = "timer",
- .flags = IRQF_TIMER | IRQF_PERCPU | IRQF_IRQPOLL,
-};
-
-#ifdef CONFIG_SMP
-static struct irqaction ipi_action = {
- .handler = ipi_interrupt,
- .name = "IPI",
- .flags = IRQF_PERCPU,
-};
-#endif
-
static void claim_cpu_irqs(void)
{
+ unsigned long flags = IRQF_TIMER | IRQF_PERCPU | IRQF_IRQPOLL;
int i;
+
for (i = CPU_IRQ_BASE; i <= CPU_IRQ_MAX; i++) {
irq_set_chip_and_handler(i, &cpu_interrupt_type,
handle_percpu_irq);
}
irq_set_handler(TIMER_IRQ, handle_percpu_irq);
- setup_irq(TIMER_IRQ, &timer_action);
+ if (request_irq(TIMER_IRQ, timer_interrupt, flags, "timer", NULL))
+ pr_err("Failed to register timer interrupt\n");
#ifdef CONFIG_SMP
irq_set_handler(IPI_IRQ, handle_percpu_irq);
- setup_irq(IPI_IRQ, &ipi_action);
+ if (request_irq(IPI_IRQ, ipi_interrupt, IRQF_PERCPU, "IPI", NULL))
+ pr_err("Failed to register IPI interrupt\n");
#endif
}
diff --git a/arch/parisc/kernel/syscall.S b/arch/parisc/kernel/syscall.S
index 97ac707c6bff..f05c9d5b6b9e 100644
--- a/arch/parisc/kernel/syscall.S
+++ b/arch/parisc/kernel/syscall.S
@@ -935,7 +935,7 @@ ENTRY(lws_table)
END(lws_table)
/* End of lws table */
-#define __SYSCALL(nr, entry, nargs) ASM_ULONG_INSN entry
+#define __SYSCALL(nr, entry) ASM_ULONG_INSN entry
.align 8
ENTRY(sys_call_table)
.export sys_call_table,data
diff --git a/arch/parisc/kernel/syscalls/syscalltbl.sh b/arch/parisc/kernel/syscalls/syscalltbl.sh
index 45b5bae26240..f7393a7b18aa 100644
--- a/arch/parisc/kernel/syscalls/syscalltbl.sh
+++ b/arch/parisc/kernel/syscalls/syscalltbl.sh
@@ -13,10 +13,10 @@ emit() {
t_entry="$3"
while [ $t_nxt -lt $t_nr ]; do
- printf "__SYSCALL(%s, sys_ni_syscall, )\n" "${t_nxt}"
+ printf "__SYSCALL(%s,sys_ni_syscall)\n" "${t_nxt}"
t_nxt=$((t_nxt+1))
done
- printf "__SYSCALL(%s, %s, )\n" "${t_nxt}" "${t_entry}"
+ printf "__SYSCALL(%s,%s)\n" "${t_nxt}" "${t_entry}"
}
grep -E "^[0-9A-Fa-fXx]+[[:space:]]+${my_abis}" "$in" | sort -n | (
diff --git a/arch/sparc/include/asm/dma-mapping.h b/arch/sparc/include/asm/dma-mapping.h
index ed32845bd2d2..2f051343612e 100644
--- a/arch/sparc/include/asm/dma-mapping.h
+++ b/arch/sparc/include/asm/dma-mapping.h
@@ -2,23 +2,12 @@
#ifndef ___ASM_SPARC_DMA_MAPPING_H
#define ___ASM_SPARC_DMA_MAPPING_H
-#include <asm/cpu_type.h>
-
extern const struct dma_map_ops *dma_ops;
-extern struct bus_type pci_bus_type;
-
static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus)
{
-#ifdef CONFIG_SPARC_LEON
- if (sparc_cpu_model == sparc_leon)
- return NULL;
-#endif
-#if defined(CONFIG_SPARC32) && defined(CONFIG_PCI)
- if (bus == &pci_bus_type)
- return NULL;
-#endif
- return dma_ops;
+ /* sparc32 uses per-device dma_ops */
+ return IS_ENABLED(CONFIG_SPARC64) ? dma_ops : NULL;
}
#endif
diff --git a/arch/sparc/kernel/ioport.c b/arch/sparc/kernel/ioport.c
index e59461d03b9a..d6874c9b639f 100644
--- a/arch/sparc/kernel/ioport.c
+++ b/arch/sparc/kernel/ioport.c
@@ -373,9 +373,6 @@ void arch_sync_dma_for_cpu(phys_addr_t paddr, size_t size,
dma_make_coherent(paddr, PAGE_ALIGN(size));
}
-const struct dma_map_ops *dma_ops;
-EXPORT_SYMBOL(dma_ops);
-
#ifdef CONFIG_PROC_FS
static int sparc_io_proc_show(struct seq_file *m, void *v)
diff --git a/arch/sparc/kernel/of_device_common.c b/arch/sparc/kernel/of_device_common.c
index b32cc5610712..e717a56efc5d 100644
--- a/arch/sparc/kernel/of_device_common.c
+++ b/arch/sparc/kernel/of_device_common.c
@@ -67,6 +67,7 @@ void of_propagate_archdata(struct platform_device *bus)
op->dev.archdata.stc = bus_sd->stc;
op->dev.archdata.host_controller = bus_sd->host_controller;
op->dev.archdata.numa_node = bus_sd->numa_node;
+ op->dev.dma_ops = bus->dev.dma_ops;
if (dp->child)
of_propagate_archdata(op);
diff --git a/arch/sparc/mm/io-unit.c b/arch/sparc/mm/io-unit.c
index 33a0facd9eb5..289276b99b01 100644
--- a/arch/sparc/mm/io-unit.c
+++ b/arch/sparc/mm/io-unit.c
@@ -38,6 +38,8 @@
#define IOPERM (IOUPTE_CACHE | IOUPTE_WRITE | IOUPTE_VALID)
#define MKIOPTE(phys) __iopte((((phys)>>4) & IOUPTE_PAGE) | IOPERM)
+static const struct dma_map_ops iounit_dma_ops;
+
static void __init iounit_iommu_init(struct platform_device *op)
{
struct iounit_struct *iounit;
@@ -70,6 +72,8 @@ static void __init iounit_iommu_init(struct platform_device *op)
xptend = iounit->page_table + (16 * PAGE_SIZE) / sizeof(iopte_t);
for (; xpt < xptend; xpt++)
sbus_writel(0, xpt);
+
+ op->dev.dma_ops = &iounit_dma_ops;
}
static int __init iounit_init(void)
@@ -288,8 +292,3 @@ static const struct dma_map_ops iounit_dma_ops = {
.map_sg = iounit_map_sg,
.unmap_sg = iounit_unmap_sg,
};
-
-void __init ld_mmu_iounit(void)
-{
- dma_ops = &iounit_dma_ops;
-}
diff --git a/arch/sparc/mm/iommu.c b/arch/sparc/mm/iommu.c
index 4d3c6991f0ae..b00dde13681b 100644
--- a/arch/sparc/mm/iommu.c
+++ b/arch/sparc/mm/iommu.c
@@ -54,6 +54,9 @@ static pgprot_t dvma_prot; /* Consistent mapping pte flags */
#define IOPERM (IOPTE_CACHE | IOPTE_WRITE | IOPTE_VALID)
#define MKIOPTE(pfn, perm) (((((pfn)<<8) & IOPTE_PAGE) | (perm)) & ~IOPTE_WAZ)
+static const struct dma_map_ops sbus_iommu_dma_gflush_ops;
+static const struct dma_map_ops sbus_iommu_dma_pflush_ops;
+
static void __init sbus_iommu_init(struct platform_device *op)
{
struct iommu_struct *iommu;
@@ -129,6 +132,11 @@ static void __init sbus_iommu_init(struct platform_device *op)
(int)(IOMMU_NPTES*sizeof(iopte_t)), (int)IOMMU_NPTES);
op->dev.archdata.iommu = iommu;
+
+ if (flush_page_for_dma_global)
+ op->dev.dma_ops = &sbus_iommu_dma_gflush_ops;
+ else
+ op->dev.dma_ops = &sbus_iommu_dma_pflush_ops;
}
static int __init iommu_init(void)
@@ -445,13 +453,6 @@ static const struct dma_map_ops sbus_iommu_dma_pflush_ops = {
void __init ld_mmu_iommu(void)
{
- if (flush_page_for_dma_global) {
- /* flush_page_for_dma flushes everything, no matter of what page is it */
- dma_ops = &sbus_iommu_dma_gflush_ops;
- } else {
- dma_ops = &sbus_iommu_dma_pflush_ops;
- }
-
if (viking_mxcc_present || srmmu_modtype == HyperSparc) {
dvma_prot = __pgprot(SRMMU_CACHE | SRMMU_ET_PTE | SRMMU_PRIV);
ioperm_noc = IOPTE_CACHE | IOPTE_WRITE | IOPTE_VALID;
diff --git a/arch/sparc/mm/mm_32.h b/arch/sparc/mm/mm_32.h
index 0d0b06e952a5..ce750a99eea9 100644
--- a/arch/sparc/mm/mm_32.h
+++ b/arch/sparc/mm/mm_32.h
@@ -20,6 +20,3 @@ void __init srmmu_paging_init(void);
/* iommu.c */
void ld_mmu_iommu(void);
-
-/* io-unit.c */
-void ld_mmu_iounit(void);
diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c
index f56c3c9a9793..b7c94de70cca 100644
--- a/arch/sparc/mm/srmmu.c
+++ b/arch/sparc/mm/srmmu.c
@@ -1865,9 +1865,7 @@ void __init load_mmu(void)
&smp_cachetlb_ops;
#endif
- if (sparc_cpu_model == sun4d)
- ld_mmu_iounit();
- else
+ if (sparc_cpu_model != sun4d)
ld_mmu_iommu();
#ifdef CONFIG_SMP
if (sparc_cpu_model == sun4d)
diff --git a/arch/um/Kconfig b/arch/um/Kconfig
index 0917f8443c28..96ab7026b037 100644
--- a/arch/um/Kconfig
+++ b/arch/um/Kconfig
@@ -62,9 +62,12 @@ config NR_CPUS
source "arch/$(HEADER_ARCH)/um/Kconfig"
+config FORBID_STATIC_LINK
+ bool
+
config STATIC_LINK
bool "Force a static link"
- default n
+ depends on !FORBID_STATIC_LINK
help
This option gives you the ability to force a static link of UML.
Normally, UML is linked as a shared binary. This is inconvenient for
@@ -73,6 +76,9 @@ config STATIC_LINK
Additionally, this option enables using higher memory spaces (up to
2.75G) for UML.
+ NOTE: This option is incompatible with some networking features which
+ depend on features that require being dynamically loaded (like NSS).
+
config LD_SCRIPT_STATIC
bool
default y
@@ -191,6 +197,7 @@ config UML_TIME_TRAVEL_SUPPORT
prompt "Support time-travel mode (e.g. for test execution)"
# inf-cpu mode is incompatible with the benchmarking
depends on !RAID6_PQ_BENCHMARK
+ depends on !SMP
help
Enable this option to support time travel inside the UML instance.
diff --git a/arch/um/configs/i386_defconfig b/arch/um/configs/i386_defconfig
index 73e98bb57bf5..fb51bd206dbe 100644
--- a/arch/um/configs/i386_defconfig
+++ b/arch/um/configs/i386_defconfig
@@ -26,7 +26,7 @@ CONFIG_SLAB=y
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
# CONFIG_BLK_DEV_BSG is not set
-CONFIG_IOSCHED_CFQ=m
+CONFIG_IOSCHED_BFQ=m
CONFIG_SSL=y
CONFIG_NULL_CHAN=y
CONFIG_PORT_CHAN=y
diff --git a/arch/um/configs/x86_64_defconfig b/arch/um/configs/x86_64_defconfig
index 3281d7600225..477b87317424 100644
--- a/arch/um/configs/x86_64_defconfig
+++ b/arch/um/configs/x86_64_defconfig
@@ -24,7 +24,7 @@ CONFIG_SLAB=y
CONFIG_MODULES=y
CONFIG_MODULE_UNLOAD=y
# CONFIG_BLK_DEV_BSG is not set
-CONFIG_IOSCHED_CFQ=m
+CONFIG_IOSCHED_BFQ=m
CONFIG_SSL=y
CONFIG_NULL_CHAN=y
CONFIG_PORT_CHAN=y
diff --git a/arch/um/drivers/Kconfig b/arch/um/drivers/Kconfig
index 72d417055782..9160ead56e33 100644
--- a/arch/um/drivers/Kconfig
+++ b/arch/um/drivers/Kconfig
@@ -234,6 +234,7 @@ config UML_NET_DAEMON
config UML_NET_VECTOR
bool "Vector I/O high performance network devices"
depends on UML_NET
+ select FORBID_STATIC_LINK
help
This User-Mode Linux network driver uses multi-message send
and receive functions. The host running the UML guest must have
@@ -245,6 +246,7 @@ config UML_NET_VECTOR
config UML_NET_VDE
bool "VDE transport (obsolete)"
depends on UML_NET
+ select FORBID_STATIC_LINK
help
This User-Mode Linux network transport allows one or more running
UMLs on a single host to communicate with each other and also
@@ -292,6 +294,7 @@ config UML_NET_MCAST
config UML_NET_PCAP
bool "pcap transport (obsolete)"
depends on UML_NET
+ select FORBID_STATIC_LINK
help
The pcap transport makes a pcap packet stream on the host look
like an ethernet device inside UML. This is useful for making
diff --git a/arch/um/drivers/net_kern.c b/arch/um/drivers/net_kern.c
index 35ebeebfc1a8..1802cf4ef5a5 100644
--- a/arch/um/drivers/net_kern.c
+++ b/arch/um/drivers/net_kern.c
@@ -266,7 +266,6 @@ static void uml_net_get_drvinfo(struct net_device *dev,
struct ethtool_drvinfo *info)
{
strlcpy(info->driver, DRIVER_NAME, sizeof(info->driver));
- strlcpy(info->version, "42", sizeof(info->version));
}
static const struct ethtool_ops uml_net_ethtool_ops = {
@@ -275,17 +274,6 @@ static const struct ethtool_ops uml_net_ethtool_ops = {
.get_ts_info = ethtool_op_get_ts_info,
};
-static void uml_net_user_timer_expire(struct timer_list *t)
-{
-#ifdef undef
- struct uml_net_private *lp = from_timer(lp, t, tl);
- struct connection *conn = &lp->user;
-
- dprintk(KERN_INFO "uml_net_user_timer_expire [%p]\n", conn);
- do_connect(conn);
-#endif
-}
-
void uml_net_setup_etheraddr(struct net_device *dev, char *str)
{
unsigned char *addr = dev->dev_addr;
@@ -456,7 +444,6 @@ static void eth_configure(int n, void *init, char *mac,
.add_address = transport->user->add_address,
.delete_address = transport->user->delete_address });
- timer_setup(&lp->tl, uml_net_user_timer_expire, 0);
spin_lock_init(&lp->lock);
memcpy(lp->mac, dev->dev_addr, sizeof(lp->mac));
diff --git a/arch/um/drivers/ubd_kern.c b/arch/um/drivers/ubd_kern.c
index 247f95da057b..eae8c83364f7 100644
--- a/arch/um/drivers/ubd_kern.c
+++ b/arch/um/drivers/ubd_kern.c
@@ -1592,11 +1592,11 @@ int io_thread(void *arg)
&io_remainder_size,
UBD_REQ_BUFFER_SIZE
);
- if (n < 0) {
- if (n == -EAGAIN) {
+ if (n <= 0) {
+ if (n == -EAGAIN)
ubd_read_poll(-1);
- continue;
- }
+
+ continue;
}
for (count = 0; count < n/sizeof(struct io_thread_req *); count++) {
@@ -1607,7 +1607,9 @@ int io_thread(void *arg)
written = 0;
do {
- res = os_write_file(kernel_fd, ((char *) io_req_buffer) + written, n);
+ res = os_write_file(kernel_fd,
+ ((char *) io_req_buffer) + written,
+ n - written);
if (res >= 0) {
written += res;
}
diff --git a/arch/um/drivers/vector_kern.c b/arch/um/drivers/vector_kern.c
index e98304d0219e..8735c468230a 100644
--- a/arch/um/drivers/vector_kern.c
+++ b/arch/um/drivers/vector_kern.c
@@ -46,7 +46,6 @@
#define DRIVER_NAME "uml-vector"
-#define DRIVER_VERSION "01"
struct vector_cmd_line_arg {
struct list_head list;
int unit;
@@ -198,6 +197,9 @@ static int get_transport_options(struct arglist *def)
long parsed;
int result = 0;
+ if (transport == NULL)
+ return -EINVAL;
+
if (vector != NULL) {
if (kstrtoul(vector, 10, &parsed) == 0) {
if (parsed == 0) {
@@ -1378,7 +1380,6 @@ static void vector_net_get_drvinfo(struct net_device *dev,
struct ethtool_drvinfo *info)
{
strlcpy(info->driver, DRIVER_NAME, sizeof(info->driver));
- strlcpy(info->version, DRIVER_VERSION, sizeof(info->version));
}
static int vector_net_load_bpf_flash(struct net_device *dev,
diff --git a/arch/um/drivers/vector_user.c b/arch/um/drivers/vector_user.c
index ddcd917be0af..aa28e9eecb7b 100644
--- a/arch/um/drivers/vector_user.c
+++ b/arch/um/drivers/vector_user.c
@@ -221,8 +221,7 @@ static struct vector_fds *user_init_tap_fds(struct arglist *ifspec)
return result;
tap_cleanup:
printk(UM_KERN_ERR "user_init_tap: init failed, error %d", fd);
- if (result != NULL)
- kfree(result);
+ kfree(result);
return NULL;
}
@@ -266,8 +265,7 @@ static struct vector_fds *user_init_hybrid_fds(struct arglist *ifspec)
return result;
hybrid_cleanup:
printk(UM_KERN_ERR "user_init_hybrid: init failed");
- if (result != NULL)
- kfree(result);
+ kfree(result);
return NULL;
}
@@ -344,10 +342,8 @@ static struct vector_fds *user_init_unix_fds(struct arglist *ifspec, int id)
unix_cleanup:
if (fd >= 0)
os_close_file(fd);
- if (remote_addr != NULL)
- kfree(remote_addr);
- if (result != NULL)
- kfree(result);
+ kfree(remote_addr);
+ kfree(result);
return NULL;
}
@@ -382,8 +378,7 @@ static struct vector_fds *user_init_raw_fds(struct arglist *ifspec)
return result;
raw_cleanup:
printk(UM_KERN_ERR "user_init_raw: init failed, error %d", err);
- if (result != NULL)
- kfree(result);
+ kfree(result);
return NULL;
}
diff --git a/arch/um/drivers/vhost_user.h b/arch/um/drivers/vhost_user.h
index 45ff5ea22fea..6c71b6005177 100644
--- a/arch/um/drivers/vhost_user.h
+++ b/arch/um/drivers/vhost_user.h
@@ -10,9 +10,10 @@
/* Feature bits */
#define VHOST_USER_F_PROTOCOL_FEATURES 30
/* Protocol feature bits */
-#define VHOST_USER_PROTOCOL_F_REPLY_ACK 3
-#define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5
-#define VHOST_USER_PROTOCOL_F_CONFIG 9
+#define VHOST_USER_PROTOCOL_F_REPLY_ACK 3
+#define VHOST_USER_PROTOCOL_F_SLAVE_REQ 5
+#define VHOST_USER_PROTOCOL_F_CONFIG 9
+#define VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS 14
/* Vring state index masks */
#define VHOST_USER_VRING_INDEX_MASK 0xff
#define VHOST_USER_VRING_POLL_MASK BIT(8)
@@ -24,7 +25,8 @@
/* Supported protocol features */
#define VHOST_USER_SUPPORTED_PROTOCOL_F (BIT_ULL(VHOST_USER_PROTOCOL_F_REPLY_ACK) | \
BIT_ULL(VHOST_USER_PROTOCOL_F_SLAVE_REQ) | \
- BIT_ULL(VHOST_USER_PROTOCOL_F_CONFIG))
+ BIT_ULL(VHOST_USER_PROTOCOL_F_CONFIG) | \
+ BIT_ULL(VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS))
enum vhost_user_request {
VHOST_USER_GET_FEATURES = 1,
@@ -52,12 +54,14 @@ enum vhost_user_request {
VHOST_USER_SET_VRING_ENDIAN = 23,
VHOST_USER_GET_CONFIG = 24,
VHOST_USER_SET_CONFIG = 25,
+ VHOST_USER_VRING_KICK = 35,
};
enum vhost_user_slave_request {
VHOST_USER_SLAVE_IOTLB_MSG = 1,
VHOST_USER_SLAVE_CONFIG_CHANGE_MSG = 2,
VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG = 3,
+ VHOST_USER_SLAVE_VRING_CALL = 4,
};
struct vhost_user_header {
diff --git a/arch/um/drivers/virtio_uml.c b/arch/um/drivers/virtio_uml.c
index 023ced2250ea..be54d368e73d 100644
--- a/arch/um/drivers/virtio_uml.c
+++ b/arch/um/drivers/virtio_uml.c
@@ -26,6 +26,7 @@
#include <linux/virtio.h>
#include <linux/virtio_config.h>
#include <linux/virtio_ring.h>
+#include <linux/time-internal.h>
#include <shared/as-layout.h>
#include <irq_kern.h>
#include <init.h>
@@ -53,6 +54,7 @@ struct virtio_uml_device {
struct virtio_device vdev;
struct platform_device *pdev;
+ spinlock_t sock_lock;
int sock, req_fd;
u64 features;
u64 protocol_features;
@@ -63,6 +65,11 @@ struct virtio_uml_device {
struct virtio_uml_vq_info {
int kick_fd, call_fd;
char name[32];
+#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT
+ struct virtqueue *vq;
+ vq_callback_t *callback;
+ struct time_travel_event defer;
+#endif
};
extern unsigned long long physmem_size, highmem;
@@ -117,10 +124,27 @@ static int vhost_user_recv_header(int fd, struct vhost_user_msg *msg)
static int vhost_user_recv(struct virtio_uml_device *vu_dev,
int fd, struct vhost_user_msg *msg,
- size_t max_payload_size)
+ size_t max_payload_size, bool wait)
{
size_t size;
- int rc = vhost_user_recv_header(fd, msg);
+ int rc;
+
+ /*
+ * In virtio time-travel mode, we're handling all the vhost-user
+ * FDs by polling them whenever appropriate. However, we may get
+ * into a situation where we're sending out an interrupt message
+ * to a device (e.g. a net device) and need to handle a simulation
+ * time message while doing so, e.g. one that tells us to update
+ * our idea of how long we can run without scheduling.
+ *
+ * Thus, we need to not just read() from the given fd, but need
+ * to also handle messages for the simulation time - this function
+ * does that for us while waiting for the given fd to be readable.
+ */
+ if (wait)
+ time_travel_wait_readable(fd);
+
+ rc = vhost_user_recv_header(fd, msg);
if (rc == -ECONNRESET && vu_dev->registered) {
struct virtio_uml_platform_data *pdata;
@@ -142,7 +166,8 @@ static int vhost_user_recv_resp(struct virtio_uml_device *vu_dev,
struct vhost_user_msg *msg,
size_t max_payload_size)
{
- int rc = vhost_user_recv(vu_dev, vu_dev->sock, msg, max_payload_size);
+ int rc = vhost_user_recv(vu_dev, vu_dev->sock, msg,
+ max_payload_size, true);
if (rc)
return rc;
@@ -172,7 +197,8 @@ static int vhost_user_recv_req(struct virtio_uml_device *vu_dev,
struct vhost_user_msg *msg,
size_t max_payload_size)
{
- int rc = vhost_user_recv(vu_dev, vu_dev->req_fd, msg, max_payload_size);
+ int rc = vhost_user_recv(vu_dev, vu_dev->req_fd, msg,
+ max_payload_size, false);
if (rc)
return rc;
@@ -189,6 +215,7 @@ static int vhost_user_send(struct virtio_uml_device *vu_dev,
int *fds, size_t num_fds)
{
size_t size = sizeof(msg->header) + msg->header.size;
+ unsigned long flags;
bool request_ack;
int rc;
@@ -207,24 +234,28 @@ static int vhost_user_send(struct virtio_uml_device *vu_dev,
if (request_ack)
msg->header.flags |= VHOST_USER_FLAG_NEED_REPLY;
+ spin_lock_irqsave(&vu_dev->sock_lock, flags);
rc = full_sendmsg_fds(vu_dev->sock, msg, size, fds, num_fds);
if (rc < 0)
- return rc;
+ goto out;
if (request_ack) {
uint64_t status;
rc = vhost_user_recv_u64(vu_dev, &status);
if (rc)
- return rc;
+ goto out;
if (status) {
vu_err(vu_dev, "slave reports error: %llu\n", status);
- return -EIO;
+ rc = -EIO;
+ goto out;
}
}
- return 0;
+out:
+ spin_unlock_irqrestore(&vu_dev->sock_lock, flags);
+ return rc;
}
static int vhost_user_send_no_payload(struct virtio_uml_device *vu_dev,
@@ -324,6 +355,7 @@ static void vhost_user_reply(struct virtio_uml_device *vu_dev,
static irqreturn_t vu_req_interrupt(int irq, void *data)
{
struct virtio_uml_device *vu_dev = data;
+ struct virtqueue *vq;
int response = 1;
struct {
struct vhost_user_msg msg;
@@ -343,6 +375,15 @@ static irqreturn_t vu_req_interrupt(int irq, void *data)
virtio_config_changed(&vu_dev->vdev);
response = 0;
break;
+ case VHOST_USER_SLAVE_VRING_CALL:
+ virtio_device_for_each_vq((&vu_dev->vdev), vq) {
+ if (vq->index == msg.msg.payload.vring_state.index) {
+ response = 0;
+ vring_interrupt(0 /* ignored */, vq);
+ break;
+ }
+ }
+ break;
case VHOST_USER_SLAVE_IOTLB_MSG:
/* not supported - VIRTIO_F_IOMMU_PLATFORM */
case VHOST_USER_SLAVE_VRING_HOST_NOTIFIER_MSG:
@@ -684,6 +725,17 @@ static bool vu_notify(struct virtqueue *vq)
const uint64_t n = 1;
int rc;
+ time_travel_propagate_time();
+
+ if (info->kick_fd < 0) {
+ struct virtio_uml_device *vu_dev;
+
+ vu_dev = to_virtio_uml_device(vq->vdev);
+
+ return vhost_user_set_vring_state(vu_dev, VHOST_USER_VRING_KICK,
+ vq->index, 0) == 0;
+ }
+
do {
rc = os_write_file(info->kick_fd, &n, sizeof(n));
} while (rc == -EINTR);
@@ -749,10 +801,13 @@ static void vu_del_vq(struct virtqueue *vq)
{
struct virtio_uml_vq_info *info = vq->priv;
- um_free_irq(VIRTIO_IRQ, vq);
+ if (info->call_fd >= 0) {
+ um_free_irq(VIRTIO_IRQ, vq);
+ os_close_file(info->call_fd);
+ }
- os_close_file(info->call_fd);
- os_close_file(info->kick_fd);
+ if (info->kick_fd >= 0)
+ os_close_file(info->kick_fd);
vring_del_virtqueue(vq);
kfree(info);
@@ -782,6 +837,15 @@ static int vu_setup_vq_call_fd(struct virtio_uml_device *vu_dev,
int call_fds[2];
int rc;
+ /* no call FD needed/desired in this case */
+ if (vu_dev->protocol_features &
+ BIT_ULL(VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS) &&
+ vu_dev->protocol_features &
+ BIT_ULL(VHOST_USER_PROTOCOL_F_SLAVE_REQ)) {
+ info->call_fd = -1;
+ return 0;
+ }
+
/* Use a pipe for call fd, since SIGIO is not supported for eventfd */
rc = os_pipe(call_fds, true, true);
if (rc < 0)
@@ -810,6 +874,23 @@ out:
return rc;
}
+#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT
+static void vu_defer_irq_handle(struct time_travel_event *d)
+{
+ struct virtio_uml_vq_info *info;
+
+ info = container_of(d, struct virtio_uml_vq_info, defer);
+ info->callback(info->vq);
+}
+
+static void vu_defer_irq_callback(struct virtqueue *vq)
+{
+ struct virtio_uml_vq_info *info = vq->priv;
+
+ time_travel_add_irq_event(&info->defer);
+}
+#endif
+
static struct virtqueue *vu_setup_vq(struct virtio_device *vdev,
unsigned index, vq_callback_t *callback,
const char *name, bool ctx)
@@ -829,6 +910,19 @@ static struct virtqueue *vu_setup_vq(struct virtio_device *vdev,
snprintf(info->name, sizeof(info->name), "%s.%d-%s", pdev->name,
pdev->id, name);
+#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT
+ /*
+ * When we get an interrupt, we must bounce it through the simulation
+ * calendar (the simtime device), except for the simtime device itself
+ * since that's part of the simulation control.
+ */
+ if (time_travel_mode == TT_MODE_EXTERNAL && callback) {
+ info->callback = callback;
+ callback = vu_defer_irq_callback;
+ time_travel_set_event_fn(&info->defer, vu_defer_irq_handle);
+ }
+#endif
+
vq = vring_create_virtqueue(index, num, PAGE_SIZE, vdev, true, true,
ctx, vu_notify, callback, info->name);
if (!vq) {
@@ -837,11 +931,19 @@ static struct virtqueue *vu_setup_vq(struct virtio_device *vdev,
}
vq->priv = info;
num = virtqueue_get_vring_size(vq);
+#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT
+ info->vq = vq;
+#endif
- rc = os_eventfd(0, 0);
- if (rc < 0)
- goto error_kick;
- info->kick_fd = rc;
+ if (vu_dev->protocol_features &
+ BIT_ULL(VHOST_USER_PROTOCOL_F_INBAND_NOTIFICATIONS)) {
+ info->kick_fd = -1;
+ } else {
+ rc = os_eventfd(0, 0);
+ if (rc < 0)
+ goto error_kick;
+ info->kick_fd = rc;
+ }
rc = vu_setup_vq_call_fd(vu_dev, vq);
if (rc)
@@ -866,10 +968,13 @@ static struct virtqueue *vu_setup_vq(struct virtio_device *vdev,
return vq;
error_setup:
- um_free_irq(VIRTIO_IRQ, vq);
- os_close_file(info->call_fd);
+ if (info->call_fd >= 0) {
+ um_free_irq(VIRTIO_IRQ, vq);
+ os_close_file(info->call_fd);
+ }
error_call:
- os_close_file(info->kick_fd);
+ if (info->kick_fd >= 0)
+ os_close_file(info->kick_fd);
error_kick:
vring_del_virtqueue(vq);
error_create:
@@ -908,10 +1013,12 @@ static int vu_find_vqs(struct virtio_device *vdev, unsigned nvqs,
list_for_each_entry(vq, &vdev->vqs, list) {
struct virtio_uml_vq_info *info = vq->priv;
- rc = vhost_user_set_vring_kick(vu_dev, vq->index,
- info->kick_fd);
- if (rc)
- goto error_setup;
+ if (info->kick_fd >= 0) {
+ rc = vhost_user_set_vring_kick(vu_dev, vq->index,
+ info->kick_fd);
+ if (rc)
+ goto error_setup;
+ }
rc = vhost_user_set_vring_enable(vu_dev, vq->index, true);
if (rc)
@@ -1008,6 +1115,8 @@ static int virtio_uml_probe(struct platform_device *pdev)
return rc;
vu_dev->sock = rc;
+ spin_lock_init(&vu_dev->sock_lock);
+
rc = vhost_user_init(vu_dev);
if (rc)
goto error_init;
diff --git a/arch/um/include/asm/Kbuild b/arch/um/include/asm/Kbuild
index db7d9d4e30d8..8d435f8a6dec 100644
--- a/arch/um/include/asm/Kbuild
+++ b/arch/um/include/asm/Kbuild
@@ -3,7 +3,6 @@ generic-y += bpf_perf_event.h
generic-y += bug.h
generic-y += compat.h
generic-y += current.h
-generic-y += delay.h
generic-y += device.h
generic-y += emergency-restart.h
generic-y += exec.h
diff --git a/arch/um/include/asm/delay.h b/arch/um/include/asm/delay.h
new file mode 100644
index 000000000000..56fc2b8f2dd0
--- /dev/null
+++ b/arch/um/include/asm/delay.h
@@ -0,0 +1,30 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef __UM_DELAY_H
+#define __UM_DELAY_H
+#include <asm-generic/delay.h>
+#include <linux/time-internal.h>
+
+static inline void um_ndelay(unsigned long nsecs)
+{
+ if (time_travel_mode == TT_MODE_INFCPU ||
+ time_travel_mode == TT_MODE_EXTERNAL) {
+ time_travel_ndelay(nsecs);
+ return;
+ }
+ ndelay(nsecs);
+}
+#undef ndelay
+#define ndelay um_ndelay
+
+static inline void um_udelay(unsigned long usecs)
+{
+ if (time_travel_mode == TT_MODE_INFCPU ||
+ time_travel_mode == TT_MODE_EXTERNAL) {
+ time_travel_ndelay(1000 * usecs);
+ return;
+ }
+ udelay(usecs);
+}
+#undef udelay
+#define udelay um_udelay
+#endif /* __UM_DELAY_H */
diff --git a/arch/um/include/linux/time-internal.h b/arch/um/include/linux/time-internal.h
new file mode 100644
index 000000000000..f3b03d39a854
--- /dev/null
+++ b/arch/um/include/linux/time-internal.h
@@ -0,0 +1,84 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+/*
+ * Copyright (C) 2012 - 2014 Cisco Systems
+ * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ */
+
+#ifndef __TIMER_INTERNAL_H__
+#define __TIMER_INTERNAL_H__
+#include <linux/list.h>
+
+#define TIMER_MULTIPLIER 256
+#define TIMER_MIN_DELTA 500
+
+enum time_travel_mode {
+ TT_MODE_OFF,
+ TT_MODE_BASIC,
+ TT_MODE_INFCPU,
+ TT_MODE_EXTERNAL,
+};
+
+#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT
+struct time_travel_event {
+ unsigned long long time;
+ void (*fn)(struct time_travel_event *d);
+ struct list_head list;
+ bool pending, onstack;
+};
+
+extern enum time_travel_mode time_travel_mode;
+
+void time_travel_sleep(unsigned long long duration);
+
+static inline void
+time_travel_set_event_fn(struct time_travel_event *e,
+ void (*fn)(struct time_travel_event *d))
+{
+ e->fn = fn;
+}
+
+void __time_travel_propagate_time(void);
+
+static inline void time_travel_propagate_time(void)
+{
+ if (time_travel_mode == TT_MODE_EXTERNAL)
+ __time_travel_propagate_time();
+}
+
+void __time_travel_wait_readable(int fd);
+
+static inline void time_travel_wait_readable(int fd)
+{
+ if (time_travel_mode == TT_MODE_EXTERNAL)
+ __time_travel_wait_readable(fd);
+}
+
+void time_travel_add_irq_event(struct time_travel_event *e);
+#else
+struct time_travel_event {
+};
+
+#define time_travel_mode TT_MODE_OFF
+
+static inline void time_travel_sleep(unsigned long long duration)
+{
+}
+
+/* this is a macro so the event/function need not exist */
+#define time_travel_set_event_fn(e, fn) do {} while (0)
+
+static inline void time_travel_propagate_time(void)
+{
+}
+
+static inline void time_travel_wait_readable(int fd)
+{
+}
+#endif /* CONFIG_UML_TIME_TRAVEL_SUPPORT */
+
+/*
+ * Without CONFIG_UML_TIME_TRAVEL_SUPPORT this is a linker error if used,
+ * which is intentional since we really shouldn't link it in that case.
+ */
+void time_travel_ndelay(unsigned long nsec);
+#endif /* __TIMER_INTERNAL_H__ */
diff --git a/arch/um/include/shared/os.h b/arch/um/include/shared/os.h
index 0f30204b6afa..f467d28fc0b4 100644
--- a/arch/um/include/shared/os.h
+++ b/arch/um/include/shared/os.h
@@ -181,6 +181,7 @@ extern int os_falloc_punch(int fd, unsigned long long offset, int count);
extern int os_eventfd(unsigned int initval, int flags);
extern int os_sendmsg_fds(int fd, const void *buf, unsigned int len,
const int *fds, unsigned int fds_num);
+int os_poll(unsigned int n, const int *fds);
/* start_up.c */
extern void os_early_checks(void);
diff --git a/arch/um/include/shared/timer-internal.h b/arch/um/include/shared/timer-internal.h
deleted file mode 100644
index 2d2d13c9b46f..000000000000
--- a/arch/um/include/shared/timer-internal.h
+++ /dev/null
@@ -1,76 +0,0 @@
-/* SPDX-License-Identifier: GPL-2.0 */
-/*
- * Copyright (C) 2012 - 2014 Cisco Systems
- * Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
- */
-
-#ifndef __TIMER_INTERNAL_H__
-#define __TIMER_INTERNAL_H__
-
-#define TIMER_MULTIPLIER 256
-#define TIMER_MIN_DELTA 500
-
-enum time_travel_mode {
- TT_MODE_OFF,
- TT_MODE_BASIC,
- TT_MODE_INFCPU,
-};
-
-enum time_travel_timer_mode {
- TT_TMR_DISABLED,
- TT_TMR_ONESHOT,
- TT_TMR_PERIODIC,
-};
-
-#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT
-extern enum time_travel_mode time_travel_mode;
-extern unsigned long long time_travel_time;
-extern enum time_travel_timer_mode time_travel_timer_mode;
-extern unsigned long long time_travel_timer_expiry;
-extern unsigned long long time_travel_timer_interval;
-
-static inline void time_travel_set_time(unsigned long long ns)
-{
- time_travel_time = ns;
-}
-
-static inline void time_travel_set_timer_mode(enum time_travel_timer_mode mode)
-{
- time_travel_timer_mode = mode;
-}
-
-static inline void time_travel_set_timer_expiry(unsigned long long expiry)
-{
- time_travel_timer_expiry = expiry;
-}
-
-static inline void time_travel_set_timer_interval(unsigned long long interval)
-{
- time_travel_timer_interval = interval;
-}
-#else
-#define time_travel_mode TT_MODE_OFF
-#define time_travel_time 0
-#define time_travel_timer_expiry 0
-#define time_travel_timer_interval 0
-
-static inline void time_travel_set_time(unsigned long long ns)
-{
-}
-
-static inline void time_travel_set_timer_mode(enum time_travel_timer_mode mode)
-{
-}
-
-static inline void time_travel_set_timer_expiry(unsigned long long expiry)
-{
-}
-
-static inline void time_travel_set_timer_interval(unsigned long long interval)
-{
-}
-
-#define time_travel_timer_mode TT_TMR_DISABLED
-#endif
-
-#endif
diff --git a/arch/um/kernel/kmsg_dump.c b/arch/um/kernel/kmsg_dump.c
index 98bdf69e4c2e..e4abac6c9727 100644
--- a/arch/um/kernel/kmsg_dump.c
+++ b/arch/um/kernel/kmsg_dump.c
@@ -9,20 +9,19 @@ static void kmsg_dumper_stdout(struct kmsg_dumper *dumper,
enum kmsg_dump_reason reason)
{
static char line[1024];
-
+ struct console *con;
size_t len = 0;
- bool con_available = false;
/* only dump kmsg when no console is available */
if (!console_trylock())
return;
- if (console_drivers != NULL)
- con_available = true;
+ for_each_console(con)
+ break;
console_unlock();
- if (con_available == true)
+ if (con)
return;
printf("kmsg_dump:\n");
diff --git a/arch/um/kernel/process.c b/arch/um/kernel/process.c
index 56a094182bf5..cbe33af2a880 100644
--- a/arch/um/kernel/process.c
+++ b/arch/um/kernel/process.c
@@ -32,7 +32,7 @@
#include <kern_util.h>
#include <os.h>
#include <skas.h>
-#include <timer-internal.h>
+#include <linux/time-internal.h>
/*
* This is a per-cpu array. A processor only modifies its entry and it only
@@ -203,43 +203,6 @@ void initial_thread_cb(void (*proc)(void *), void *arg)
kmalloc_ok = save_kmalloc_ok;
}
-static void time_travel_sleep(unsigned long long duration)
-{
- unsigned long long next = time_travel_time + duration;
-
- if (time_travel_mode != TT_MODE_INFCPU)
- os_timer_disable();
-
- while (time_travel_timer_mode == TT_TMR_PERIODIC &&
- time_travel_timer_expiry < time_travel_time)
- time_travel_set_timer_expiry(time_travel_timer_expiry +
- time_travel_timer_interval);
-
- if (time_travel_timer_mode != TT_TMR_DISABLED &&
- time_travel_timer_expiry < next) {
- if (time_travel_timer_mode == TT_TMR_ONESHOT)
- time_travel_set_timer_mode(TT_TMR_DISABLED);
- /*
- * In basic mode, time_travel_time will be adjusted in
- * the timer IRQ handler so it works even when the signal
- * comes from the OS timer, see there.
- */
- if (time_travel_mode != TT_MODE_BASIC)
- time_travel_set_time(time_travel_timer_expiry);
-
- deliver_alarm();
- } else {
- time_travel_set_time(next);
- }
-
- if (time_travel_mode != TT_MODE_INFCPU) {
- if (time_travel_timer_mode == TT_TMR_PERIODIC)
- os_timer_set_interval(time_travel_timer_interval);
- else if (time_travel_timer_mode == TT_TMR_ONESHOT)
- os_timer_one_shot(time_travel_timer_expiry - next);
- }
-}
-
static void um_idle_sleep(void)
{
unsigned long long duration = UM_NSEC_PER_SEC;
diff --git a/arch/um/kernel/skas/syscall.c b/arch/um/kernel/skas/syscall.c
index 40d90dddf3f1..0a12d5a09217 100644
--- a/arch/um/kernel/skas/syscall.c
+++ b/arch/um/kernel/skas/syscall.c
@@ -10,7 +10,7 @@
#include <sysdep/ptrace.h>
#include <sysdep/ptrace_user.h>
#include <sysdep/syscalls.h>
-#include <shared/timer-internal.h>
+#include <linux/time-internal.h>
void handle_syscall(struct uml_pt_regs *r)
{
@@ -24,7 +24,8 @@ void handle_syscall(struct uml_pt_regs *r)
* went to sleep, even if said userspace interacts with the kernel in
* various ways.
*/
- if (time_travel_mode == TT_MODE_INFCPU)
+ if (time_travel_mode == TT_MODE_INFCPU ||
+ time_travel_mode == TT_MODE_EXTERNAL)
schedule();
/* Initialize the syscall number and default return value. */
diff --git a/arch/um/kernel/time.c b/arch/um/kernel/time.c
index 94ea87bd231c..25eaa6a0c658 100644
--- a/arch/um/kernel/time.c
+++ b/arch/um/kernel/time.c
@@ -4,6 +4,7 @@
* Copyright (C) 2015 Thomas Meyer (thomas@m3y3r.de)
* Copyright (C) 2012-2014 Cisco Systems
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
+ * Copyright (C) 2019 Intel Corporation
*/
#include <linux/clockchips.h>
@@ -18,21 +19,484 @@
#include <asm/param.h>
#include <kern_util.h>
#include <os.h>
-#include <timer-internal.h>
+#include <linux/time-internal.h>
+#include <linux/um_timetravel.h>
#include <shared/init.h>
#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT
enum time_travel_mode time_travel_mode;
-unsigned long long time_travel_time;
-enum time_travel_timer_mode time_travel_timer_mode;
-unsigned long long time_travel_timer_expiry;
-unsigned long long time_travel_timer_interval;
+EXPORT_SYMBOL_GPL(time_travel_mode);
static bool time_travel_start_set;
static unsigned long long time_travel_start;
-#else
+static unsigned long long time_travel_time;
+static LIST_HEAD(time_travel_events);
+static unsigned long long time_travel_timer_interval;
+static unsigned long long time_travel_next_event;
+static struct time_travel_event time_travel_timer_event;
+static int time_travel_ext_fd = -1;
+static unsigned int time_travel_ext_waiting;
+static bool time_travel_ext_prev_request_valid;
+static unsigned long long time_travel_ext_prev_request;
+static bool time_travel_ext_free_until_valid;
+static unsigned long long time_travel_ext_free_until;
+
+static void time_travel_set_time(unsigned long long ns)
+{
+ if (unlikely(ns < time_travel_time))
+ panic("time-travel: time goes backwards %lld -> %lld\n",
+ time_travel_time, ns);
+ time_travel_time = ns;
+}
+
+enum time_travel_message_handling {
+ TTMH_IDLE,
+ TTMH_POLL,
+ TTMH_READ,
+};
+
+static void time_travel_handle_message(struct um_timetravel_msg *msg,
+ enum time_travel_message_handling mode)
+{
+ struct um_timetravel_msg resp = {
+ .op = UM_TIMETRAVEL_ACK,
+ };
+ int ret;
+
+ /*
+ * Poll outside the locked section (if we're not called to only read
+ * the response) so we can get interrupts for e.g. virtio while we're
+ * here, but then we need to lock to not get interrupted between the
+ * read of the message and write of the ACK.
+ */
+ if (mode != TTMH_READ) {
+ while (os_poll(1, &time_travel_ext_fd) != 0) {
+ if (mode == TTMH_IDLE) {
+ BUG_ON(!irqs_disabled());
+ local_irq_enable();
+ local_irq_disable();
+ }
+ }
+ }
+
+ ret = os_read_file(time_travel_ext_fd, msg, sizeof(*msg));
+
+ if (ret == 0)
+ panic("time-travel external link is broken\n");
+ if (ret != sizeof(*msg))
+ panic("invalid time-travel message - %d bytes\n", ret);
+
+ switch (msg->op) {
+ default:
+ WARN_ONCE(1, "time-travel: unexpected message %lld\n",
+ (unsigned long long)msg->op);
+ break;
+ case UM_TIMETRAVEL_ACK:
+ return;
+ case UM_TIMETRAVEL_RUN:
+ time_travel_set_time(msg->time);
+ break;
+ case UM_TIMETRAVEL_FREE_UNTIL:
+ time_travel_ext_free_until_valid = true;
+ time_travel_ext_free_until = msg->time;
+ break;
+ }
+
+ os_write_file(time_travel_ext_fd, &resp, sizeof(resp));
+}
+
+static u64 time_travel_ext_req(u32 op, u64 time)
+{
+ static int seq;
+ int mseq = ++seq;
+ struct um_timetravel_msg msg = {
+ .op = op,
+ .time = time,
+ .seq = mseq,
+ };
+ unsigned long flags;
+
+ /*
+ * We need to save interrupts here and only restore when we
+ * got the ACK - otherwise we can get interrupted and send
+ * another request while we're still waiting for an ACK, but
+ * the peer doesn't know we got interrupted and will send
+ * the ACKs in the same order as the message, but we'd need
+ * to see them in the opposite order ...
+ *
+ * This wouldn't matter *too* much, but some ACKs carry the
+ * current time (for UM_TIMETRAVEL_GET) and getting another
+ * ACK without a time would confuse us a lot!
+ *
+ * The sequence number assignment that happens here lets us
+ * debug such message handling issues more easily.
+ */
+ local_irq_save(flags);
+ os_write_file(time_travel_ext_fd, &msg, sizeof(msg));
+
+ while (msg.op != UM_TIMETRAVEL_ACK)
+ time_travel_handle_message(&msg, TTMH_READ);
+
+ if (msg.seq != mseq)
+ panic("time-travel: ACK message has different seqno! op=%d, seq=%d != %d time=%lld\n",
+ msg.op, msg.seq, mseq, msg.time);
+
+ if (op == UM_TIMETRAVEL_GET)
+ time_travel_set_time(msg.time);
+ local_irq_restore(flags);
+
+ return msg.time;
+}
+
+void __time_travel_wait_readable(int fd)
+{
+ int fds[2] = { fd, time_travel_ext_fd };
+ int ret;
+
+ if (time_travel_mode != TT_MODE_EXTERNAL)
+ return;
+
+ while ((ret = os_poll(2, fds))) {
+ struct um_timetravel_msg msg;
+
+ if (ret == 1)
+ time_travel_handle_message(&msg, TTMH_READ);
+ }
+}
+EXPORT_SYMBOL_GPL(__time_travel_wait_readable);
+
+static void time_travel_ext_update_request(unsigned long long time)
+{
+ if (time_travel_mode != TT_MODE_EXTERNAL)
+ return;
+
+ /* asked for exactly this time previously */
+ if (time_travel_ext_prev_request_valid &&
+ time == time_travel_ext_prev_request)
+ return;
+
+ time_travel_ext_prev_request = time;
+ time_travel_ext_prev_request_valid = true;
+ time_travel_ext_req(UM_TIMETRAVEL_REQUEST, time);
+}
+
+void __time_travel_propagate_time(void)
+{
+ time_travel_ext_req(UM_TIMETRAVEL_UPDATE, time_travel_time);
+}
+EXPORT_SYMBOL_GPL(__time_travel_propagate_time);
+
+/* returns true if we must do a wait to the simtime device */
+static bool time_travel_ext_request(unsigned long long time)
+{
+ /*
+ * If we received an external sync point ("free until") then we
+ * don't have to request/wait for anything until then, unless
+ * we're already waiting.
+ */
+ if (!time_travel_ext_waiting && time_travel_ext_free_until_valid &&
+ time < time_travel_ext_free_until)
+ return false;
+
+ time_travel_ext_update_request(time);
+ return true;
+}
+
+static void time_travel_ext_wait(bool idle)
+{
+ struct um_timetravel_msg msg = {
+ .op = UM_TIMETRAVEL_ACK,
+ };
+
+ time_travel_ext_prev_request_valid = false;
+ time_travel_ext_waiting++;
+
+ time_travel_ext_req(UM_TIMETRAVEL_WAIT, -1);
+
+ /*
+ * Here we are deep in the idle loop, so we have to break out of the
+ * kernel abstraction in a sense and implement this in terms of the
+ * UML system waiting on the VQ interrupt while sleeping, when we get
+ * the signal it'll call time_travel_ext_vq_notify_done() completing the
+ * call.
+ */
+ while (msg.op != UM_TIMETRAVEL_RUN)
+ time_travel_handle_message(&msg, idle ? TTMH_IDLE : TTMH_POLL);
+
+ time_travel_ext_waiting--;
+
+ /* we might request more stuff while polling - reset when we run */
+ time_travel_ext_prev_request_valid = false;
+}
+
+static void time_travel_ext_get_time(void)
+{
+ time_travel_ext_req(UM_TIMETRAVEL_GET, -1);
+}
+
+static void __time_travel_update_time(unsigned long long ns, bool idle)
+{
+ if (time_travel_mode == TT_MODE_EXTERNAL && time_travel_ext_request(ns))
+ time_travel_ext_wait(idle);
+ else
+ time_travel_set_time(ns);
+}
+
+static struct time_travel_event *time_travel_first_event(void)
+{
+ return list_first_entry_or_null(&time_travel_events,
+ struct time_travel_event,
+ list);
+}
+
+static void __time_travel_add_event(struct time_travel_event *e,
+ unsigned long long time)
+{
+ struct time_travel_event *tmp;
+ bool inserted = false;
+
+ if (WARN(time_travel_mode == TT_MODE_BASIC &&
+ e != &time_travel_timer_event,
+ "only timer events can be handled in basic mode"))
+ return;
+
+ if (e->pending)
+ return;
+
+ e->pending = true;
+ e->time = time;
+
+ list_for_each_entry(tmp, &time_travel_events, list) {
+ /*
+ * Add the new entry before one with higher time,
+ * or if they're equal and both on stack, because
+ * in that case we need to unwind the stack in the
+ * right order, and the later event (timer sleep
+ * or such) must be dequeued first.
+ */
+ if ((tmp->time > e->time) ||
+ (tmp->time == e->time && tmp->onstack && e->onstack)) {
+ list_add_tail(&e->list, &tmp->list);
+ inserted = true;
+ break;
+ }
+ }
+
+ if (!inserted)
+ list_add_tail(&e->list, &time_travel_events);
+
+ tmp = time_travel_first_event();
+ time_travel_ext_update_request(tmp->time);
+ time_travel_next_event = tmp->time;
+}
+
+static void time_travel_add_event(struct time_travel_event *e,
+ unsigned long long time)
+{
+ if (WARN_ON(!e->fn))
+ return;
+
+ __time_travel_add_event(e, time);
+}
+
+void time_travel_periodic_timer(struct time_travel_event *e)
+{
+ time_travel_add_event(&time_travel_timer_event,
+ time_travel_time + time_travel_timer_interval);
+ deliver_alarm();
+}
+
+static void time_travel_deliver_event(struct time_travel_event *e)
+{
+ if (e == &time_travel_timer_event) {
+ /*
+ * deliver_alarm() does the irq_enter/irq_exit
+ * by itself, so must handle it specially here
+ */
+ e->fn(e);
+ } else {
+ unsigned long flags;
+
+ local_irq_save(flags);
+ irq_enter();
+ e->fn(e);
+ irq_exit();
+ local_irq_restore(flags);
+ }
+}
+
+static bool time_travel_del_event(struct time_travel_event *e)
+{
+ if (!e->pending)
+ return false;
+ list_del(&e->list);
+ e->pending = false;
+ return true;
+}
+
+static void time_travel_update_time(unsigned long long next, bool idle)
+{
+ struct time_travel_event ne = {
+ .onstack = true,
+ };
+ struct time_travel_event *e;
+ bool finished = idle;
+
+ /* add it without a handler - we deal with that specifically below */
+ __time_travel_add_event(&ne, next);
+
+ do {
+ e = time_travel_first_event();
+
+ BUG_ON(!e);
+ __time_travel_update_time(e->time, idle);
+
+ /* new events may have been inserted while we were waiting */
+ if (e == time_travel_first_event()) {
+ BUG_ON(!time_travel_del_event(e));
+ BUG_ON(time_travel_time != e->time);
+
+ if (e == &ne) {
+ finished = true;
+ } else {
+ if (e->onstack)
+ panic("On-stack event dequeued outside of the stack! time=%lld, event time=%lld, event=%pS\n",
+ time_travel_time, e->time, e);
+ time_travel_deliver_event(e);
+ }
+ }
+
+ e = time_travel_first_event();
+ if (e)
+ time_travel_ext_update_request(e->time);
+ } while (ne.pending && !finished);
+
+ time_travel_del_event(&ne);
+}
+
+void time_travel_ndelay(unsigned long nsec)
+{
+ time_travel_update_time(time_travel_time + nsec, false);
+}
+EXPORT_SYMBOL(time_travel_ndelay);
+
+void time_travel_add_irq_event(struct time_travel_event *e)
+{
+ BUG_ON(time_travel_mode != TT_MODE_EXTERNAL);
+
+ time_travel_ext_get_time();
+ /*
+ * We could model interrupt latency here, for now just
+ * don't have any latency at all and request the exact
+ * same time (again) to run the interrupt...
+ */
+ time_travel_add_event(e, time_travel_time);
+}
+EXPORT_SYMBOL_GPL(time_travel_add_irq_event);
+
+static void time_travel_oneshot_timer(struct time_travel_event *e)
+{
+ deliver_alarm();
+}
+
+void time_travel_sleep(unsigned long long duration)
+{
+ unsigned long long next = time_travel_time + duration;
+
+ if (time_travel_mode == TT_MODE_BASIC)
+ os_timer_disable();
+
+ time_travel_update_time(next, true);
+
+ if (time_travel_mode == TT_MODE_BASIC &&
+ time_travel_timer_event.pending) {
+ if (time_travel_timer_event.fn == time_travel_periodic_timer) {
+ /*
+ * This is somewhat wrong - we should get the first
+ * one sooner like the os_timer_one_shot() below...
+ */
+ os_timer_set_interval(time_travel_timer_interval);
+ } else {
+ os_timer_one_shot(time_travel_timer_event.time - next);
+ }
+ }
+}
+
+static void time_travel_handle_real_alarm(void)
+{
+ time_travel_set_time(time_travel_next_event);
+
+ time_travel_del_event(&time_travel_timer_event);
+
+ if (time_travel_timer_event.fn == time_travel_periodic_timer)
+ time_travel_add_event(&time_travel_timer_event,
+ time_travel_time +
+ time_travel_timer_interval);
+}
+
+static void time_travel_set_interval(unsigned long long interval)
+{
+ time_travel_timer_interval = interval;
+}
+
+static int time_travel_connect_external(const char *socket)
+{
+ const char *sep;
+ unsigned long long id = (unsigned long long)-1;
+ int rc;
+
+ if ((sep = strchr(socket, ':'))) {
+ char buf[25] = {};
+ if (sep - socket > sizeof(buf) - 1)
+ goto invalid_number;
+
+ memcpy(buf, socket, sep - socket);
+ if (kstrtoull(buf, 0, &id)) {
+invalid_number:
+ panic("time-travel: invalid external ID in string '%s'\n",
+ socket);
+ return -EINVAL;
+ }
+
+ socket = sep + 1;
+ }
+
+ rc = os_connect_socket(socket);
+ if (rc < 0) {
+ panic("time-travel: failed to connect to external socket %s\n",
+ socket);
+ return rc;
+ }
+
+ time_travel_ext_fd = rc;
+
+ time_travel_ext_req(UM_TIMETRAVEL_START, id);
+
+ return 1;
+}
+#else /* CONFIG_UML_TIME_TRAVEL_SUPPORT */
#define time_travel_start_set 0
#define time_travel_start 0
+#define time_travel_time 0
+
+static inline void time_travel_update_time(unsigned long long ns, bool retearly)
+{
+}
+
+static inline void time_travel_handle_real_alarm(void)
+{
+}
+
+static void time_travel_set_interval(unsigned long long interval)
+{
+}
+
+/* fail link if this actually gets used */
+extern u64 time_travel_ext_req(u32 op, u64 time);
+
+/* these are empty macros so the struct/fn need not exist */
+#define time_travel_add_event(e, time) do { } while (0)
+#define time_travel_del_event(e) do { } while (0)
#endif
void timer_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
@@ -48,7 +512,7 @@ void timer_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
* never get any real signals from the OS.
*/
if (time_travel_mode == TT_MODE_BASIC)
- time_travel_set_time(time_travel_timer_expiry);
+ time_travel_handle_real_alarm();
local_irq_save(flags);
do_IRQ(TIMER_IRQ, regs);
@@ -58,9 +522,10 @@ void timer_handler(int sig, struct siginfo *unused_si, struct uml_pt_regs *regs)
static int itimer_shutdown(struct clock_event_device *evt)
{
if (time_travel_mode != TT_MODE_OFF)
- time_travel_set_timer_mode(TT_TMR_DISABLED);
+ time_travel_del_event(&time_travel_timer_event);
- if (time_travel_mode != TT_MODE_INFCPU)
+ if (time_travel_mode != TT_MODE_INFCPU &&
+ time_travel_mode != TT_MODE_EXTERNAL)
os_timer_disable();
return 0;
@@ -71,12 +536,16 @@ static int itimer_set_periodic(struct clock_event_device *evt)
unsigned long long interval = NSEC_PER_SEC / HZ;
if (time_travel_mode != TT_MODE_OFF) {
- time_travel_set_timer_mode(TT_TMR_PERIODIC);
- time_travel_set_timer_expiry(time_travel_time + interval);
- time_travel_set_timer_interval(interval);
+ time_travel_del_event(&time_travel_timer_event);
+ time_travel_set_event_fn(&time_travel_timer_event,
+ time_travel_periodic_timer);
+ time_travel_set_interval(interval);
+ time_travel_add_event(&time_travel_timer_event,
+ time_travel_time + interval);
}
- if (time_travel_mode != TT_MODE_INFCPU)
+ if (time_travel_mode != TT_MODE_INFCPU &&
+ time_travel_mode != TT_MODE_EXTERNAL)
os_timer_set_interval(interval);
return 0;
@@ -88,11 +557,15 @@ static int itimer_next_event(unsigned long delta,
delta += 1;
if (time_travel_mode != TT_MODE_OFF) {
- time_travel_set_timer_mode(TT_TMR_ONESHOT);
- time_travel_set_timer_expiry(time_travel_time + delta);
+ time_travel_del_event(&time_travel_timer_event);
+ time_travel_set_event_fn(&time_travel_timer_event,
+ time_travel_oneshot_timer);
+ time_travel_add_event(&time_travel_timer_event,
+ time_travel_time + delta);
}
- if (time_travel_mode != TT_MODE_INFCPU)
+ if (time_travel_mode != TT_MODE_INFCPU &&
+ time_travel_mode != TT_MODE_EXTERNAL)
return os_timer_one_shot(delta);
return 0;
@@ -143,8 +616,17 @@ static u64 timer_read(struct clocksource *cs)
* stuck in loops that expect time to move more than the
* exact requested sleep amount, e.g. python's socket server,
* see https://bugs.python.org/issue37026.
+ *
+ * However, don't do that when we're in interrupt or such as
+ * then we might recurse into our own processing, and get to
+ * even more waiting, and that's not good - it messes up the
+ * "what do I do next" and onstack event we use to know when
+ * to return from time_travel_update_time().
*/
- time_travel_set_time(time_travel_time + TIMER_MULTIPLIER);
+ if (!irqs_disabled() && !in_interrupt() && !in_softirq())
+ time_travel_update_time(time_travel_time +
+ TIMER_MULTIPLIER,
+ false);
return time_travel_time / TIMER_MULTIPLIER;
}
@@ -188,6 +670,8 @@ void read_persistent_clock64(struct timespec64 *ts)
if (time_travel_start_set)
nsecs = time_travel_start + time_travel_time;
+ else if (time_travel_mode == TT_MODE_EXTERNAL)
+ nsecs = time_travel_ext_req(UM_TIMETRAVEL_GET_TOD, -1);
else
nsecs = os_persistent_clock_emulation();
@@ -204,7 +688,8 @@ void __init time_init(void)
#ifdef CONFIG_UML_TIME_TRAVEL_SUPPORT
unsigned long calibrate_delay_is_known(void)
{
- if (time_travel_mode == TT_MODE_INFCPU)
+ if (time_travel_mode == TT_MODE_INFCPU ||
+ time_travel_mode == TT_MODE_EXTERNAL)
return 1;
return 0;
}
@@ -218,6 +703,13 @@ int setup_time_travel(char *str)
return 1;
}
+ if (strncmp(str, "=ext:", 5) == 0) {
+ time_travel_mode = TT_MODE_EXTERNAL;
+ timer_clockevent.name = "time-travel-timer-external";
+ timer_clocksource.name = "time-travel-clock-external";
+ return time_travel_connect_external(str + 5);
+ }
+
if (!*str) {
time_travel_mode = TT_MODE_BASIC;
timer_clockevent.name = "time-travel-timer";
@@ -242,7 +734,15 @@ __uml_help(setup_time_travel,
"are no wall clock timers, and any CPU processing happens - as seen from the\n"
"guest - instantly. This can be useful for accurate simulation regardless of\n"
"debug overhead, physical CPU speed, etc. but is somewhat dangerous as it can\n"
-"easily lead to getting stuck (e.g. if anything in the system busy loops).\n");
+"easily lead to getting stuck (e.g. if anything in the system busy loops).\n"
+"\n"
+"time-travel=ext:[ID:]/path/to/socket\n"
+"This enables time travel mode similar to =inf-cpu, except the system will\n"
+"use the given socket to coordinate with a central scheduler, in order to\n"
+"have more than one system simultaneously be on simulated time. The virtio\n"
+"driver code in UML knows about this so you can also simulate networks and\n"
+"devices using it, assuming the device has the right capabilities.\n"
+"The optional ID is a 64-bit integer that's sent to the central scheduler.\n");
int setup_time_travel_start(char *str)
{
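
[Editor's note: a minimal sketch of how the event API introduced above is meant to be consumed. In external mode a driver that receives a simulated interrupt syncs to the scheduler's time and queues an event at that time, which time_travel_update_time() then delivers. This assumes time_travel_set_event_fn() and struct time_travel_event from <linux/time-internal.h> behave as their uses in the hunks above suggest; struct my_dev and my_event_handler() are hypothetical.]

#include <linux/completion.h>
#include <linux/kernel.h>
#include <linux/time-internal.h>

struct my_dev {
	struct time_travel_event tt_event;
	struct completion done;
};

static void my_event_handler(struct time_travel_event *e)
{
	/* Runs at the event's simulated time; for non-timer events
	 * time_travel_deliver_event() wraps this in irq_enter()/irq_exit(). */
	struct my_dev *dev = container_of(e, struct my_dev, tt_event);

	complete(&dev->done);
}

static void my_dev_init(struct my_dev *dev)
{
	time_travel_set_event_fn(&dev->tt_event, my_event_handler);
}

static void my_dev_got_irq_message(struct my_dev *dev)
{
	/* External mode only (BUG_ON otherwise): fetches the scheduler's
	 * current time and queues the event at exactly that time, i.e.
	 * with no modelled interrupt latency. */
	time_travel_add_irq_event(&dev->tt_event);
}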
diff --git a/arch/um/kernel/uml.lds.S b/arch/um/kernel/uml.lds.S
index 9f21443be2c9..3b6dab3d4501 100644
--- a/arch/um/kernel/uml.lds.S
+++ b/arch/um/kernel/uml.lds.S
@@ -19,10 +19,10 @@ SECTIONS
__binary_start = START;
. = START + SIZEOF_HEADERS;
+ . = ALIGN(PAGE_SIZE);
_text = .;
INIT_TEXT_SECTION(0)
- . = ALIGN(PAGE_SIZE);
.text :
{
diff --git a/arch/um/os-Linux/file.c b/arch/um/os-Linux/file.c
index fbda10535dab..26ecbd64c409 100644
--- a/arch/um/os-Linux/file.c
+++ b/arch/um/os-Linux/file.c
@@ -5,9 +5,11 @@
#include <stdio.h>
#include <unistd.h>
+#include <stdlib.h>
#include <errno.h>
#include <fcntl.h>
#include <signal.h>
+#include <linux/falloc.h>
#include <sys/ioctl.h>
#include <sys/mount.h>
#include <sys/socket.h>
@@ -16,6 +18,7 @@
#include <sys/un.h>
#include <sys/types.h>
#include <sys/eventfd.h>
+#include <poll.h>
#include <os.h>
static void copy_stat(struct uml_stat *dst, const struct stat64 *src)
@@ -664,3 +667,31 @@ int os_sendmsg_fds(int fd, const void *buf, unsigned int len, const int *fds,
return -errno;
return err;
}
+
+int os_poll(unsigned int n, const int *fds)
+{
+ /* currently need 2 FDs at most so avoid dynamic allocation */
+ struct pollfd pollfds[2] = {};
+ unsigned int i;
+ int ret;
+
+ if (n > ARRAY_SIZE(pollfds))
+ return -EINVAL;
+
+ for (i = 0; i < n; i++) {
+ pollfds[i].fd = fds[i];
+ pollfds[i].events = POLLIN;
+ }
+
+ ret = poll(pollfds, n, -1);
+ if (ret < 0)
+ return -errno;
+
+ /* Return the index of the available FD */
+ for (i = 0; i < n; i++) {
+ if (pollfds[i].revents)
+ return i;
+ }
+
+ return -EIO;
+}
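
[Editor's note: os_poll() reports readiness as the index of the first ready descriptor rather than as a bitmask, and it blocks until a descriptor is ready or the poll fails. A minimal caller sketch, mirroring __time_travel_wait_readable() above; dev_fd and both handle_*() helpers are hypothetical.]

int fds[2] = { dev_fd, time_travel_ext_fd };
int idx = os_poll(2, fds);		/* blocks; returns 0, 1 or -errno */

if (idx == 0)
	handle_device_data(dev_fd);	/* hypothetical helper */
else if (idx == 1)
	handle_scheduler_message();	/* hypothetical helper */
/* else: negative errno from poll() */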
diff --git a/arch/um/os-Linux/time.c b/arch/um/os-Linux/time.c
index 432f8e1f55c2..90f6de224c70 100644
--- a/arch/um/os-Linux/time.c
+++ b/arch/um/os-Linux/time.c
@@ -14,7 +14,6 @@
#include <kern_util.h>
#include <os.h>
#include <string.h>
-#include <timer-internal.h>
static timer_t event_high_res_timer = 0;
diff --git a/arch/um/os-Linux/umid.c b/arch/um/os-Linux/umid.c
index 44def53a11cd..9e16078a4bf8 100644
--- a/arch/um/os-Linux/umid.c
+++ b/arch/um/os-Linux/umid.c
@@ -220,11 +220,12 @@ static void __init create_pid_file(void)
char pid[sizeof("nnnnn\0")], *file;
int fd, n;
- file = malloc(strlen(uml_dir) + UMID_LEN + sizeof("/pid\0"));
+ n = strlen(uml_dir) + UMID_LEN + sizeof("/pid\0");
+ file = malloc(n);
if (!file)
return;
- if (umid_file_name("pid", file, sizeof(file)))
+ if (umid_file_name("pid", file, n))
goto out;
fd = open(file, O_RDWR | O_CREAT | O_EXCL, 0644);
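
[Editor's note: the bug fixed here is the classic sizeof-on-a-pointer mistake: file is a char *, so sizeof(file) is 4 or 8, and umid_file_name() was told the buffer was only pointer-sized. Keeping the computed length in n and passing that restores the real bound. Illustrated with a hypothetical buffer:]

char *path = malloc(64);
size_t bad = sizeof(path);	/* size of the pointer (4 or 8), NOT 64 */
size_t ok  = 64;		/* the allocation length must be carried separately */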
diff --git a/arch/x86/um/asm/processor.h b/arch/x86/um/asm/processor.h
index 593d5f3902bd..478710384b34 100644
--- a/arch/x86/um/asm/processor.h
+++ b/arch/x86/um/asm/processor.h
@@ -1,6 +1,7 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __UM_PROCESSOR_H
#define __UM_PROCESSOR_H
+#include <linux/time-internal.h>
/* include faultinfo structure */
#include <sysdep/faultinfo.h>
@@ -21,12 +22,19 @@
#include <asm/user.h>
/* REP NOP (PAUSE) is a good thing to insert into busy-wait loops. */
-static inline void rep_nop(void)
+static __always_inline void rep_nop(void)
{
__asm__ __volatile__("rep;nop": : :"memory");
}
-#define cpu_relax() rep_nop()
+static __always_inline void cpu_relax(void)
+{
+ if (time_travel_mode == TT_MODE_INFCPU ||
+ time_travel_mode == TT_MODE_EXTERNAL)
+ time_travel_ndelay(1);
+ else
+ rep_nop();
+}
#define task_pt_regs(t) (&(t)->thread.regs)
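
[Editor's note: the cpu_relax() change matters because in inf-cpu and external modes nothing advances time while the virtual CPU spins. Burning 1 ns of simulated time per iteration lets pending timer and IRQ events become due, so busy-wait loops make progress. A sketch of the kind of loop this unblocks; device_ready is a hypothetical flag set from an interrupt handler.]

/* Under TT_MODE_INFCPU/TT_MODE_EXTERNAL each cpu_relax() now calls
 * time_travel_ndelay(1), which advances simulated time and delivers
 * any event that became due - e.g. the IRQ that sets the flag. */
while (!READ_ONCE(device_ready))
	cpu_relax();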
diff --git a/drivers/crypto/chelsio/chcr_ktls.c b/drivers/crypto/chelsio/chcr_ktls.c
index 73658b71d4a3..cd1769ecdc1c 100644
--- a/drivers/crypto/chelsio/chcr_ktls.c
+++ b/drivers/crypto/chelsio/chcr_ktls.c
@@ -2,6 +2,7 @@
/* Copyright (C) 2020 Chelsio Communications. All rights reserved. */
#ifdef CONFIG_CHELSIO_TLS_DEVICE
+#include <linux/highmem.h>
#include "chcr_ktls.h"
#include "clip_tbl.h"
diff --git a/drivers/ide/ide-scan-pci.c b/drivers/ide/ide-scan-pci.c
index acf874800ca4..b0411a1827a3 100644
--- a/drivers/ide/ide-scan-pci.c
+++ b/drivers/ide/ide-scan-pci.c
@@ -89,8 +89,7 @@ static int __init ide_scan_pcidev(struct pci_dev *dev)
static int __init ide_scan_pcibus(void)
{
struct pci_dev *dev = NULL;
- struct pci_driver *d;
- struct list_head *l, *n;
+ struct pci_driver *d, *tmp;
pre_init = 0;
for_each_pci_dev(dev)
@@ -101,9 +100,8 @@ static int __init ide_scan_pcibus(void)
* are post init.
*/
- list_for_each_safe(l, n, &ide_pci_drivers) {
- list_del(l);
- d = list_entry(l, struct pci_driver, node);
+ list_for_each_entry_safe(d, tmp, &ide_pci_drivers, node) {
+ list_del(&d->node);
if (__pci_register_driver(d, d->driver.owner,
d->driver.mod_name))
printk(KERN_ERR "%s: failed to register %s driver\n",
diff --git a/drivers/mtd/ubi/fastmap-wl.c b/drivers/mtd/ubi/fastmap-wl.c
index 426820ab9afe..b486250923c5 100644
--- a/drivers/mtd/ubi/fastmap-wl.c
+++ b/drivers/mtd/ubi/fastmap-wl.c
@@ -39,6 +39,13 @@ static struct ubi_wl_entry *find_anchor_wl_entry(struct rb_root *root)
return victim;
}
+static inline void return_unused_peb(struct ubi_device *ubi,
+ struct ubi_wl_entry *e)
+{
+ wl_tree_add(e, &ubi->free);
+ ubi->free_count++;
+}
+
/**
* return_unused_pool_pebs - returns unused PEB to the free tree.
* @ubi: UBI device description object
@@ -52,8 +59,7 @@ static void return_unused_pool_pebs(struct ubi_device *ubi,
for (i = pool->used; i < pool->size; i++) {
e = ubi->lookuptbl[pool->pebs[i]];
- wl_tree_add(e, &ubi->free);
- ubi->free_count++;
+ return_unused_peb(ubi, e);
}
}
@@ -361,6 +367,11 @@ static void ubi_fastmap_close(struct ubi_device *ubi)
return_unused_pool_pebs(ubi, &ubi->fm_pool);
return_unused_pool_pebs(ubi, &ubi->fm_wl_pool);
+ if (ubi->fm_anchor) {
+ return_unused_peb(ubi, ubi->fm_anchor);
+ ubi->fm_anchor = NULL;
+ }
+
if (ubi->fm) {
for (i = 0; i < ubi->fm->used_blocks; i++)
kfree(ubi->fm->e[i]);
diff --git a/drivers/mtd/ubi/ubi-media.h b/drivers/mtd/ubi/ubi-media.h
index b5fe8f82281b..386db0598e95 100644
--- a/drivers/mtd/ubi/ubi-media.h
+++ b/drivers/mtd/ubi/ubi-media.h
@@ -498,6 +498,6 @@ struct ubi_fm_volhdr {
struct ubi_fm_eba {
__be32 magic;
__be32 reserved_pebs;
- __be32 pnum[0];
+ __be32 pnum[];
} __packed;
#endif /* !__UBI_MEDIA_H__ */
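
[Editor's note: the [0] to [] conversions here (and in the pcmcia headers further down) switch to C99 flexible array members, which pair with the struct_size() helper when allocating the structure plus its trailing array. A minimal sketch; n is a placeholder for the number of pnum entries.]

struct ubi_fm_eba *eba;

/* struct_size() = sizeof(*eba) + n * sizeof(eba->pnum[0]), with overflow
 * checking, instead of open-coded arithmetic. */
eba = kmalloc(struct_size(eba, pnum, n), GFP_KERNEL);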
diff --git a/drivers/mtd/ubi/wl.c b/drivers/mtd/ubi/wl.c
index 837d690a8c60..5146cce5fe32 100644
--- a/drivers/mtd/ubi/wl.c
+++ b/drivers/mtd/ubi/wl.c
@@ -1875,7 +1875,8 @@ int ubi_wl_init(struct ubi_device *ubi, struct ubi_attach_info *ai)
goto out_free;
#ifdef CONFIG_MTD_UBI_FASTMAP
- ubi_ensure_anchor_pebs(ubi);
+ if (!ubi->ro_mode && !ubi->fm_disabled)
+ ubi_ensure_anchor_pebs(ubi);
#endif
return 0;
diff --git a/drivers/net/can/slcan.c b/drivers/net/can/slcan.c
index 086dfb1b9d0b..91cdc0a2b1a7 100644
--- a/drivers/net/can/slcan.c
+++ b/drivers/net/can/slcan.c
@@ -148,7 +148,7 @@ static void slc_bump(struct slcan *sl)
u32 tmpid;
char *cmd = sl->rbuff;
- cf.can_id = 0;
+ memset(&cf, 0, sizeof(cf));
switch (*cmd) {
case 'r':
@@ -187,8 +187,6 @@ static void slc_bump(struct slcan *sl)
else
return;
- *(u64 *) (&cf.data) = 0; /* clear payload */
-
/* RTR frames may have a dlc > 0 but they never have any data bytes */
if (!(cf.can_id & CAN_RTR_FLAG)) {
for (i = 0; i < cf.can_dlc; i++) {
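
[Editor's note: clearing only can_id and the 8 data bytes left the padding bytes of the on-stack struct can_frame uninitialized, so stack contents could travel along with the frame. The memset() zeroes the whole structure up front; the general pattern, with parsed_id/parsed_dlc as placeholders.]

struct can_frame cf;

memset(&cf, 0, sizeof(cf));	/* no uninitialized bytes can escape */
cf.can_id  = parsed_id;
cf.can_dlc = parsed_dlc;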
diff --git a/drivers/net/dsa/bcm_sf2.c b/drivers/net/dsa/bcm_sf2.c
index affa5c6e135c..c7ac63f41918 100644
--- a/drivers/net/dsa/bcm_sf2.c
+++ b/drivers/net/dsa/bcm_sf2.c
@@ -480,7 +480,7 @@ static int bcm_sf2_mdio_register(struct dsa_switch *ds)
priv->slave_mii_bus->parent = ds->dev->parent;
priv->slave_mii_bus->phy_mask = ~priv->indir_phy_mask;
- err = of_mdiobus_register(priv->slave_mii_bus, dn);
+ err = mdiobus_register(priv->slave_mii_bus);
if (err && dn)
of_node_put(dn);
@@ -1079,6 +1079,7 @@ static int bcm_sf2_sw_probe(struct platform_device *pdev)
const struct bcm_sf2_of_data *data;
struct b53_platform_data *pdata;
struct dsa_switch_ops *ops;
+ struct device_node *ports;
struct bcm_sf2_priv *priv;
struct b53_device *dev;
struct dsa_switch *ds;
@@ -1146,7 +1147,11 @@ static int bcm_sf2_sw_probe(struct platform_device *pdev)
set_bit(0, priv->cfp.used);
set_bit(0, priv->cfp.unique);
- bcm_sf2_identify_ports(priv, dn->child);
+ ports = of_find_node_by_name(dn, "ports");
+ if (ports) {
+ bcm_sf2_identify_ports(priv, ports);
+ of_node_put(ports);
+ }
priv->irq0 = irq_of_parse_and_map(dn, 0);
priv->irq1 = irq_of_parse_and_map(dn, 1);
diff --git a/drivers/net/dsa/mt7530.c b/drivers/net/dsa/mt7530.c
index ef57552db260..2d0d91db0ddb 100644
--- a/drivers/net/dsa/mt7530.c
+++ b/drivers/net/dsa/mt7530.c
@@ -1403,6 +1403,9 @@ mt7530_setup(struct dsa_switch *ds)
continue;
phy_node = of_parse_phandle(mac_np, "phy-handle", 0);
+ if (!phy_node)
+ continue;
+
if (phy_node->parent == priv->dev->of_node->parent) {
ret = of_get_phy_mode(mac_np, &interface);
if (ret && ret != -ENODEV)
diff --git a/drivers/net/ethernet/aquantia/atlantic/macsec/macsec_api.c b/drivers/net/ethernet/aquantia/atlantic/macsec/macsec_api.c
index 97901c114bfa..fbe9d88b13c7 100644
--- a/drivers/net/ethernet/aquantia/atlantic/macsec/macsec_api.c
+++ b/drivers/net/ethernet/aquantia/atlantic/macsec/macsec_api.c
@@ -491,7 +491,7 @@ get_ingress_preclass_record(struct aq_hw_s *hw,
rec->snap[1] = packed_record[8] & 0xFF;
rec->llc = (packed_record[8] >> 8) & 0xFF;
- rec->llc = packed_record[9] << 8;
+ rec->llc |= packed_record[9] << 8;
rec->mac_sa[0] = packed_record[10];
rec->mac_sa[0] |= packed_record[11] << 16;
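
[Editor's note: the one-character fix above matters because the 16-bit llc field is reassembled from two packed words: the first statement stores the low byte, and the second must OR in the high byte rather than overwrite the field. In isolation, with lo/hi standing for the two packed words:]

u16 llc;

llc  = (lo >> 8) & 0xFF;	/* low byte of the llc field */
llc |= hi << 8;			/* high byte; '=' here would discard the low byte */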
diff --git a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c
index 9638d65d8261..517caedc0a87 100644
--- a/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c
+++ b/drivers/net/ethernet/broadcom/bnx2x/bnx2x_link.c
@@ -6874,7 +6874,8 @@ int bnx2x_link_update(struct link_params *params, struct link_vars *vars)
case PORT_HW_CFG_PHY_SELECTION_FIRST_PHY_PRIORITY:
/* In this option, the first PHY makes sure to pass the
* traffic through itself only.
- * Its not clear how to reset the link on the second phy
+ * It's not clear how to reset the link on the second
+ * phy.
*/
active_external_phy = EXT_PHY1;
break;
diff --git a/drivers/net/ethernet/cavium/common/cavium_ptp.h b/drivers/net/ethernet/cavium/common/cavium_ptp.h
index a04eccbc78e8..1e0ffe8f4152 100644
--- a/drivers/net/ethernet/cavium/common/cavium_ptp.h
+++ b/drivers/net/ethernet/cavium/common/cavium_ptp.h
@@ -24,7 +24,7 @@ struct cavium_ptp {
struct ptp_clock *ptp_clock;
};
-#if IS_ENABLED(CONFIG_CAVIUM_PTP)
+#if IS_REACHABLE(CONFIG_CAVIUM_PTP)
struct cavium_ptp *cavium_ptp_get(void);
void cavium_ptp_put(struct cavium_ptp *ptp);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 75fde0d4d493..a70018f067aa 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -3132,7 +3132,6 @@ static int cxgb_set_mac_addr(struct net_device *dev, void *p)
return ret;
memcpy(dev->dev_addr, addr->sa_data, dev->addr_len);
- pi->xact_addr_filt = ret;
return 0;
}
@@ -6672,6 +6671,10 @@ static void shutdown_one(struct pci_dev *pdev)
if (adapter->port[i]->reg_state == NETREG_REGISTERED)
cxgb_close(adapter->port[i]);
+ rtnl_lock();
+ cxgb4_mqprio_stop_offload(adapter);
+ rtnl_unlock();
+
if (is_uld(adapter)) {
detach_ulds(adapter);
t4_uld_clean_up(adapter);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.c b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.c
index ec3eb45ee3b4..e6af4906d674 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.c
@@ -301,6 +301,7 @@ static void cxgb4_mqprio_free_hw_resources(struct net_device *dev)
cxgb4_clear_msix_aff(eorxq->msix->vec,
eorxq->msix->aff_mask);
free_irq(eorxq->msix->vec, &eorxq->rspq);
+ cxgb4_free_msix_idx_in_bmap(adap, eorxq->msix->idx);
}
free_rspq_fl(adap, &eorxq->rspq, &eorxq->fl);
@@ -611,6 +612,28 @@ out:
return ret;
}
+void cxgb4_mqprio_stop_offload(struct adapter *adap)
+{
+ struct cxgb4_tc_port_mqprio *tc_port_mqprio;
+ struct net_device *dev;
+ u8 i;
+
+ if (!adap->tc_mqprio || !adap->tc_mqprio->port_mqprio)
+ return;
+
+ for_each_port(adap, i) {
+ dev = adap->port[i];
+ if (!dev)
+ continue;
+
+ tc_port_mqprio = &adap->tc_mqprio->port_mqprio[i];
+ if (!tc_port_mqprio->mqprio.qopt.num_tc)
+ continue;
+
+ cxgb4_mqprio_disable_offload(dev);
+ }
+}
+
int cxgb4_init_tc_mqprio(struct adapter *adap)
{
struct cxgb4_tc_port_mqprio *tc_port_mqprio, *port_mqprio;
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.h b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.h
index c532f1ef8451..ff8794132b22 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_mqprio.h
@@ -38,6 +38,7 @@ struct cxgb4_tc_mqprio {
int cxgb4_setup_tc_mqprio(struct net_device *dev,
struct tc_mqprio_qopt_offload *mqprio);
+void cxgb4_mqprio_stop_offload(struct adapter *adap);
int cxgb4_init_tc_mqprio(struct adapter *adap);
void cxgb4_cleanup_tc_mqprio(struct adapter *adap);
#endif /* __CXGB4_TC_MQPRIO_H__ */
diff --git a/drivers/net/ethernet/faraday/ftgmac100.c b/drivers/net/ethernet/faraday/ftgmac100.c
index 835b7816e372..87236206366f 100644
--- a/drivers/net/ethernet/faraday/ftgmac100.c
+++ b/drivers/net/ethernet/faraday/ftgmac100.c
@@ -1731,7 +1731,7 @@ static int ftgmac100_setup_clk(struct ftgmac100 *priv)
if (rc)
goto cleanup_clk;
- /* RCLK is for RMII, typically used for NCSI. Optional because its not
+ /* RCLK is for RMII, typically used for NCSI. Optional because it's not
* necessary if it's the AST2400 MAC, or the MAC is configured for
* RGMII, or the controller is not an ASPEED-based controller.
*/
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
index 2f76908cae73..51117a5a6bbf 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_flower.c
@@ -150,14 +150,20 @@ static int mlxsw_sp_flower_parse_actions(struct mlxsw_sp *mlxsw_sp,
u8 prio = act->vlan.prio;
u16 vid = act->vlan.vid;
- return mlxsw_sp_acl_rulei_act_vlan(mlxsw_sp, rulei,
- act->id, vid,
- proto, prio, extack);
+ err = mlxsw_sp_acl_rulei_act_vlan(mlxsw_sp, rulei,
+ act->id, vid,
+ proto, prio, extack);
+ if (err)
+ return err;
+ break;
}
case FLOW_ACTION_PRIORITY:
- return mlxsw_sp_acl_rulei_act_priority(mlxsw_sp, rulei,
- act->priority,
- extack);
+ err = mlxsw_sp_acl_rulei_act_priority(mlxsw_sp, rulei,
+ act->priority,
+ extack);
+ if (err)
+ return err;
+ break;
case FLOW_ACTION_MANGLE: {
enum flow_action_mangle_base htype = act->mangle.htype;
__be32 be_mask = (__force __be32) act->mangle.mask;
diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c b/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c
index 9096ffd89e50..fbf714d027d8 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_trap.c
@@ -643,7 +643,7 @@ static int mlxsw_sp_trap_policer_bs(u64 burst, u8 *p_burst_size,
{
int bs = fls64(burst) - 1;
- if (burst != (1 << bs)) {
+ if (burst != (BIT_ULL(bs))) {
NL_SET_ERR_MSG_MOD(extack, "Policer burst size is not power of two");
return -EINVAL;
}
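
[Editor's note: the switch to BIT_ULL() fixes a 32-bit shift on a 64-bit quantity: burst is u64, so for bursts of 2^31 and above "1 << bs" overflows int and the power-of-two check misfires. Worked through for one value:]

u64 burst = 1ULL << 32;
int bs = fls64(burst) - 1;		/* 32 */

/* (1 << 32) is undefined in 32-bit int arithmetic; BIT_ULL(32) is
 * 1ULL << 32, so the comparison stays correct. */
bool ok = (burst == BIT_ULL(bs));	/* true */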
diff --git a/drivers/net/ethernet/qlogic/qed/qed_l2.c b/drivers/net/ethernet/qlogic/qed/qed_l2.c
index 1a5fc2ae351c..29810a1aa210 100644
--- a/drivers/net/ethernet/qlogic/qed/qed_l2.c
+++ b/drivers/net/ethernet/qlogic/qed/qed_l2.c
@@ -369,8 +369,8 @@ int qed_sp_eth_vport_start(struct qed_hwfn *p_hwfn,
struct qed_spq_entry *p_ent = NULL;
struct qed_sp_init_data init_data;
u8 abs_vport_id = 0;
- int rc = -EINVAL;
u16 rx_mode = 0;
+ int rc;
rc = qed_fw_vport(p_hwfn, p_params->vport_id, &abs_vport_id);
if (rc)
diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
index 1305522f72d6..40efe60eff8d 100644
--- a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
+++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
@@ -282,7 +282,6 @@ static int rmnet_changelink(struct net_device *dev, struct nlattr *tb[],
{
struct rmnet_priv *priv = netdev_priv(dev);
struct net_device *real_dev;
- struct rmnet_endpoint *ep;
struct rmnet_port *port;
u16 mux_id;
@@ -297,19 +296,27 @@ static int rmnet_changelink(struct net_device *dev, struct nlattr *tb[],
if (data[IFLA_RMNET_MUX_ID]) {
mux_id = nla_get_u16(data[IFLA_RMNET_MUX_ID]);
- if (rmnet_get_endpoint(port, mux_id)) {
- NL_SET_ERR_MSG_MOD(extack, "MUX ID already exists");
- return -EINVAL;
- }
- ep = rmnet_get_endpoint(port, priv->mux_id);
- if (!ep)
- return -ENODEV;
- hlist_del_init_rcu(&ep->hlnode);
- hlist_add_head_rcu(&ep->hlnode, &port->muxed_ep[mux_id]);
+ if (mux_id != priv->mux_id) {
+ struct rmnet_endpoint *ep;
+
+ ep = rmnet_get_endpoint(port, priv->mux_id);
+ if (!ep)
+ return -ENODEV;
- ep->mux_id = mux_id;
- priv->mux_id = mux_id;
+ if (rmnet_get_endpoint(port, mux_id)) {
+ NL_SET_ERR_MSG_MOD(extack,
+ "MUX ID already exists");
+ return -EINVAL;
+ }
+
+ hlist_del_init_rcu(&ep->hlnode);
+ hlist_add_head_rcu(&ep->hlnode,
+ &port->muxed_ep[mux_id]);
+
+ ep->mux_id = mux_id;
+ priv->mux_id = mux_id;
+ }
}
if (data[IFLA_RMNET_FLAGS]) {
diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
index 55cb5730beb6..bf5bf05970a2 100644
--- a/drivers/net/ethernet/realtek/r8169_main.c
+++ b/drivers/net/ethernet/realtek/r8169_main.c
@@ -5441,9 +5441,8 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
netif_napi_add(dev, &tp->napi, rtl8169_poll, NAPI_POLL_WEIGHT);
- dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO |
- NETIF_F_RXCSUM | NETIF_F_HW_VLAN_CTAG_TX |
- NETIF_F_HW_VLAN_CTAG_RX;
+ dev->hw_features = NETIF_F_IP_CSUM | NETIF_F_RXCSUM |
+ NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX;
dev->vlan_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO |
NETIF_F_HIGHDMA;
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
@@ -5460,26 +5459,26 @@ static int rtl_init_one(struct pci_dev *pdev, const struct pci_device_id *ent)
/* Disallow toggling */
dev->hw_features &= ~NETIF_F_HW_VLAN_CTAG_RX;
+ if (rtl_chip_supports_csum_v2(tp))
+ dev->hw_features |= NETIF_F_IPV6_CSUM;
+
+ dev->features |= dev->hw_features;
+
+ /* There has been a number of reports that using SG/TSO results in
+ * tx timeouts. However for a lot of people SG/TSO works fine.
+ * Therefore disable both features by default, but allow users to
+ * enable them. Use at own risk!
+ */
if (rtl_chip_supports_csum_v2(tp)) {
- dev->hw_features |= NETIF_F_IPV6_CSUM | NETIF_F_TSO6;
+ dev->hw_features |= NETIF_F_SG | NETIF_F_TSO | NETIF_F_TSO6;
dev->gso_max_size = RTL_GSO_MAX_SIZE_V2;
dev->gso_max_segs = RTL_GSO_MAX_SEGS_V2;
} else {
+ dev->hw_features |= NETIF_F_SG | NETIF_F_TSO;
dev->gso_max_size = RTL_GSO_MAX_SIZE_V1;
dev->gso_max_segs = RTL_GSO_MAX_SEGS_V1;
}
- /* RTL8168e-vl and one RTL8168c variant are known to have a
- * HW issue with TSO.
- */
- if (tp->mac_version == RTL_GIGA_MAC_VER_34 ||
- tp->mac_version == RTL_GIGA_MAC_VER_22) {
- dev->vlan_features &= ~(NETIF_F_ALL_TSO | NETIF_F_SG);
- dev->hw_features &= ~(NETIF_F_ALL_TSO | NETIF_F_SG);
- }
-
- dev->features |= dev->hw_features;
-
dev->hw_features |= NETIF_F_RXALL;
dev->hw_features |= NETIF_F_RXFCS;
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
index 542784300620..efc6ec1b8027 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
@@ -207,7 +207,7 @@ static void dwmac1000_set_filter(struct mac_device_info *hw,
reg++;
}
- while (reg <= perfect_addr_number) {
+ while (reg < perfect_addr_number) {
writel(0, ioaddr + GMAC_ADDR_HIGH(reg));
writel(0, ioaddr + GMAC_ADDR_LOW(reg));
reg++;
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c
index 0e4575f7bedb..ad4df9bddcf3 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c
@@ -577,8 +577,13 @@ static void dwxgmac2_update_vlan_hash(struct mac_device_info *hw, u32 hash,
value |= XGMAC_VLAN_EDVLP;
value |= XGMAC_VLAN_ESVL;
value |= XGMAC_VLAN_DOVLTC;
+ } else {
+ value &= ~XGMAC_VLAN_EDVLP;
+ value &= ~XGMAC_VLAN_ESVL;
+ value &= ~XGMAC_VLAN_DOVLTC;
}
+ value &= ~XGMAC_VLAN_VID;
writel(value, ioaddr + XGMAC_VLAN_TAG);
} else if (perfect_match) {
u32 value = readl(ioaddr + XGMAC_PACKET_FILTER);
@@ -589,13 +594,19 @@ static void dwxgmac2_update_vlan_hash(struct mac_device_info *hw, u32 hash,
value = readl(ioaddr + XGMAC_VLAN_TAG);
+ value &= ~XGMAC_VLAN_VTHM;
value |= XGMAC_VLAN_ETV;
if (is_double) {
value |= XGMAC_VLAN_EDVLP;
value |= XGMAC_VLAN_ESVL;
value |= XGMAC_VLAN_DOVLTC;
+ } else {
+ value &= ~XGMAC_VLAN_EDVLP;
+ value &= ~XGMAC_VLAN_ESVL;
+ value &= ~XGMAC_VLAN_DOVLTC;
}
+ value &= ~XGMAC_VLAN_VID;
writel(value | perfect_match, ioaddr + XGMAC_VLAN_TAG);
} else {
u32 value = readl(ioaddr + XGMAC_PACKET_FILTER);
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 2fb671e61ee8..e6898fd5223f 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -4566,9 +4566,13 @@ static int stmmac_vlan_rx_add_vid(struct net_device *ndev, __be16 proto, u16 vid
return ret;
}
- ret = stmmac_add_hw_vlan_rx_fltr(priv, ndev, priv->hw, proto, vid);
+ if (priv->hw->num_vlan) {
+ ret = stmmac_add_hw_vlan_rx_fltr(priv, ndev, priv->hw, proto, vid);
+ if (ret)
+ return ret;
+ }
- return ret;
+ return 0;
}
static int stmmac_vlan_rx_kill_vid(struct net_device *ndev, __be16 proto, u16 vid)
@@ -4581,9 +4585,12 @@ static int stmmac_vlan_rx_kill_vid(struct net_device *ndev, __be16 proto, u16 vi
is_double = true;
clear_bit(vid, priv->active_vlans);
- ret = stmmac_del_hw_vlan_rx_fltr(priv, ndev, priv->hw, proto, vid);
- if (ret)
- return ret;
+
+ if (priv->hw->num_vlan) {
+ ret = stmmac_del_hw_vlan_rx_fltr(priv, ndev, priv->hw, proto, vid);
+ if (ret)
+ return ret;
+ }
return stmmac_vlan_update(priv, is_double);
}
diff --git a/drivers/net/macsec.c b/drivers/net/macsec.c
index da82d7f16a09..0d580d81d910 100644
--- a/drivers/net/macsec.c
+++ b/drivers/net/macsec.c
@@ -2594,6 +2594,9 @@ static int macsec_upd_offload(struct sk_buff *skb, struct genl_info *info)
return PTR_ERR(dev);
macsec = macsec_priv(dev);
+ if (!tb_offload[MACSEC_OFFLOAD_ATTR_TYPE])
+ return -EINVAL;
+
offload = nla_get_u8(tb_offload[MACSEC_OFFLOAD_ATTR_TYPE]);
if (macsec->offload == offload)
return 0;
diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c
index 481cf48c9b9e..31f731e6df72 100644
--- a/drivers/net/phy/at803x.c
+++ b/drivers/net/phy/at803x.c
@@ -425,8 +425,8 @@ static int at803x_parse_dt(struct phy_device *phydev)
*/
if (at803x_match_phy_id(phydev, ATH8030_PHY_ID) ||
at803x_match_phy_id(phydev, ATH8035_PHY_ID)) {
- priv->clk_25m_reg &= ~AT8035_CLK_OUT_MASK;
- priv->clk_25m_mask &= ~AT8035_CLK_OUT_MASK;
+ priv->clk_25m_reg &= AT8035_CLK_OUT_MASK;
+ priv->clk_25m_mask &= AT8035_CLK_OUT_MASK;
}
}
diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
index 2ec19e5540bf..05d20343b816 100644
--- a/drivers/net/phy/micrel.c
+++ b/drivers/net/phy/micrel.c
@@ -25,6 +25,7 @@
#include <linux/micrel_phy.h>
#include <linux/of.h>
#include <linux/clk.h>
+#include <linux/delay.h>
/* Operation Mode Strap Override */
#define MII_KSZPHY_OMSO 0x16
@@ -952,6 +953,12 @@ static int kszphy_resume(struct phy_device *phydev)
genphy_resume(phydev);
+ /* After switching from power-down to normal mode, an internal global
+ * reset is automatically generated. Wait a minimum of 1 ms before
+ * read/write access to the PHY registers.
+ */
+ usleep_range(1000, 2000);
+
ret = kszphy_config_reset(phydev);
if (ret)
return ret;
diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index 228fe449dc6d..07476c6510f2 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1678,8 +1678,12 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
alloc_frag->offset += buflen;
}
err = tun_xdp_act(tun, xdp_prog, &xdp, act);
- if (err < 0)
- goto err_xdp;
+ if (err < 0) {
+ if (act == XDP_REDIRECT || act == XDP_TX)
+ put_page(alloc_frag->page);
+ goto out;
+ }
+
if (err == XDP_REDIRECT)
xdp_do_flush();
if (err != XDP_PASS)
@@ -1693,8 +1697,6 @@ static struct sk_buff *tun_build_skb(struct tun_struct *tun,
return __tun_build_skb(tfile, alloc_frag, buf, buflen, len, pad);
-err_xdp:
- put_page(alloc_frag->page);
out:
rcu_read_unlock();
local_bh_enable();
diff --git a/drivers/net/usb/pegasus.c b/drivers/net/usb/pegasus.c
index 8783e2ab3ec0..0ef7e1f443e3 100644
--- a/drivers/net/usb/pegasus.c
+++ b/drivers/net/usb/pegasus.c
@@ -54,6 +54,7 @@ static const char driver_name[] = "pegasus";
#undef PEGASUS_WRITE_EEPROM
#define BMSR_MEDIA (BMSR_10HALF | BMSR_10FULL | BMSR_100HALF | \
BMSR_100FULL | BMSR_ANEGCAPABLE)
+#define CARRIER_CHECK_DELAY (2 * HZ)
static bool loopback;
static bool mii_mode;
@@ -1089,17 +1090,12 @@ static inline void setup_pegasus_II(pegasus_t *pegasus)
set_register(pegasus, Reg81, 2);
}
-
-static int pegasus_count;
-static struct workqueue_struct *pegasus_workqueue;
-#define CARRIER_CHECK_DELAY (2 * HZ)
-
static void check_carrier(struct work_struct *work)
{
pegasus_t *pegasus = container_of(work, pegasus_t, carrier_check.work);
set_carrier(pegasus->net);
if (!(pegasus->flags & PEGASUS_UNPLUG)) {
- queue_delayed_work(pegasus_workqueue, &pegasus->carrier_check,
+ queue_delayed_work(system_long_wq, &pegasus->carrier_check,
CARRIER_CHECK_DELAY);
}
}
@@ -1120,18 +1116,6 @@ static int pegasus_blacklisted(struct usb_device *udev)
return 0;
}
-/* we rely on probe() and remove() being serialized so we
- * don't need extra locking on pegasus_count.
- */
-static void pegasus_dec_workqueue(void)
-{
- pegasus_count--;
- if (pegasus_count == 0) {
- destroy_workqueue(pegasus_workqueue);
- pegasus_workqueue = NULL;
- }
-}
-
static int pegasus_probe(struct usb_interface *intf,
const struct usb_device_id *id)
{
@@ -1144,14 +1128,6 @@ static int pegasus_probe(struct usb_interface *intf,
if (pegasus_blacklisted(dev))
return -ENODEV;
- if (pegasus_count == 0) {
- pegasus_workqueue = alloc_workqueue("pegasus", WQ_MEM_RECLAIM,
- 0);
- if (!pegasus_workqueue)
- return -ENOMEM;
- }
- pegasus_count++;
-
net = alloc_etherdev(sizeof(struct pegasus));
if (!net)
goto out;
@@ -1209,7 +1185,7 @@ static int pegasus_probe(struct usb_interface *intf,
res = register_netdev(net);
if (res)
goto out3;
- queue_delayed_work(pegasus_workqueue, &pegasus->carrier_check,
+ queue_delayed_work(system_long_wq, &pegasus->carrier_check,
CARRIER_CHECK_DELAY);
dev_info(&intf->dev, "%s, %s, %pM\n", net->name,
usb_dev_id[dev_index].name, net->dev_addr);
@@ -1222,7 +1198,6 @@ out2:
out1:
free_netdev(net);
out:
- pegasus_dec_workqueue();
return res;
}
@@ -1237,7 +1212,7 @@ static void pegasus_disconnect(struct usb_interface *intf)
}
pegasus->flags |= PEGASUS_UNPLUG;
- cancel_delayed_work(&pegasus->carrier_check);
+ cancel_delayed_work_sync(&pegasus->carrier_check);
unregister_netdev(pegasus->net);
unlink_all_urbs(pegasus);
free_all_urbs(pegasus);
@@ -1246,7 +1221,6 @@ static void pegasus_disconnect(struct usb_interface *intf)
pegasus->rx_skb = NULL;
}
free_netdev(pegasus->net);
- pegasus_dec_workqueue();
}
static int pegasus_suspend(struct usb_interface *intf, pm_message_t message)
@@ -1254,7 +1228,7 @@ static int pegasus_suspend(struct usb_interface *intf, pm_message_t message)
struct pegasus *pegasus = usb_get_intfdata(intf);
netif_device_detach(pegasus->net);
- cancel_delayed_work(&pegasus->carrier_check);
+ cancel_delayed_work_sync(&pegasus->carrier_check);
if (netif_running(pegasus->net)) {
usb_kill_urb(pegasus->rx_urb);
usb_kill_urb(pegasus->intr_urb);
@@ -1276,7 +1250,7 @@ static int pegasus_resume(struct usb_interface *intf)
pegasus->intr_urb->actual_length = 0;
intr_callback(pegasus->intr_urb);
}
- queue_delayed_work(pegasus_workqueue, &pegasus->carrier_check,
+ queue_delayed_work(system_long_wq, &pegasus->carrier_check,
CARRIER_CHECK_DELAY);
return 0;
}
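
[Editor's note: with the private workqueue gone, the carrier check re-arms itself on the shared system_long_wq, and the teardown paths switch to cancel_delayed_work_sync() so a check that is already running finishes before the device state is freed. The resulting pattern, sketched for a hypothetical driver:]

struct my_dev {
	struct delayed_work carrier_check;
	struct net_device *net;
};

static void my_carrier_check(struct work_struct *work)
{
	struct my_dev *dev = container_of(work, struct my_dev,
					  carrier_check.work);

	/* ... poll link state on dev->net ... */

	/* re-arm on the shared long-running system workqueue */
	queue_delayed_work(system_long_wq, &dev->carrier_check, 2 * HZ);
}

static void my_disconnect(struct my_dev *dev)
{
	/* _sync waits for a running check, so it cannot touch dev
	 * after this returns */
	cancel_delayed_work_sync(&dev->carrier_check);
}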
diff --git a/drivers/net/wimax/i2400m/driver.c b/drivers/net/wimax/i2400m/driver.c
index f66c0f8f6f4a..ecb3fccca603 100644
--- a/drivers/net/wimax/i2400m/driver.c
+++ b/drivers/net/wimax/i2400m/driver.c
@@ -740,9 +740,6 @@ EXPORT_SYMBOL_GPL(i2400m_error_recovery);
static
int i2400m_bm_buf_alloc(struct i2400m *i2400m)
{
- int result;
-
- result = -ENOMEM;
i2400m->bm_cmd_buf = kzalloc(I2400M_BM_CMD_BUF_SIZE, GFP_KERNEL);
if (i2400m->bm_cmd_buf == NULL)
goto error_bm_cmd_kzalloc;
@@ -754,7 +751,7 @@ int i2400m_bm_buf_alloc(struct i2400m *i2400m)
error_bm_ack_buf_kzalloc:
kfree(i2400m->bm_cmd_buf);
error_bm_cmd_kzalloc:
- return result;
+ return -ENOMEM;
}
@@ -843,7 +840,7 @@ EXPORT_SYMBOL_GPL(i2400m_reset);
*/
int i2400m_setup(struct i2400m *i2400m, enum i2400m_bri bm_flags)
{
- int result = -ENODEV;
+ int result;
struct device *dev = i2400m_dev(i2400m);
struct wimax_dev *wimax_dev = &i2400m->wimax_dev;
struct net_device *net_dev = i2400m->wimax_dev.net_dev;
diff --git a/drivers/parisc/eisa.c b/drivers/parisc/eisa.c
index 9d00a24277aa..f96e5eaee87e 100644
--- a/drivers/parisc/eisa.c
+++ b/drivers/parisc/eisa.c
@@ -243,11 +243,6 @@ static irqreturn_t dummy_irq2_handler(int _, void *dev)
return IRQ_HANDLED;
}
-static struct irqaction irq2_action = {
- .handler = dummy_irq2_handler,
- .name = "cascade",
-};
-
static void init_eisa_pic(void)
{
unsigned long flags;
@@ -335,7 +330,8 @@ static int __init eisa_probe(struct parisc_device *dev)
}
/* Reserve IRQ2 */
- setup_irq(2, &irq2_action);
+ if (request_irq(2, dummy_irq2_handler, 0, "cascade", NULL))
+ pr_err("Failed to request irq 2 (cascade)\n");
for (i = 0; i < 16; i++) {
irq_set_chip_and_handler(i, &eisa_interrupt_type,
handle_simple_irq);
diff --git a/drivers/pcmcia/cs_internal.h b/drivers/pcmcia/cs_internal.h
index 33c9b6ea7364..fb9b17fa0fb5 100644
--- a/drivers/pcmcia/cs_internal.h
+++ b/drivers/pcmcia/cs_internal.h
@@ -40,7 +40,7 @@ struct cis_cache_entry {
unsigned int addr;
unsigned int len;
unsigned int attr;
- unsigned char cache[0];
+ unsigned char cache[];
};
struct pccard_resource_ops {
diff --git a/drivers/pcmcia/omap_cf.c b/drivers/pcmcia/omap_cf.c
index 0a04eb04f3a2..d3ef5534991e 100644
--- a/drivers/pcmcia/omap_cf.c
+++ b/drivers/pcmcia/omap_cf.c
@@ -329,7 +329,7 @@ static int __exit omap_cf_remove(struct platform_device *pdev)
static struct platform_driver omap_cf_driver = {
.driver = {
- .name = (char *) driver_name,
+ .name = driver_name,
},
.remove = __exit_p(omap_cf_remove),
};
diff --git a/drivers/pcmcia/rsrc_nonstatic.c b/drivers/pcmcia/rsrc_nonstatic.c
index 9e6922c08ef6..3b05760e69d6 100644
--- a/drivers/pcmcia/rsrc_nonstatic.c
+++ b/drivers/pcmcia/rsrc_nonstatic.c
@@ -1076,7 +1076,7 @@ static ssize_t show_io_db(struct device *dev,
for (p = data->io_db.next; p != &data->io_db; p = p->next) {
if (ret > (PAGE_SIZE - 10))
continue;
- ret += snprintf(&buf[ret], (PAGE_SIZE - ret - 1),
+ ret += scnprintf(&buf[ret], (PAGE_SIZE - ret - 1),
"0x%08lx - 0x%08lx\n",
((unsigned long) p->base),
((unsigned long) p->base + p->num - 1));
@@ -1133,7 +1133,7 @@ static ssize_t show_mem_db(struct device *dev,
p = p->next) {
if (ret > (PAGE_SIZE - 10))
continue;
- ret += snprintf(&buf[ret], (PAGE_SIZE - ret - 1),
+ ret += scnprintf(&buf[ret], (PAGE_SIZE - ret - 1),
"0x%08lx - 0x%08lx\n",
((unsigned long) p->base),
((unsigned long) p->base + p->num - 1));
@@ -1142,7 +1142,7 @@ static ssize_t show_mem_db(struct device *dev,
for (p = data->mem_db.next; p != &data->mem_db; p = p->next) {
if (ret > (PAGE_SIZE - 10))
continue;
- ret += snprintf(&buf[ret], (PAGE_SIZE - ret - 1),
+ ret += scnprintf(&buf[ret], (PAGE_SIZE - ret - 1),
"0x%08lx - 0x%08lx\n",
((unsigned long) p->base),
((unsigned long) p->base + p->num - 1));
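
[Editor's note: these sysfs show() routines accumulate output with "ret += ...". snprintf() returns the length that would have been written, so once the page fills up the running offset can step past the end of the buffer, while scnprintf() returns the bytes actually stored and keeps the offset in bounds. The same swap is applied to yenta_socket.c below. In general, with nentries as a placeholder:]

static ssize_t show_entries(struct device *dev, struct device_attribute *attr,
			    char *buf)
{
	int off = 0, i;

	for (i = 0; i < nentries; i++)
		off += scnprintf(buf + off, PAGE_SIZE - off, "entry %d\n", i);

	return off;	/* scnprintf() keeps off within the page */
}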
diff --git a/drivers/pcmcia/sa1100_simpad.c b/drivers/pcmcia/sa1100_simpad.c
index e2e8729afd9d..784ada5b8c4f 100644
--- a/drivers/pcmcia/sa1100_simpad.c
+++ b/drivers/pcmcia/sa1100_simpad.c
@@ -14,7 +14,7 @@
#include <asm/mach-types.h>
#include <mach/simpad.h>
#include "sa1100_generic.h"
-
+
static int simpad_pcmcia_hw_init(struct soc_pcmcia_socket *skt)
{
@@ -66,7 +66,7 @@ simpad_pcmcia_configure_socket(struct soc_pcmcia_socket *skt,
simpad_clear_cs3_bit(VCC_3V_EN|VCC_5V_EN|EN0|EN1);
break;
- case 33:
+ case 33:
simpad_clear_cs3_bit(VCC_3V_EN|EN1);
simpad_set_cs3_bit(VCC_5V_EN|EN0);
break;
@@ -95,7 +95,7 @@ static void simpad_pcmcia_socket_suspend(struct soc_pcmcia_socket *skt)
simpad_set_cs3_bit(PCMCIA_RESET);
}
-static struct pcmcia_low_level simpad_pcmcia_ops = {
+static struct pcmcia_low_level simpad_pcmcia_ops = {
.owner = THIS_MODULE,
.hw_init = simpad_pcmcia_hw_init,
.hw_shutdown = simpad_pcmcia_hw_shutdown,
diff --git a/drivers/pcmcia/soc_common.h b/drivers/pcmcia/soc_common.h
index b7f993f1bbd0..222e81c79365 100644
--- a/drivers/pcmcia/soc_common.h
+++ b/drivers/pcmcia/soc_common.h
@@ -88,7 +88,7 @@ struct soc_pcmcia_socket {
struct skt_dev_info {
int nskt;
- struct soc_pcmcia_socket skt[0];
+ struct soc_pcmcia_socket skt[];
};
struct pcmcia_state {
diff --git a/drivers/pcmcia/yenta_socket.c b/drivers/pcmcia/yenta_socket.c
index 49b1c6a1bdbe..bf6529b0b5b0 100644
--- a/drivers/pcmcia/yenta_socket.c
+++ b/drivers/pcmcia/yenta_socket.c
@@ -180,12 +180,12 @@ static ssize_t show_yenta_registers(struct device *yentadev, struct device_attri
for (i = 0; i < 0x24; i += 4) {
unsigned val;
if (!(i & 15))
- offset += snprintf(buf + offset, PAGE_SIZE - offset, "\n%02x:", i);
+ offset += scnprintf(buf + offset, PAGE_SIZE - offset, "\n%02x:", i);
val = cb_readl(socket, i);
- offset += snprintf(buf + offset, PAGE_SIZE - offset, " %08x", val);
+ offset += scnprintf(buf + offset, PAGE_SIZE - offset, " %08x", val);
}
- offset += snprintf(buf + offset, PAGE_SIZE - offset, "\n\nExCA registers:");
+ offset += scnprintf(buf + offset, PAGE_SIZE - offset, "\n\nExCA registers:");
for (i = 0; i < 0x45; i++) {
unsigned char val;
if (!(i & 7)) {
@@ -193,10 +193,10 @@ static ssize_t show_yenta_registers(struct device *yentadev, struct device_attri
memcpy(buf + offset, " -", 2);
offset += 2;
} else
- offset += snprintf(buf + offset, PAGE_SIZE - offset, "\n%02x:", i);
+ offset += scnprintf(buf + offset, PAGE_SIZE - offset, "\n%02x:", i);
}
val = exca_readb(socket, i);
- offset += snprintf(buf + offset, PAGE_SIZE - offset, " %02x", val);
+ offset += scnprintf(buf + offset, PAGE_SIZE - offset, " %02x", val);
}
buf[offset++] = '\n';
return offset;
diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig
index f0faada30f30..bb68d21e1f8c 100644
--- a/fs/f2fs/Kconfig
+++ b/fs/f2fs/Kconfig
@@ -118,3 +118,12 @@ config F2FS_FS_LZ4
default y
help
Support LZ4 compress algorithm, if unsure, say Y.
+
+config F2FS_FS_ZSTD
+ bool "ZSTD compression support"
+ depends on F2FS_FS_COMPRESSION
+ select ZSTD_COMPRESS
+ select ZSTD_DECOMPRESS
+ default y
+ help
+ Support ZSTD compress algorithm, if unsure, say Y.
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 44e84ac5c941..852890b72d6a 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -50,9 +50,6 @@ repeat:
return page;
}
-/*
- * We guarantee no failure on the returned page.
- */
static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index,
bool is_meta)
{
@@ -206,7 +203,7 @@ bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi,
}
/*
- * Readahead CP/NAT/SIT/SSA pages
+ * Readahead CP/NAT/SIT/SSA/POR pages
*/
int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages,
int type, bool sync)
@@ -898,7 +895,7 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi)
return -ENOMEM;
/*
* Finding out valid cp block involves read both
- * sets( cp pack1 and cp pack 2)
+ * sets( cp pack 1 and cp pack 2)
*/
cp_start_blk_no = le32_to_cpu(fsb->cp_blkaddr);
cp1 = validate_checkpoint(sbi, cp_start_blk_no, &cp1_version);
@@ -1250,20 +1247,20 @@ static void unblock_operations(struct f2fs_sb_info *sbi)
f2fs_unlock_all(sbi);
}
-void f2fs_wait_on_all_pages_writeback(struct f2fs_sb_info *sbi)
+void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type)
{
DEFINE_WAIT(wait);
for (;;) {
prepare_to_wait(&sbi->cp_wait, &wait, TASK_UNINTERRUPTIBLE);
- if (!get_pages(sbi, F2FS_WB_CP_DATA))
+ if (!get_pages(sbi, type))
break;
if (unlikely(f2fs_cp_error(sbi)))
break;
- io_schedule_timeout(5*HZ);
+ io_schedule_timeout(DEFAULT_IO_TIMEOUT);
}
finish_wait(&sbi->cp_wait, &wait);
}
@@ -1301,10 +1298,14 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc)
else
__clear_ckpt_flags(ckpt, CP_ORPHAN_PRESENT_FLAG);
- if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) ||
- is_sbi_flag_set(sbi, SBI_IS_RESIZEFS))
+ if (is_sbi_flag_set(sbi, SBI_NEED_FSCK))
__set_ckpt_flags(ckpt, CP_FSCK_FLAG);
+ if (is_sbi_flag_set(sbi, SBI_IS_RESIZEFS))
+ __set_ckpt_flags(ckpt, CP_RESIZEFS_FLAG);
+ else
+ __clear_ckpt_flags(ckpt, CP_RESIZEFS_FLAG);
+
if (is_sbi_flag_set(sbi, SBI_CP_DISABLED))
__set_ckpt_flags(ckpt, CP_DISABLED_FLAG);
else
@@ -1384,13 +1385,8 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* Flush all the NAT/SIT pages */
f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
- f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_META) &&
- !f2fs_cp_error(sbi));
- /*
- * modify checkpoint
- * version number is already updated
- */
+ /* start to update checkpoint, cp ver is already updated previously */
ckpt->elapsed_time = cpu_to_le64(get_mtime(sbi, true));
ckpt->free_segment_count = cpu_to_le32(free_segments(sbi));
for (i = 0; i < NR_CURSEG_NODE_TYPE; i++) {
@@ -1493,11 +1489,11 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* Here, we have one bio having CP pack except cp pack 2 page */
f2fs_sync_meta_pages(sbi, META, LONG_MAX, FS_CP_META_IO);
- f2fs_bug_on(sbi, get_pages(sbi, F2FS_DIRTY_META) &&
- !f2fs_cp_error(sbi));
+ /* Wait for all dirty meta pages to be submitted for IO */
+ f2fs_wait_on_all_pages(sbi, F2FS_DIRTY_META);
/* wait for previous submitted meta pages writeback */
- f2fs_wait_on_all_pages_writeback(sbi);
+ f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA);
/* flush all device cache */
err = f2fs_flush_device_cache(sbi);
@@ -1506,7 +1502,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
/* barrier and flush checkpoint cp pack 2 page if it can */
commit_checkpoint(sbi, ckpt, start_blk);
- f2fs_wait_on_all_pages_writeback(sbi);
+ f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA);
/*
* invalidate intermediate page cache borrowed from meta inode which are
@@ -1543,9 +1539,6 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
return unlikely(f2fs_cp_error(sbi)) ? -EIO : 0;
}
-/*
- * We guarantee that this checkpoint procedure will not fail.
- */
int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
{
struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi);
@@ -1613,7 +1606,6 @@ int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc)
f2fs_flush_sit_entries(sbi, cpc);
- /* unlock all the fs_lock[] in do_checkpoint() */
err = do_checkpoint(sbi, cpc);
if (err)
f2fs_release_discard_addrs(sbi);
@@ -1626,7 +1618,7 @@ stop:
if (cpc->reason & CP_RECOVERY)
f2fs_notice(sbi, "checkpoint: version = %llx", ckpt_ver);
- /* do checkpoint periodically */
+ /* update CP_TIME to trigger checkpoint periodically */
f2fs_update_time(sbi, CP_TIME);
trace_f2fs_write_checkpoint(sbi->sb, cpc->reason, "finish checkpoint");
out:
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index d8a64be90a50..df7b2d15eacd 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -11,6 +11,7 @@
#include <linux/backing-dev.h>
#include <linux/lzo.h>
#include <linux/lz4.h>
+#include <linux/zstd.h>
#include "f2fs.h"
#include "node.h"
@@ -20,6 +21,8 @@ struct f2fs_compress_ops {
int (*init_compress_ctx)(struct compress_ctx *cc);
void (*destroy_compress_ctx)(struct compress_ctx *cc);
int (*compress_pages)(struct compress_ctx *cc);
+ int (*init_decompress_ctx)(struct decompress_io_ctx *dic);
+ void (*destroy_decompress_ctx)(struct decompress_io_ctx *dic);
int (*decompress_pages)(struct decompress_io_ctx *dic);
};
@@ -52,7 +55,7 @@ bool f2fs_is_compressed_page(struct page *page)
}
static void f2fs_set_compressed_page(struct page *page,
- struct inode *inode, pgoff_t index, void *data, refcount_t *r)
+ struct inode *inode, pgoff_t index, void *data)
{
SetPagePrivate(page);
set_page_private(page, (unsigned long)data);
@@ -60,8 +63,6 @@ static void f2fs_set_compressed_page(struct page *page,
/* i_crypto_info and iv index */
page->index = index;
page->mapping = inode->i_mapping;
- if (r)
- refcount_inc(r);
}
static void f2fs_put_compressed_page(struct page *page)
@@ -291,6 +292,165 @@ static const struct f2fs_compress_ops f2fs_lz4_ops = {
};
#endif
+#ifdef CONFIG_F2FS_FS_ZSTD
+#define F2FS_ZSTD_DEFAULT_CLEVEL 1
+
+static int zstd_init_compress_ctx(struct compress_ctx *cc)
+{
+ ZSTD_parameters params;
+ ZSTD_CStream *stream;
+ void *workspace;
+ unsigned int workspace_size;
+
+ params = ZSTD_getParams(F2FS_ZSTD_DEFAULT_CLEVEL, cc->rlen, 0);
+ workspace_size = ZSTD_CStreamWorkspaceBound(params.cParams);
+
+ workspace = f2fs_kvmalloc(F2FS_I_SB(cc->inode),
+ workspace_size, GFP_NOFS);
+ if (!workspace)
+ return -ENOMEM;
+
+ stream = ZSTD_initCStream(params, 0, workspace, workspace_size);
+ if (!stream) {
+ printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_initCStream failed\n",
+ KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id,
+ __func__);
+ kvfree(workspace);
+ return -EIO;
+ }
+
+ cc->private = workspace;
+ cc->private2 = stream;
+
+ cc->clen = cc->rlen - PAGE_SIZE - COMPRESS_HEADER_SIZE;
+ return 0;
+}
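
The clen target set just above encodes the profitability rule for compression (an
illustrative reading, not text from the patch): for a cluster of rlen bytes the
compressed payload plus the on-disk header must fit in at most one page less than
the raw cluster, i.e.

	clen + COMPRESS_HEADER_SIZE <= rlen - PAGE_SIZE

With 4 KiB pages and the minimum 4-page cluster that leaves zstd a budget of
16 KiB - 4 KiB - sizeof(struct compress_data), so the cluster is written
uncompressed unless at least one full page is saved.
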
+
+static void zstd_destroy_compress_ctx(struct compress_ctx *cc)
+{
+ kvfree(cc->private);
+ cc->private = NULL;
+ cc->private2 = NULL;
+}
+
+static int zstd_compress_pages(struct compress_ctx *cc)
+{
+ ZSTD_CStream *stream = cc->private2;
+ ZSTD_inBuffer inbuf;
+ ZSTD_outBuffer outbuf;
+ int src_size = cc->rlen;
+ int dst_size = src_size - PAGE_SIZE - COMPRESS_HEADER_SIZE;
+ int ret;
+
+ inbuf.pos = 0;
+ inbuf.src = cc->rbuf;
+ inbuf.size = src_size;
+
+ outbuf.pos = 0;
+ outbuf.dst = cc->cbuf->cdata;
+ outbuf.size = dst_size;
+
+ ret = ZSTD_compressStream(stream, &outbuf, &inbuf);
+ if (ZSTD_isError(ret)) {
+ printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_compressStream failed, ret: %d\n",
+ KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id,
+ __func__, ZSTD_getErrorCode(ret));
+ return -EIO;
+ }
+
+ ret = ZSTD_endStream(stream, &outbuf);
+ if (ZSTD_isError(ret)) {
+ printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_endStream returned %d\n",
+ KERN_ERR, F2FS_I_SB(cc->inode)->sb->s_id,
+ __func__, ZSTD_getErrorCode(ret));
+ return -EIO;
+ }
+
+ cc->clen = outbuf.pos;
+ return 0;
+}
+
+static int zstd_init_decompress_ctx(struct decompress_io_ctx *dic)
+{
+ ZSTD_DStream *stream;
+ void *workspace;
+ unsigned int workspace_size;
+
+ workspace_size = ZSTD_DStreamWorkspaceBound(MAX_COMPRESS_WINDOW_SIZE);
+
+ workspace = f2fs_kvmalloc(F2FS_I_SB(dic->inode),
+ workspace_size, GFP_NOFS);
+ if (!workspace)
+ return -ENOMEM;
+
+ stream = ZSTD_initDStream(MAX_COMPRESS_WINDOW_SIZE,
+ workspace, workspace_size);
+ if (!stream) {
+ printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_initDStream failed\n",
+ KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id,
+ __func__);
+ kvfree(workspace);
+ return -EIO;
+ }
+
+ dic->private = workspace;
+ dic->private2 = stream;
+
+ return 0;
+}
+
+static void zstd_destroy_decompress_ctx(struct decompress_io_ctx *dic)
+{
+ kvfree(dic->private);
+ dic->private = NULL;
+ dic->private2 = NULL;
+}
+
+static int zstd_decompress_pages(struct decompress_io_ctx *dic)
+{
+ ZSTD_DStream *stream = dic->private2;
+ ZSTD_inBuffer inbuf;
+ ZSTD_outBuffer outbuf;
+ int ret;
+
+ inbuf.pos = 0;
+ inbuf.src = dic->cbuf->cdata;
+ inbuf.size = dic->clen;
+
+ outbuf.pos = 0;
+ outbuf.dst = dic->rbuf;
+ outbuf.size = dic->rlen;
+
+ ret = ZSTD_decompressStream(stream, &outbuf, &inbuf);
+ if (ZSTD_isError(ret)) {
+ printk_ratelimited("%sF2FS-fs (%s): %s ZSTD_compressStream failed, ret: %d\n",
+ KERN_ERR, F2FS_I_SB(dic->inode)->sb->s_id,
+ __func__, ZSTD_getErrorCode(ret));
+ return -EIO;
+ }
+
+ if (dic->rlen != outbuf.pos) {
+ printk_ratelimited("%sF2FS-fs (%s): %s ZSTD invalid rlen:%zu, "
+ "expected:%lu\n", KERN_ERR,
+ F2FS_I_SB(dic->inode)->sb->s_id,
+ __func__, dic->rlen,
+ PAGE_SIZE << dic->log_cluster_size);
+ return -EIO;
+ }
+
+ return 0;
+}
+
+static const struct f2fs_compress_ops f2fs_zstd_ops = {
+ .init_compress_ctx = zstd_init_compress_ctx,
+ .destroy_compress_ctx = zstd_destroy_compress_ctx,
+ .compress_pages = zstd_compress_pages,
+ .init_decompress_ctx = zstd_init_decompress_ctx,
+ .destroy_decompress_ctx = zstd_destroy_decompress_ctx,
+ .decompress_pages = zstd_decompress_pages,
+};
+#endif
+
static const struct f2fs_compress_ops *f2fs_cops[COMPRESS_MAX] = {
#ifdef CONFIG_F2FS_FS_LZO
&f2fs_lzo_ops,
@@ -302,6 +462,11 @@ static const struct f2fs_compress_ops *f2fs_cops[COMPRESS_MAX] = {
#else
NULL,
#endif
+#ifdef CONFIG_F2FS_FS_ZSTD
+ &f2fs_zstd_ops,
+#else
+ NULL,
+#endif
};
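
The table above is indexed by the per-inode compress_algorithm_type value, so a
kernel built without CONFIG_F2FS_FS_ZSTD simply leaves a NULL slot, which is
presumably what f2fs_is_compress_backend_ready() below rejects. A minimal sketch
of the expected lookup (the helper name is hypothetical, not from the patch):

	static const struct f2fs_compress_ops *get_cops(struct inode *inode)
	{
		/* NULL when the chosen backend was not built in */
		return f2fs_cops[F2FS_I(inode)->i_compress_algorithm];
	}
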
bool f2fs_is_compress_backend_ready(struct inode *inode)
@@ -334,9 +499,11 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
trace_f2fs_compress_pages_start(cc->inode, cc->cluster_idx,
cc->cluster_size, fi->i_compress_algorithm);
- ret = cops->init_compress_ctx(cc);
- if (ret)
- goto out;
+ if (cops->init_compress_ctx) {
+ ret = cops->init_compress_ctx(cc);
+ if (ret)
+ goto out;
+ }
max_len = COMPRESS_HEADER_SIZE + cc->clen;
cc->nr_cpages = DIV_ROUND_UP(max_len, PAGE_SIZE);
@@ -380,21 +547,27 @@ static int f2fs_compress_pages(struct compress_ctx *cc)
}
cc->cbuf->clen = cpu_to_le32(cc->clen);
- cc->cbuf->chksum = cpu_to_le32(0);
for (i = 0; i < COMPRESS_DATA_RESERVED_SIZE; i++)
cc->cbuf->reserved[i] = cpu_to_le32(0);
+ nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE);
+
+ /* zero out any unused part of the last page */
+ memset(&cc->cbuf->cdata[cc->clen], 0,
+ (nr_cpages * PAGE_SIZE) - (cc->clen + COMPRESS_HEADER_SIZE));
+
vunmap(cc->cbuf);
vunmap(cc->rbuf);
- nr_cpages = DIV_ROUND_UP(cc->clen + COMPRESS_HEADER_SIZE, PAGE_SIZE);
-
for (i = nr_cpages; i < cc->nr_cpages; i++) {
f2fs_put_compressed_page(cc->cpages[i]);
cc->cpages[i] = NULL;
}
+ if (cops->destroy_compress_ctx)
+ cops->destroy_compress_ctx(cc);
+
cc->nr_cpages = nr_cpages;
trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx,
@@ -413,7 +586,8 @@ out_free_cpages:
kfree(cc->cpages);
cc->cpages = NULL;
destroy_compress_ctx:
- cops->destroy_compress_ctx(cc);
+ if (cops->destroy_compress_ctx)
+ cops->destroy_compress_ctx(cc);
out:
trace_f2fs_compress_pages_end(cc->inode, cc->cluster_idx,
cc->clen, ret);
@@ -447,10 +621,16 @@ void f2fs_decompress_pages(struct bio *bio, struct page *page, bool verity)
goto out_free_dic;
}
+ if (cops->init_decompress_ctx) {
+ ret = cops->init_decompress_ctx(dic);
+ if (ret)
+ goto out_free_dic;
+ }
+
dic->rbuf = vmap(dic->tpages, dic->cluster_size, VM_MAP, PAGE_KERNEL);
if (!dic->rbuf) {
ret = -ENOMEM;
- goto out_free_dic;
+ goto destroy_decompress_ctx;
}
dic->cbuf = vmap(dic->cpages, dic->nr_cpages, VM_MAP, PAGE_KERNEL_RO);
@@ -473,7 +653,12 @@ out_vunmap_cbuf:
vunmap(dic->cbuf);
out_vunmap_rbuf:
vunmap(dic->rbuf);
+destroy_decompress_ctx:
+ if (cops->destroy_decompress_ctx)
+ cops->destroy_decompress_ctx(dic);
out_free_dic:
+ if (verity)
+ refcount_set(&dic->ref, dic->nr_cpages);
if (!verity)
f2fs_decompress_end_io(dic->rpages, dic->cluster_size,
ret, false);
@@ -532,8 +717,7 @@ static bool __cluster_may_compress(struct compress_ctx *cc)
return true;
}
-/* return # of compressed block addresses */
-static int f2fs_compressed_blocks(struct compress_ctx *cc)
+static int __f2fs_cluster_blocks(struct compress_ctx *cc, bool compr)
{
struct dnode_of_data dn;
int ret;
@@ -554,10 +738,15 @@ static int f2fs_compressed_blocks(struct compress_ctx *cc)
for (i = 1; i < cc->cluster_size; i++) {
block_t blkaddr;
- blkaddr = datablock_addr(dn.inode,
+ blkaddr = data_blkaddr(dn.inode,
dn.node_page, dn.ofs_in_node + i);
- if (blkaddr != NULL_ADDR)
- ret++;
+ if (compr) {
+ if (__is_valid_data_blkaddr(blkaddr))
+ ret++;
+ } else {
+ if (blkaddr != NULL_ADDR)
+ ret++;
+ }
}
}
fail:
@@ -565,6 +754,18 @@ fail:
return ret;
}
+/* return # of compressed blocks in compressed cluster */
+static int f2fs_compressed_blocks(struct compress_ctx *cc)
+{
+ return __f2fs_cluster_blocks(cc, true);
+}
+
+/* return # of valid blocks in compressed cluster */
+static int f2fs_cluster_blocks(struct compress_ctx *cc, bool compr)
+{
+ return __f2fs_cluster_blocks(cc, false);
+}
+
int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index)
{
struct compress_ctx cc = {
@@ -574,7 +775,7 @@ int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index)
.cluster_idx = index >> F2FS_I(inode)->i_log_cluster_size,
};
- return f2fs_compressed_blocks(&cc);
+ return f2fs_cluster_blocks(&cc, false);
}
static bool cluster_may_compress(struct compress_ctx *cc)
@@ -623,7 +824,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc,
bool prealloc;
retry:
- ret = f2fs_compressed_blocks(cc);
+ ret = f2fs_cluster_blocks(cc, false);
if (ret <= 0)
return ret;
@@ -653,7 +854,7 @@ retry:
struct bio *bio = NULL;
ret = f2fs_read_multi_pages(cc, &bio, cc->cluster_size,
- &last_block_in_bio, false);
+ &last_block_in_bio, false, true);
f2fs_destroy_compress_ctx(cc);
if (ret)
goto release_pages;
@@ -772,7 +973,6 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
.encrypted_page = NULL,
.compressed_page = NULL,
.submitted = false,
- .need_lock = LOCK_RETRY,
.io_type = io_type,
.io_wbc = wbc,
.encrypted = f2fs_encrypted_file(cc->inode),
@@ -785,16 +985,17 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
loff_t psize;
int i, err;
- set_new_dnode(&dn, cc->inode, NULL, NULL, 0);
+ if (!f2fs_trylock_op(sbi))
+ return -EAGAIN;
- f2fs_lock_op(sbi);
+ set_new_dnode(&dn, cc->inode, NULL, NULL, 0);
err = f2fs_get_dnode_of_data(&dn, start_idx, LOOKUP_NODE);
if (err)
goto out_unlock_op;
for (i = 0; i < cc->cluster_size; i++) {
- if (datablock_addr(dn.inode, dn.node_page,
+ if (data_blkaddr(dn.inode, dn.node_page,
dn.ofs_in_node + i) == NULL_ADDR)
goto out_put_dnode;
}
@@ -813,7 +1014,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
cic->magic = F2FS_COMPRESSED_PAGE_MAGIC;
cic->inode = inode;
- refcount_set(&cic->ref, 1);
+ refcount_set(&cic->ref, cc->nr_cpages);
cic->rpages = f2fs_kzalloc(sbi, sizeof(struct page *) <<
cc->log_cluster_size, GFP_NOFS);
if (!cic->rpages)
@@ -823,8 +1024,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
for (i = 0; i < cc->nr_cpages; i++) {
f2fs_set_compressed_page(cc->cpages[i], inode,
- cc->rpages[i + 1]->index,
- cic, i ? &cic->ref : NULL);
+ cc->rpages[i + 1]->index, cic);
fio.compressed_page = cc->cpages[i];
if (fio.encrypted) {
fio.page = cc->rpages[i + 1];
@@ -843,9 +1043,8 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc,
for (i = 0; i < cc->cluster_size; i++, dn.ofs_in_node++) {
block_t blkaddr;
- blkaddr = datablock_addr(dn.inode, dn.node_page,
- dn.ofs_in_node);
- fio.page = cic->rpages[i];
+ blkaddr = f2fs_data_blkaddr(&dn);
+ fio.page = cc->rpages[i];
fio.old_blkaddr = blkaddr;
/* cluster header */
@@ -895,10 +1094,10 @@ unlock_continue:
f2fs_put_dnode(&dn);
f2fs_unlock_op(sbi);
- down_write(&fi->i_sem);
+ spin_lock(&fi->i_size_lock);
if (fi->last_disk_size < psize)
fi->last_disk_size = psize;
- up_write(&fi->i_sem);
+ spin_unlock(&fi->i_size_lock);
f2fs_put_rpages(cc);
f2fs_destroy_compress_ctx(cc);
@@ -984,24 +1183,30 @@ retry_write:
unlock_page(cc->rpages[i]);
ret = 0;
} else if (ret == -EAGAIN) {
+ /*
+ * for quota files, just redirty the remaining pages to
+ * avoid a deadlock caused by a cluster update race with
+ * foreground operations.
+ */
+ if (IS_NOQUOTA(cc->inode)) {
+ err = 0;
+ goto out_err;
+ }
ret = 0;
cond_resched();
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC,
+ DEFAULT_IO_TIMEOUT);
lock_page(cc->rpages[i]);
clear_page_dirty_for_io(cc->rpages[i]);
goto retry_write;
}
err = ret;
- goto out_fail;
+ goto out_err;
}
*submitted += _submitted;
}
return 0;
-
-out_fail:
- /* TODO: revoke partially updated block addresses */
- BUG_ON(compr_blocks);
out_err:
for (++i; i < cc->cluster_size; i++) {
if (!cc->rpages[i])
@@ -1069,7 +1274,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
dic->magic = F2FS_COMPRESSED_PAGE_MAGIC;
dic->inode = cc->inode;
- refcount_set(&dic->ref, 1);
+ refcount_set(&dic->ref, cc->nr_cpages);
dic->cluster_idx = cc->cluster_idx;
dic->cluster_size = cc->cluster_size;
dic->log_cluster_size = cc->log_cluster_size;
@@ -1093,8 +1298,7 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
goto out_free;
f2fs_set_compressed_page(page, cc->inode,
- start_idx + i + 1,
- dic, i ? &dic->ref : NULL);
+ start_idx + i + 1, dic);
dic->cpages[i] = page;
}
@@ -1104,20 +1308,16 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc)
goto out_free;
for (i = 0; i < dic->cluster_size; i++) {
- if (cc->rpages[i])
+ if (cc->rpages[i]) {
+ dic->tpages[i] = cc->rpages[i];
continue;
+ }
dic->tpages[i] = f2fs_grab_page();
if (!dic->tpages[i])
goto out_free;
}
- for (i = 0; i < dic->cluster_size; i++) {
- if (dic->tpages[i])
- continue;
- dic->tpages[i] = cc->rpages[i];
- }
-
return dic;
out_free:
@@ -1133,7 +1333,10 @@ void f2fs_free_dic(struct decompress_io_ctx *dic)
for (i = 0; i < dic->cluster_size; i++) {
if (dic->rpages[i])
continue;
- f2fs_put_page(dic->tpages[i], 1);
+ if (!dic->tpages[i])
+ continue;
+ unlock_page(dic->tpages[i]);
+ put_page(dic->tpages[i]);
}
kfree(dic->tpages);
}
@@ -1162,15 +1365,17 @@ void f2fs_decompress_end_io(struct page **rpages,
if (!rpage)
continue;
- if (err || PageError(rpage)) {
- ClearPageUptodate(rpage);
- ClearPageError(rpage);
- } else {
- if (!verity || fsverity_verify_page(rpage))
- SetPageUptodate(rpage);
- else
- SetPageError(rpage);
+ if (err || PageError(rpage))
+ goto clear_uptodate;
+
+ if (!verity || fsverity_verify_page(rpage)) {
+ SetPageUptodate(rpage);
+ goto unlock;
}
+clear_uptodate:
+ ClearPageUptodate(rpage);
+ ClearPageError(rpage);
+unlock:
unlock_page(rpage);
}
}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index b27b72107911..cdf2f626bea7 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -54,17 +54,13 @@ static inline struct bio *__f2fs_bio_alloc(gfp_t gfp_mask,
return bio_alloc_bioset(gfp_mask, nr_iovecs, &f2fs_bioset);
}
-struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool no_fail)
+struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio)
{
- struct bio *bio;
-
- if (no_fail) {
+ if (noio) {
/* No failure on bio allocation */
- bio = __f2fs_bio_alloc(GFP_NOIO, npages);
- if (!bio)
- bio = __f2fs_bio_alloc(GFP_NOIO | __GFP_NOFAIL, npages);
- return bio;
+ return __f2fs_bio_alloc(GFP_NOIO, npages);
}
+
if (time_to_inject(sbi, FAULT_ALLOC_BIO)) {
f2fs_show_injection_info(sbi, FAULT_ALLOC_BIO);
return NULL;
@@ -143,6 +139,8 @@ static void __read_end_io(struct bio *bio, bool compr, bool verity)
f2fs_decompress_pages(bio, page, verity);
continue;
}
+ if (verity)
+ continue;
#endif
/* PG_error was set if any post_read step failed */
@@ -191,12 +189,38 @@ static void f2fs_verify_pages(struct page **rpages, unsigned int cluster_size)
static void f2fs_verify_bio(struct bio *bio)
{
- struct page *page = bio_first_page_all(bio);
- struct decompress_io_ctx *dic =
- (struct decompress_io_ctx *)page_private(page);
+ struct bio_vec *bv;
+ struct bvec_iter_all iter_all;
- f2fs_verify_pages(dic->rpages, dic->cluster_size);
- f2fs_free_dic(dic);
+ bio_for_each_segment_all(bv, bio, iter_all) {
+ struct page *page = bv->bv_page;
+ struct decompress_io_ctx *dic;
+
+ dic = (struct decompress_io_ctx *)page_private(page);
+
+ if (dic) {
+ if (refcount_dec_not_one(&dic->ref))
+ continue;
+ f2fs_verify_pages(dic->rpages,
+ dic->cluster_size);
+ f2fs_free_dic(dic);
+ continue;
+ }
+
+ if (bio->bi_status || PageError(page))
+ goto clear_uptodate;
+
+ if (fsverity_verify_page(page)) {
+ SetPageUptodate(page);
+ goto unlock;
+ }
+clear_uptodate:
+ ClearPageUptodate(page);
+ ClearPageError(page);
+unlock:
+ dec_page_count(F2FS_P_SB(page), __read_io_type(page));
+ unlock_page(page);
+ }
}
#endif
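
This rewrite pairs with the refcount changes in compress.c: dic->ref now starts
at the number of compressed pages instead of 1, each page completion drops one
reference, and only the last completion runs the cluster-wide verification and
frees the dic. In shorthand (an illustration of the pattern; the final helper
name is hypothetical):

	refcount_set(&dic->ref, dic->nr_cpages);	/* once per cluster */

	/* per compressed page, at completion time */
	if (refcount_dec_not_one(&dic->ref))
		return;				/* not the last page yet */
	verify_and_free_cluster(dic);		/* last page does the real work */
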
@@ -364,9 +388,6 @@ static void f2fs_write_end_io(struct bio *bio)
bio_put(bio);
}
-/*
- * Return true, if pre_bio's bdev is same as its target device.
- */
struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi,
block_t blk_addr, struct bio *bio)
{
@@ -403,6 +424,9 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr)
return 0;
}
+/*
+ * Return true if pre_bio's bdev is the same as its target device.
+ */
static bool __same_bdev(struct f2fs_sb_info *sbi,
block_t blk_addr, struct bio *bio)
{
@@ -410,9 +434,6 @@ static bool __same_bdev(struct f2fs_sb_info *sbi,
return bio->bi_disk == b->bd_disk && bio->bi_partno == b->bd_partno;
}
-/*
- * Low-level block read/write IO operations.
- */
static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages)
{
struct f2fs_sb_info *sbi = fio->sbi;
@@ -445,7 +466,7 @@ static inline void __submit_bio(struct f2fs_sb_info *sbi,
if (type != DATA && type != NODE)
goto submit_io;
- if (test_opt(sbi, LFS) && current->plug)
+ if (f2fs_lfs_mode(sbi) && current->plug)
blk_finish_plug(current->plug);
if (F2FS_IO_ALIGNED(sbi))
@@ -928,14 +949,15 @@ static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx)
static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
unsigned nr_pages, unsigned op_flag,
- pgoff_t first_idx)
+ pgoff_t first_idx, bool for_write)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct bio *bio;
struct bio_post_read_ctx *ctx;
unsigned int post_read_steps = 0;
- bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), false);
+ bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES),
+ for_write);
if (!bio)
return ERR_PTR(-ENOMEM);
f2fs_target_device(sbi, blkaddr, bio);
@@ -970,12 +992,12 @@ static void f2fs_release_read_bio(struct bio *bio)
/* This can handle encryption stuffs */
static int f2fs_submit_page_read(struct inode *inode, struct page *page,
- block_t blkaddr)
+ block_t blkaddr, bool for_write)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct bio *bio;
- bio = f2fs_grab_read_bio(inode, blkaddr, 1, 0, page->index);
+ bio = f2fs_grab_read_bio(inode, blkaddr, 1, 0, page->index, for_write);
if (IS_ERR(bio))
return PTR_ERR(bio);
@@ -1047,8 +1069,7 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count)
f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true);
for (; count > 0; dn->ofs_in_node++) {
- block_t blkaddr = datablock_addr(dn->inode,
- dn->node_page, dn->ofs_in_node);
+ block_t blkaddr = f2fs_data_blkaddr(dn);
if (blkaddr == NULL_ADDR) {
dn->data_blkaddr = NEW_ADDR;
__set_data_blkaddr(dn);
@@ -1162,7 +1183,7 @@ got_it:
return page;
}
- err = f2fs_submit_page_read(inode, page, dn.data_blkaddr);
+ err = f2fs_submit_page_read(inode, page, dn.data_blkaddr, for_write);
if (err)
goto put_err;
return page;
@@ -1300,8 +1321,7 @@ static int __allocate_data_block(struct dnode_of_data *dn, int seg_type)
if (err)
return err;
- dn->data_blkaddr = datablock_addr(dn->inode,
- dn->node_page, dn->ofs_in_node);
+ dn->data_blkaddr = f2fs_data_blkaddr(dn);
if (dn->data_blkaddr != NULL_ADDR)
goto alloc;
@@ -1388,13 +1408,9 @@ void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock)
}
/*
- * f2fs_map_blocks() now supported readahead/bmap/rw direct_IO with
- * f2fs_map_blocks structure.
- * If original data blocks are allocated, then give them to blockdev.
- * Otherwise,
- * a. preallocate requested block addresses
- * b. do not use extent cache for better performance
- * c. give the block addresses to blockdev
+ * f2fs_map_blocks() tries to find or build a mapping relationship that
+ * maps contiguous logical blocks to physical blocks, and returns the
+ * result via the f2fs_map_blocks structure.
*/
int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
int create, int flag)
@@ -1422,7 +1438,7 @@ int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
end = pgofs + maxblocks;
if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) {
- if (test_opt(sbi, LFS) && flag == F2FS_GET_BLOCK_DIO &&
+ if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO &&
map->m_may_create)
goto next_dnode;
@@ -1467,7 +1483,7 @@ next_dnode:
end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
next_block:
- blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node);
+ blkaddr = f2fs_data_blkaddr(&dn);
if (__is_valid_data_blkaddr(blkaddr) &&
!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) {
@@ -1477,7 +1493,7 @@ next_block:
if (__is_valid_data_blkaddr(blkaddr)) {
/* use out-of-place update for direct IO under LFS mode */
- if (test_opt(sbi, LFS) && flag == F2FS_GET_BLOCK_DIO &&
+ if (f2fs_lfs_mode(sbi) && flag == F2FS_GET_BLOCK_DIO &&
map->m_may_create) {
err = __allocate_data_block(&dn, map->m_seg_type);
if (err)
@@ -1980,7 +1996,8 @@ submit_and_realloc:
}
if (bio == NULL) {
bio = f2fs_grab_read_bio(inode, block_nr, nr_pages,
- is_readahead ? REQ_RAHEAD : 0, page->index);
+ is_readahead ? REQ_RAHEAD : 0, page->index,
+ false);
if (IS_ERR(bio)) {
ret = PTR_ERR(bio);
bio = NULL;
@@ -2015,7 +2032,7 @@ out:
#ifdef CONFIG_F2FS_FS_COMPRESSION
int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
unsigned nr_pages, sector_t *last_block_in_bio,
- bool is_readahead)
+ bool is_readahead, bool for_write)
{
struct dnode_of_data dn;
struct inode *inode = cc->inode;
@@ -2031,7 +2048,8 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
f2fs_bug_on(sbi, f2fs_cluster_is_empty(cc));
- last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits;
+ last_block_in_file = (f2fs_readpage_limit(inode) +
+ blocksize - 1) >> blkbits;
/* get rid of pages beyond EOF */
for (i = 0; i < cc->cluster_size; i++) {
@@ -2067,7 +2085,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
for (i = 1; i < cc->cluster_size; i++) {
block_t blkaddr;
- blkaddr = datablock_addr(dn.inode, dn.node_page,
+ blkaddr = data_blkaddr(dn.inode, dn.node_page,
dn.ofs_in_node + i);
if (!__is_valid_data_blkaddr(blkaddr))
@@ -2096,7 +2114,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
struct page *page = dic->cpages[i];
block_t blkaddr;
- blkaddr = datablock_addr(dn.inode, dn.node_page,
+ blkaddr = data_blkaddr(dn.inode, dn.node_page,
dn.ofs_in_node + i + 1);
if (bio && !page_is_mergeable(sbi, bio,
@@ -2109,7 +2127,7 @@ submit_and_realloc:
if (!bio) {
bio = f2fs_grab_read_bio(inode, blkaddr, nr_pages,
is_readahead ? REQ_RAHEAD : 0,
- page->index);
+ page->index, for_write);
if (IS_ERR(bio)) {
ret = PTR_ERR(bio);
bio = NULL;
@@ -2210,7 +2228,7 @@ int f2fs_mpage_readpages(struct address_space *mapping,
ret = f2fs_read_multi_pages(&cc, &bio,
max_nr_pages,
&last_block_in_bio,
- is_readahead);
+ is_readahead, false);
f2fs_destroy_compress_ctx(&cc);
if (ret)
goto set_error_page;
@@ -2253,7 +2271,7 @@ next_page:
ret = f2fs_read_multi_pages(&cc, &bio,
max_nr_pages,
&last_block_in_bio,
- is_readahead);
+ is_readahead, false);
f2fs_destroy_compress_ctx(&cc);
}
}
@@ -2326,7 +2344,7 @@ retry_encrypt:
/* flush pending IOs and wait for a while in the ENOMEM case */
if (PTR_ERR(fio->encrypted_page) == -ENOMEM) {
f2fs_flush_merged_writes(fio->sbi);
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
gfp_flags |= __GFP_NOFAIL;
goto retry_encrypt;
}
@@ -2397,7 +2415,7 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio)
{
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
- if (test_opt(sbi, LFS))
+ if (f2fs_lfs_mode(sbi))
return true;
if (S_ISDIR(inode->i_mode))
return true;
@@ -2647,10 +2665,10 @@ write:
if (err) {
file_set_keep_isize(inode);
} else {
- down_write(&F2FS_I(inode)->i_sem);
+ spin_lock(&F2FS_I(inode)->i_size_lock);
if (F2FS_I(inode)->last_disk_size < psize)
F2FS_I(inode)->last_disk_size = psize;
- up_write(&F2FS_I(inode)->i_sem);
+ spin_unlock(&F2FS_I(inode)->i_size_lock);
}
done:
@@ -2917,7 +2935,7 @@ result:
if (wbc->sync_mode == WB_SYNC_ALL) {
cond_resched();
congestion_wait(BLK_RW_ASYNC,
- HZ/50);
+ DEFAULT_IO_TIMEOUT);
goto retry_write;
}
goto next;
@@ -2973,15 +2991,17 @@ next:
static inline bool __should_serialize_io(struct inode *inode,
struct writeback_control *wbc)
{
+ /* to avoid deadlock in path of data flush */
+ if (F2FS_I(inode)->cp_task)
+ return false;
+
if (!S_ISREG(inode->i_mode))
return false;
- if (f2fs_compressed_file(inode))
- return true;
if (IS_NOQUOTA(inode))
return false;
- /* to avoid deadlock in path of data flush */
- if (F2FS_I(inode)->cp_task)
- return false;
+
+ if (f2fs_compressed_file(inode))
+ return true;
if (wbc->sync_mode != WB_SYNC_ALL)
return true;
if (get_dirty_pages(inode) >= SM_I(F2FS_I_SB(inode))->min_seq_blocks)
@@ -3283,7 +3303,7 @@ repeat:
err = -EFSCORRUPTED;
goto fail;
}
- err = f2fs_submit_page_read(inode, page, blkaddr);
+ err = f2fs_submit_page_read(inode, page, blkaddr, true);
if (err)
goto fail;
@@ -3464,7 +3484,8 @@ static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
err = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
iter, rw == WRITE ? get_data_block_dio_write :
get_data_block_dio, NULL, f2fs_dio_submit_bio,
- DIO_LOCKING | DIO_SKIP_HOLES);
+ rw == WRITE ? DIO_LOCKING | DIO_SKIP_HOLES :
+ DIO_SKIP_HOLES);
if (do_opu)
up_read(&fi->i_gc_rwsem[READ]);
@@ -3861,7 +3882,7 @@ void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi)
int __init f2fs_init_bio_entry_cache(void)
{
- bio_entry_slab = f2fs_kmem_cache_create("bio_entry_slab",
+ bio_entry_slab = f2fs_kmem_cache_create("f2fs_bio_entry_slab",
sizeof(struct bio_entry));
if (!bio_entry_slab)
return -ENOMEM;
diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c
index 6b89eae5e4ca..0dbcb0f9c019 100644
--- a/fs/f2fs/debug.c
+++ b/fs/f2fs/debug.c
@@ -301,6 +301,9 @@ static int stat_show(struct seq_file *s, void *v)
si->ssa_area_segs, si->main_area_segs);
seq_printf(s, "(OverProv:%d Resv:%d)]\n\n",
si->overp_segs, si->rsvd_segs);
+ seq_printf(s, "Current Time Sec: %llu / Mounted Time Sec: %llu\n\n",
+ ktime_get_boottime_seconds(),
+ SIT_I(si->sbi)->mounted_time);
if (test_opt(si->sbi, DISCARD))
seq_printf(s, "Utilization: %u%% (%u valid blocks, %u discard blocks)\n",
si->utilization, si->valid_count, si->discard_blks);
diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c
index 27d0dd7a16d6..44bfc464df78 100644
--- a/fs/f2fs/dir.c
+++ b/fs/f2fs/dir.c
@@ -471,7 +471,6 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir,
struct page *dpage)
{
struct page *page;
- int dummy_encrypt = DUMMY_ENCRYPTION_ENABLED(F2FS_I_SB(dir));
int err;
if (is_inode_flag_set(inode, FI_NEW_INODE)) {
@@ -498,8 +497,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir,
if (err)
goto put_error;
- if ((IS_ENCRYPTED(dir) || dummy_encrypt) &&
- f2fs_may_encrypt(inode)) {
+ if (IS_ENCRYPTED(inode)) {
err = fscrypt_inherit_context(dir, inode, page, false);
if (err)
goto put_error;
@@ -850,12 +848,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
0);
set_page_dirty(page);
- dir->i_ctime = dir->i_mtime = current_time(dir);
- f2fs_mark_inode_dirty_sync(dir, false);
-
- if (inode)
- f2fs_drop_nlink(dir, inode);
-
if (bit_pos == NR_DENTRY_IN_BLOCK &&
!f2fs_truncate_hole(dir, page->index, page->index + 1)) {
f2fs_clear_page_cache_dirty_tag(page);
@@ -867,6 +859,12 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page,
f2fs_remove_dirty_inode(dir);
}
f2fs_put_page(page, 1);
+
+ dir->i_ctime = dir->i_mtime = current_time(dir);
+ f2fs_mark_inode_dirty_sync(dir, false);
+
+ if (inode)
+ f2fs_drop_nlink(dir, inode);
}
bool f2fs_empty_dir(struct inode *dir)
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 088c3e7a1080..ba470d5687fe 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -75,7 +75,6 @@ extern const char *f2fs_fault_name[FAULT_MAX];
/*
* For mount options
*/
-#define F2FS_MOUNT_BG_GC 0x00000001
#define F2FS_MOUNT_DISABLE_ROLL_FORWARD 0x00000002
#define F2FS_MOUNT_DISCARD 0x00000004
#define F2FS_MOUNT_NOHEAP 0x00000008
@@ -89,11 +88,8 @@ extern const char *f2fs_fault_name[FAULT_MAX];
#define F2FS_MOUNT_NOBARRIER 0x00000800
#define F2FS_MOUNT_FASTBOOT 0x00001000
#define F2FS_MOUNT_EXTENT_CACHE 0x00002000
-#define F2FS_MOUNT_FORCE_FG_GC 0x00004000
#define F2FS_MOUNT_DATA_FLUSH 0x00008000
#define F2FS_MOUNT_FAULT_INJECTION 0x00010000
-#define F2FS_MOUNT_ADAPTIVE 0x00020000
-#define F2FS_MOUNT_LFS 0x00040000
#define F2FS_MOUNT_USRQUOTA 0x00080000
#define F2FS_MOUNT_GRPQUOTA 0x00100000
#define F2FS_MOUNT_PRJQUOTA 0x00200000
@@ -101,6 +97,7 @@ extern const char *f2fs_fault_name[FAULT_MAX];
#define F2FS_MOUNT_INLINE_XATTR_SIZE 0x00800000
#define F2FS_MOUNT_RESERVE_ROOT 0x01000000
#define F2FS_MOUNT_DISABLE_CHECKPOINT 0x02000000
+#define F2FS_MOUNT_NORECOVERY 0x04000000
#define F2FS_OPTION(sbi) ((sbi)->mount_opt)
#define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option)
@@ -139,6 +136,8 @@ struct f2fs_mount_info {
int whint_mode;
int alloc_mode; /* segment allocation policy */
int fsync_mode; /* fsync policy */
+ int fs_mode; /* fs mode: LFS or ADAPTIVE */
+ int bggc_mode; /* bggc mode: off, on or sync */
bool test_dummy_encryption; /* test dummy encryption */
block_t unusable_cap; /* Amount of space allowed to be
* unusable when disabling checkpoint
@@ -332,8 +331,8 @@ struct discard_policy {
bool io_aware; /* issue discard in idle time */
bool sync; /* submit discard with REQ_SYNC flag */
bool ordered; /* issue discard by lba order */
+ bool timeout; /* discard timeout for put_super */
unsigned int granularity; /* discard granularity */
- int timeout; /* discard timeout for put_super */
};
struct discard_cmd_control {
@@ -428,6 +427,7 @@ static inline bool __has_cursum_space(struct f2fs_journal *journal,
#define F2FS_IOC_GET_PIN_FILE _IOR(F2FS_IOCTL_MAGIC, 14, __u32)
#define F2FS_IOC_PRECACHE_EXTENTS _IO(F2FS_IOCTL_MAGIC, 15)
#define F2FS_IOC_RESIZE_FS _IOW(F2FS_IOCTL_MAGIC, 16, __u64)
+#define F2FS_IOC_GET_COMPRESS_BLOCKS _IOR(F2FS_IOCTL_MAGIC, 17, __u64)
#define F2FS_IOC_GET_VOLUME_NAME FS_IOC_GETFSLABEL
#define F2FS_IOC_SET_VOLUME_NAME FS_IOC_SETFSLABEL
@@ -560,6 +560,9 @@ enum {
#define DEFAULT_RETRY_IO_COUNT 8 /* maximum retry read IO count */
+/* congestion wait timeout value, default: 20ms */
+#define DEFAULT_IO_TIMEOUT (msecs_to_jiffies(20))
+
/* maximum retry quota flush count */
#define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8
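
A quick worked check on the congestion_wait() conversions throughout this series
(illustrative arithmetic only): with CONFIG_HZ=250 the old HZ/50 is 5 jiffies,
i.e. 20 ms, and msecs_to_jiffies(20) yields the same 5 jiffies; the macro simply
states the interval in milliseconds instead of leaving the reader to divide HZ.
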
@@ -676,6 +679,44 @@ enum {
MAX_GC_FAILURE
};
+/* used for f2fs_inode_info->flags */
+enum {
+ FI_NEW_INODE, /* indicate newly allocated inode */
+ FI_DIRTY_INODE, /* indicate inode is dirty or not */
+ FI_AUTO_RECOVER, /* indicate inode is recoverable */
+ FI_DIRTY_DIR, /* indicate directory has dirty pages */
+ FI_INC_LINK, /* need to increment i_nlink */
+ FI_ACL_MODE, /* indicate acl mode */
+ FI_NO_ALLOC, /* should not allocate any blocks */
+ FI_FREE_NID, /* free allocated nid */
+ FI_NO_EXTENT, /* not to use the extent cache */
+ FI_INLINE_XATTR, /* used for inline xattr */
+ FI_INLINE_DATA, /* used for inline data*/
+ FI_INLINE_DENTRY, /* used for inline dentry */
+ FI_APPEND_WRITE, /* inode has appended data */
+ FI_UPDATE_WRITE, /* inode has in-place-update data */
+ FI_NEED_IPU, /* used for ipu per file */
+ FI_ATOMIC_FILE, /* indicate atomic file */
+ FI_ATOMIC_COMMIT, /* indicate the state of atomic committing */
+ FI_VOLATILE_FILE, /* indicate volatile file */
+ FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */
+ FI_DROP_CACHE, /* drop dirty page cache */
+ FI_DATA_EXIST, /* indicate data exists */
+ FI_INLINE_DOTS, /* indicate inline dot dentries */
+ FI_DO_DEFRAG, /* indicate defragment is running */
+ FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */
+ FI_NO_PREALLOC, /* indicate skipped preallocated blocks */
+ FI_HOT_DATA, /* indicate file is hot */
+ FI_EXTRA_ATTR, /* indicate file has extra attribute */
+ FI_PROJ_INHERIT, /* indicate file inherits projectid */
+ FI_PIN_FILE, /* indicate file should not be gced */
+ FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */
+ FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */
+ FI_COMPRESSED_FILE, /* indicate file's data can be compressed */
+ FI_MMAP_FILE, /* indicate file was mmapped */
+ FI_MAX, /* max flag, never be used */
+};
+
struct f2fs_inode_info {
struct inode vfs_inode; /* serve a vfs inode */
unsigned long i_flags; /* keep an inode flags for ioctl */
@@ -688,7 +729,7 @@ struct f2fs_inode_info {
umode_t i_acl_mode; /* keep file acl mode temporarily */
/* Use below internally in f2fs*/
- unsigned long flags; /* use to pass per-file flags */
+ unsigned long flags[BITS_TO_LONGS(FI_MAX)]; /* use to pass per-file flags */
struct rw_semaphore i_sem; /* protect fi info */
atomic_t dirty_pages; /* # of dirty pages */
f2fs_hash_t chash; /* hash value of given file name */
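
The flags field becomes a bitmap because the FI_* enum above now carries 33
entries (FI_MAX), one more than a single unsigned long can hold on 32-bit
builds; BITS_TO_LONGS(FI_MAX) sizes the array to 2 longs there and 1 on 64-bit.
The accessors further down therefore pass the array itself instead of
&fi->flags, the usual kernel bitmap pattern (sketch for illustration only):

	unsigned long flags[BITS_TO_LONGS(FI_MAX)];  /* == DECLARE_BITMAP(flags, FI_MAX) */

	set_bit(FI_COMPRESSED_FILE, flags);
	if (test_bit(FI_COMPRESSED_FILE, flags))
		clear_bit(FI_COMPRESSED_FILE, flags);
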
@@ -697,6 +738,7 @@ struct f2fs_inode_info {
struct task_struct *cp_task; /* separate cp/wb IO stats*/
nid_t i_xattr_nid; /* node id that contains xattrs */
loff_t last_disk_size; /* lastly written file size */
+ spinlock_t i_size_lock; /* protect last_disk_size */
#ifdef CONFIG_QUOTA
struct dquot *i_dquot[MAXQUOTAS];
@@ -1173,6 +1215,20 @@ enum {
};
enum {
+ BGGC_MODE_ON, /* background gc is on */
+ BGGC_MODE_OFF, /* background gc is off */
+ BGGC_MODE_SYNC, /*
+ * background gc is on, migrating blocks
+ * like foreground gc
+ */
+};
+
+enum {
+ FS_MODE_ADAPTIVE, /* use both lfs/ssr allocation */
+ FS_MODE_LFS, /* use lfs allocation only */
+};
+
+enum {
WHINT_MODE_OFF, /* not pass down write hints */
WHINT_MODE_USER, /* try to pass down hints given by users */
WHINT_MODE_FS, /* pass down hints with F2FS policy */
@@ -1212,13 +1268,13 @@ enum fsync_mode {
enum compress_algorithm_type {
COMPRESS_LZO,
COMPRESS_LZ4,
+ COMPRESS_ZSTD,
COMPRESS_MAX,
};
-#define COMPRESS_DATA_RESERVED_SIZE 4
+#define COMPRESS_DATA_RESERVED_SIZE 5
struct compress_data {
__le32 clen; /* compressed data size */
- __le32 chksum; /* checksum of compressed data */
__le32 reserved[COMPRESS_DATA_RESERVED_SIZE]; /* reserved */
u8 cdata[]; /* compressed data */
};
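
Dropping chksum while bumping COMPRESS_DATA_RESERVED_SIZE from 4 to 5 keeps the
on-disk header layout stable (arithmetic shown for illustration): the old header
was clen (4) + chksum (4) + 4 * 4 reserved = 24 bytes, the new one is
clen (4) + 5 * 4 reserved = 24 bytes, so cdata[] starts at the same offset and
COMPRESS_HEADER_SIZE is unchanged.
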
@@ -1242,6 +1298,7 @@ struct compress_ctx {
size_t rlen; /* valid data length in rbuf */
size_t clen; /* valid data length in cbuf */
void *private; /* payload buffer for specified compression algorithm */
+ void *private2; /* extra payload buffer */
};
/* compress context for write IO path */
@@ -1271,11 +1328,14 @@ struct decompress_io_ctx {
size_t clen; /* valid data length in cbuf */
refcount_t ref; /* reference count of compressed page */
bool failed; /* indicate IO error during decompression */
+ void *private; /* payload buffer for specified decompression algorithm */
+ void *private2; /* extra payload buffer */
};
#define NULL_CLUSTER ((unsigned int)(~0))
#define MIN_COMPRESS_LOG_SIZE 2
#define MAX_COMPRESS_LOG_SIZE 8
+#define MAX_COMPRESS_WINDOW_SIZE ((PAGE_SIZE) << MAX_COMPRESS_LOG_SIZE)
struct f2fs_sb_info {
struct super_block *sb; /* pointer to VFS super block */
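
For scale (illustrative numbers): with 4 KiB pages,
MAX_COMPRESS_WINDOW_SIZE = PAGE_SIZE << MAX_COMPRESS_LOG_SIZE = 4096 << 8 = 1 MiB,
so the zstd decompression workspace sized via ZSTD_DStreamWorkspaceBound() in
compress.c covers the largest permissible cluster regardless of an inode's
actual log_cluster_size.
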
@@ -1471,6 +1531,9 @@ struct f2fs_sb_info {
__u32 s_chksum_seed;
struct workqueue_struct *post_read_wq; /* post read workqueue */
+
+ struct kmem_cache *inline_xattr_slab; /* inline xattr entry */
+ unsigned int inline_xattr_slab_size; /* default inline xattr slab size */
};
struct f2fs_private_dio {
@@ -2211,7 +2274,7 @@ static inline void dec_valid_node_count(struct f2fs_sb_info *sbi,
dquot_free_inode(inode);
} else {
if (unlikely(inode->i_blocks == 0)) {
- f2fs_warn(sbi, "Inconsistent i_blocks, ino:%lu, iblocks:%llu",
+ f2fs_warn(sbi, "dec_valid_node_count: inconsistent i_blocks, ino:%lu, iblocks:%llu",
inode->i_ino,
(unsigned long long)inode->i_blocks);
set_sbi_flag(sbi, SBI_NEED_FSCK);
@@ -2379,7 +2442,7 @@ static inline __le32 *blkaddr_in_node(struct f2fs_node *node)
}
static inline int f2fs_has_extra_attr(struct inode *inode);
-static inline block_t datablock_addr(struct inode *inode,
+static inline block_t data_blkaddr(struct inode *inode,
struct page *node_page, unsigned int offset)
{
struct f2fs_node *raw_node;
@@ -2389,9 +2452,9 @@ static inline block_t datablock_addr(struct inode *inode,
raw_node = F2FS_NODE(node_page);
- /* from GC path only */
if (is_inode) {
if (!inode)
+ /* from GC path only */
base = offset_in_addr(&raw_node->i);
else if (f2fs_has_extra_attr(inode))
base = get_extra_isize(inode);
@@ -2401,6 +2464,11 @@ static inline block_t datablock_addr(struct inode *inode,
return le32_to_cpu(addr_array[base + offset]);
}
+static inline block_t f2fs_data_blkaddr(struct dnode_of_data *dn)
+{
+ return data_blkaddr(dn->inode, dn->node_page, dn->ofs_in_node);
+}
+
static inline int f2fs_test_bit(unsigned int nr, char *addr)
{
int mask;
@@ -2498,43 +2566,6 @@ static inline __u32 f2fs_mask_flags(umode_t mode, __u32 flags)
return flags & F2FS_OTHER_FLMASK;
}
-/* used for f2fs_inode_info->flags */
-enum {
- FI_NEW_INODE, /* indicate newly allocated inode */
- FI_DIRTY_INODE, /* indicate inode is dirty or not */
- FI_AUTO_RECOVER, /* indicate inode is recoverable */
- FI_DIRTY_DIR, /* indicate directory has dirty pages */
- FI_INC_LINK, /* need to increment i_nlink */
- FI_ACL_MODE, /* indicate acl mode */
- FI_NO_ALLOC, /* should not allocate any blocks */
- FI_FREE_NID, /* free allocated nide */
- FI_NO_EXTENT, /* not to use the extent cache */
- FI_INLINE_XATTR, /* used for inline xattr */
- FI_INLINE_DATA, /* used for inline data*/
- FI_INLINE_DENTRY, /* used for inline dentry */
- FI_APPEND_WRITE, /* inode has appended data */
- FI_UPDATE_WRITE, /* inode has in-place-update data */
- FI_NEED_IPU, /* used for ipu per file */
- FI_ATOMIC_FILE, /* indicate atomic file */
- FI_ATOMIC_COMMIT, /* indicate the state of atomical committing */
- FI_VOLATILE_FILE, /* indicate volatile file */
- FI_FIRST_BLOCK_WRITTEN, /* indicate #0 data block was written */
- FI_DROP_CACHE, /* drop dirty page cache */
- FI_DATA_EXIST, /* indicate data exists */
- FI_INLINE_DOTS, /* indicate inline dot dentries */
- FI_DO_DEFRAG, /* indicate defragment is running */
- FI_DIRTY_FILE, /* indicate regular/symlink has dirty pages */
- FI_NO_PREALLOC, /* indicate skipped preallocated blocks */
- FI_HOT_DATA, /* indicate file is hot */
- FI_EXTRA_ATTR, /* indicate file has extra attribute */
- FI_PROJ_INHERIT, /* indicate file inherits projectid */
- FI_PIN_FILE, /* indicate file should not be gced */
- FI_ATOMIC_REVOKE_REQUEST, /* request to drop atomic data */
- FI_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */
- FI_COMPRESSED_FILE, /* indicate file's data can be compressed */
- FI_MMAP_FILE, /* indicate file was mmapped */
-};
-
static inline void __mark_inode_dirty_flag(struct inode *inode,
int flag, bool set)
{
@@ -2549,27 +2580,24 @@ static inline void __mark_inode_dirty_flag(struct inode *inode,
case FI_DATA_EXIST:
case FI_INLINE_DOTS:
case FI_PIN_FILE:
- case FI_COMPRESSED_FILE:
f2fs_mark_inode_dirty_sync(inode, true);
}
}
static inline void set_inode_flag(struct inode *inode, int flag)
{
- if (!test_bit(flag, &F2FS_I(inode)->flags))
- set_bit(flag, &F2FS_I(inode)->flags);
+ test_and_set_bit(flag, F2FS_I(inode)->flags);
__mark_inode_dirty_flag(inode, flag, true);
}
static inline int is_inode_flag_set(struct inode *inode, int flag)
{
- return test_bit(flag, &F2FS_I(inode)->flags);
+ return test_bit(flag, F2FS_I(inode)->flags);
}
static inline void clear_inode_flag(struct inode *inode, int flag)
{
- if (test_bit(flag, &F2FS_I(inode)->flags))
- clear_bit(flag, &F2FS_I(inode)->flags);
+ test_and_clear_bit(flag, F2FS_I(inode)->flags);
__mark_inode_dirty_flag(inode, flag, false);
}
@@ -2660,19 +2688,19 @@ static inline void get_inline_info(struct inode *inode, struct f2fs_inode *ri)
struct f2fs_inode_info *fi = F2FS_I(inode);
if (ri->i_inline & F2FS_INLINE_XATTR)
- set_bit(FI_INLINE_XATTR, &fi->flags);
+ set_bit(FI_INLINE_XATTR, fi->flags);
if (ri->i_inline & F2FS_INLINE_DATA)
- set_bit(FI_INLINE_DATA, &fi->flags);
+ set_bit(FI_INLINE_DATA, fi->flags);
if (ri->i_inline & F2FS_INLINE_DENTRY)
- set_bit(FI_INLINE_DENTRY, &fi->flags);
+ set_bit(FI_INLINE_DENTRY, fi->flags);
if (ri->i_inline & F2FS_DATA_EXIST)
- set_bit(FI_DATA_EXIST, &fi->flags);
+ set_bit(FI_DATA_EXIST, fi->flags);
if (ri->i_inline & F2FS_INLINE_DOTS)
- set_bit(FI_INLINE_DOTS, &fi->flags);
+ set_bit(FI_INLINE_DOTS, fi->flags);
if (ri->i_inline & F2FS_EXTRA_ATTR)
- set_bit(FI_EXTRA_ATTR, &fi->flags);
+ set_bit(FI_EXTRA_ATTR, fi->flags);
if (ri->i_inline & F2FS_PIN_FILE)
- set_bit(FI_PIN_FILE, &fi->flags);
+ set_bit(FI_PIN_FILE, fi->flags);
}
static inline void set_raw_inline(struct inode *inode, struct f2fs_inode *ri)
@@ -2857,9 +2885,9 @@ static inline bool f2fs_skip_inode_update(struct inode *inode, int dsync)
if (!f2fs_is_time_consistent(inode))
return false;
- down_read(&F2FS_I(inode)->i_sem);
+ spin_lock(&F2FS_I(inode)->i_size_lock);
ret = F2FS_I(inode)->last_disk_size == i_size_read(inode);
- up_read(&F2FS_I(inode)->i_sem);
+ spin_unlock(&F2FS_I(inode)->i_size_lock);
return ret;
}
@@ -3213,7 +3241,7 @@ void f2fs_drop_inmem_pages(struct inode *inode);
void f2fs_drop_inmem_page(struct inode *inode, struct page *page);
int f2fs_commit_inmem_pages(struct inode *inode);
void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need);
-void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi);
+void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg);
int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino);
int f2fs_create_flush_cmd_control(struct f2fs_sb_info *sbi);
int f2fs_flush_device_cache(struct f2fs_sb_info *sbi);
@@ -3309,7 +3337,7 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi);
void f2fs_update_dirty_page(struct inode *inode, struct page *page);
void f2fs_remove_dirty_inode(struct inode *inode);
int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type);
-void f2fs_wait_on_all_pages_writeback(struct f2fs_sb_info *sbi);
+void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type);
int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc);
void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi);
int __init f2fs_create_checkpoint_caches(void);
@@ -3320,7 +3348,7 @@ void f2fs_destroy_checkpoint_caches(void);
*/
int __init f2fs_init_bioset(void);
void f2fs_destroy_bioset(void);
-struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool no_fail);
+struct bio *f2fs_bio_alloc(struct f2fs_sb_info *sbi, int npages, bool noio);
int f2fs_init_bio_entry_cache(void);
void f2fs_destroy_bio_entry_cache(void);
void f2fs_submit_bio(struct f2fs_sb_info *sbi,
@@ -3776,7 +3804,7 @@ int f2fs_write_multi_pages(struct compress_ctx *cc,
int f2fs_is_compressed_cluster(struct inode *inode, pgoff_t index);
int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret,
unsigned nr_pages, sector_t *last_block_in_bio,
- bool is_readahead);
+ bool is_readahead, bool for_write);
struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc);
void f2fs_free_dic(struct decompress_io_ctx *dic);
void f2fs_decompress_end_io(struct page **rpages,
@@ -3813,6 +3841,7 @@ static inline void set_compress_context(struct inode *inode)
F2FS_I(inode)->i_flags |= F2FS_COMPR_FL;
set_inode_flag(inode, FI_COMPRESSED_FILE);
stat_inc_compr_inode(inode);
+ f2fs_mark_inode_dirty_sync(inode, true);
}
static inline u64 f2fs_disable_compressed_file(struct inode *inode)
@@ -3821,12 +3850,17 @@ static inline u64 f2fs_disable_compressed_file(struct inode *inode)
if (!f2fs_compressed_file(inode))
return 0;
- if (fi->i_compr_blocks)
- return fi->i_compr_blocks;
+ if (S_ISREG(inode->i_mode)) {
+ if (get_dirty_pages(inode))
+ return 1;
+ if (fi->i_compr_blocks)
+ return fi->i_compr_blocks;
+ }
fi->i_flags &= ~F2FS_COMPR_FL;
- clear_inode_flag(inode, FI_COMPRESSED_FILE);
stat_dec_compr_inode(inode);
+ clear_inode_flag(inode, FI_COMPRESSED_FILE);
+ f2fs_mark_inode_dirty_sync(inode, true);
return 0;
}
@@ -3903,31 +3937,25 @@ static inline bool f2fs_hw_is_readonly(struct f2fs_sb_info *sbi)
return false;
}
-
-static inline void set_opt_mode(struct f2fs_sb_info *sbi, unsigned int mt)
+static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi)
{
- clear_opt(sbi, ADAPTIVE);
- clear_opt(sbi, LFS);
-
- switch (mt) {
- case F2FS_MOUNT_ADAPTIVE:
- set_opt(sbi, ADAPTIVE);
- break;
- case F2FS_MOUNT_LFS:
- set_opt(sbi, LFS);
- break;
- }
+ return F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS;
}
-static inline bool f2fs_may_encrypt(struct inode *inode)
+static inline bool f2fs_may_encrypt(struct inode *dir, struct inode *inode)
{
#ifdef CONFIG_FS_ENCRYPTION
+ struct f2fs_sb_info *sbi = F2FS_I_SB(dir);
umode_t mode = inode->i_mode;
- return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode));
-#else
- return false;
+ /*
+ * If the directory is encrypted or dummy encryption is enabled,
+ * then we should encrypt the inode.
+ */
+ if (IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi))
+ return (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode));
#endif
+ return false;
}
static inline bool f2fs_may_compress(struct inode *inode)
@@ -3971,7 +3999,7 @@ static inline int allow_outplace_dio(struct inode *inode,
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
int rw = iov_iter_rw(iter);
- return (test_opt(sbi, LFS) && (rw == WRITE) &&
+ return (f2fs_lfs_mode(sbi) && (rw == WRITE) &&
!block_unaligned_IO(inode, iocb, iter));
}
@@ -3993,7 +4021,7 @@ static inline bool f2fs_force_buffered_io(struct inode *inode,
*/
if (f2fs_sb_has_blkzoned(sbi))
return true;
- if (test_opt(sbi, LFS) && (rw == WRITE)) {
+ if (f2fs_lfs_mode(sbi) && (rw == WRITE)) {
if (block_unaligned_IO(inode, iocb, iter))
return true;
if (F2FS_IO_ALIGNED(sbi))
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 351762f77840..6ab8f621a3c5 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -106,13 +106,20 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf)
err = f2fs_get_block(&dn, page->index);
f2fs_put_dnode(&dn);
__do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO, false);
- if (err) {
- unlock_page(page);
- goto out_sem;
- }
}
- /* fill the page */
+#ifdef CONFIG_F2FS_FS_COMPRESSION
+ if (!need_alloc) {
+ set_new_dnode(&dn, inode, NULL, NULL, 0);
+ err = f2fs_get_dnode_of_data(&dn, page->index, LOOKUP_NODE);
+ f2fs_put_dnode(&dn);
+ }
+#endif
+ if (err) {
+ unlock_page(page);
+ goto out_sem;
+ }
+
f2fs_wait_on_page_writeback(page, DATA, false, true);
/* wait for GCed page writeback via META_MAPPING */
@@ -448,8 +455,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence)
data_ofs = (loff_t)pgofs << PAGE_SHIFT) {
block_t blkaddr;
- blkaddr = datablock_addr(dn.inode,
- dn.node_page, dn.ofs_in_node);
+ blkaddr = f2fs_data_blkaddr(&dn);
if (__is_valid_data_blkaddr(blkaddr) &&
!f2fs_is_valid_blkaddr(F2FS_I_SB(inode),
@@ -793,6 +799,8 @@ int f2fs_getattr(const struct path *path, struct kstat *stat,
}
flags = fi->i_flags;
+ if (flags & F2FS_COMPR_FL)
+ stat->attributes |= STATX_ATTR_COMPRESSED;
if (flags & F2FS_APPEND_FL)
stat->attributes |= STATX_ATTR_APPEND;
if (IS_ENCRYPTED(inode))
@@ -804,7 +812,8 @@ int f2fs_getattr(const struct path *path, struct kstat *stat,
if (IS_VERITY(inode))
stat->attributes |= STATX_ATTR_VERITY;
- stat->attributes_mask |= (STATX_ATTR_APPEND |
+ stat->attributes_mask |= (STATX_ATTR_COMPRESSED |
+ STATX_ATTR_APPEND |
STATX_ATTR_ENCRYPTED |
STATX_ATTR_IMMUTABLE |
STATX_ATTR_NODUMP |
@@ -929,10 +938,10 @@ int f2fs_setattr(struct dentry *dentry, struct iattr *attr)
if (err)
return err;
- down_write(&F2FS_I(inode)->i_sem);
+ spin_lock(&F2FS_I(inode)->i_size_lock);
inode->i_mtime = inode->i_ctime = current_time(inode);
F2FS_I(inode)->last_disk_size = i_size_read(inode);
- up_write(&F2FS_I(inode)->i_sem);
+ spin_unlock(&F2FS_I(inode)->i_size_lock);
}
__setattr_copy(inode, attr);
@@ -1109,8 +1118,7 @@ next_dnode:
done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) -
dn.ofs_in_node, len);
for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) {
- *blkaddr = datablock_addr(dn.inode,
- dn.node_page, dn.ofs_in_node);
+ *blkaddr = f2fs_data_blkaddr(&dn);
if (__is_valid_data_blkaddr(*blkaddr) &&
!f2fs_is_valid_blkaddr(sbi, *blkaddr,
@@ -1121,7 +1129,7 @@ next_dnode:
if (!f2fs_is_checkpointed_data(sbi, *blkaddr)) {
- if (test_opt(sbi, LFS)) {
+ if (f2fs_lfs_mode(sbi)) {
f2fs_put_dnode(&dn);
return -EOPNOTSUPP;
}
@@ -1199,8 +1207,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode,
ADDRS_PER_PAGE(dn.node_page, dst_inode) -
dn.ofs_in_node, len - i);
do {
- dn.data_blkaddr = datablock_addr(dn.inode,
- dn.node_page, dn.ofs_in_node);
+ dn.data_blkaddr = f2fs_data_blkaddr(&dn);
f2fs_truncate_data_blocks_range(&dn, 1);
if (do_replace[i]) {
@@ -1376,8 +1383,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start,
int ret;
for (; index < end; index++, dn->ofs_in_node++) {
- if (datablock_addr(dn->inode, dn->node_page,
- dn->ofs_in_node) == NULL_ADDR)
+ if (f2fs_data_blkaddr(dn) == NULL_ADDR)
count++;
}
@@ -1388,8 +1394,7 @@ static int f2fs_do_zero_range(struct dnode_of_data *dn, pgoff_t start,
dn->ofs_in_node = ofs_in_node;
for (index = start; index < end; index++, dn->ofs_in_node++) {
- dn->data_blkaddr = datablock_addr(dn->inode,
- dn->node_page, dn->ofs_in_node);
+ dn->data_blkaddr = f2fs_data_blkaddr(dn);
/*
* f2fs_reserve_new_blocks will not guarantee entire block
* allocation.
@@ -1787,12 +1792,15 @@ static int f2fs_file_flush(struct file *file, fl_owner_t id)
static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask)
{
struct f2fs_inode_info *fi = F2FS_I(inode);
+ u32 masked_flags = fi->i_flags & mask;
+
+ f2fs_bug_on(F2FS_I_SB(inode), (iflags & ~mask));
/* Is it quota file? Do not allow user to mess with it */
if (IS_NOQUOTA(inode))
return -EPERM;
- if ((iflags ^ fi->i_flags) & F2FS_CASEFOLD_FL) {
+ if ((iflags ^ masked_flags) & F2FS_CASEFOLD_FL) {
if (!f2fs_sb_has_casefold(F2FS_I_SB(inode)))
return -EOPNOTSUPP;
if (!f2fs_empty_dir(inode))
@@ -1806,27 +1814,22 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask)
return -EINVAL;
}
- if ((iflags ^ fi->i_flags) & F2FS_COMPR_FL) {
- if (S_ISREG(inode->i_mode) &&
- (fi->i_flags & F2FS_COMPR_FL || i_size_read(inode) ||
- F2FS_HAS_BLOCKS(inode)))
- return -EINVAL;
+ if ((iflags ^ masked_flags) & F2FS_COMPR_FL) {
+ if (masked_flags & F2FS_COMPR_FL) {
+ if (f2fs_disable_compressed_file(inode))
+ return -EINVAL;
+ }
if (iflags & F2FS_NOCOMP_FL)
return -EINVAL;
if (iflags & F2FS_COMPR_FL) {
- int err = f2fs_convert_inline_inode(inode);
-
- if (err)
- return err;
-
if (!f2fs_may_compress(inode))
return -EINVAL;
set_compress_context(inode);
}
}
- if ((iflags ^ fi->i_flags) & F2FS_NOCOMP_FL) {
- if (fi->i_flags & F2FS_COMPR_FL)
+ if ((iflags ^ masked_flags) & F2FS_NOCOMP_FL) {
+ if (masked_flags & F2FS_COMPR_FL)
return -EINVAL;
}
@@ -3401,6 +3404,21 @@ out:
return err;
}
+static int f2fs_get_compress_blocks(struct file *filp, unsigned long arg)
+{
+ struct inode *inode = file_inode(filp);
+ __u64 blocks;
+
+ if (!f2fs_sb_has_compression(F2FS_I_SB(inode)))
+ return -EOPNOTSUPP;
+
+ if (!f2fs_compressed_file(inode))
+ return -EINVAL;
+
+ blocks = F2FS_I(inode)->i_compr_blocks;
+ return put_user(blocks, (u64 __user *)arg);
+}
+
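
From userspace the new request can be exercised roughly as below (a sketch, not
shipped with the patch; the local #define mirrors F2FS_IOCTL_MAGIC 0xf5 and the
_IOR definition above, and is only needed until the uapi headers export it):

	#include <stdio.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <sys/ioctl.h>
	#include <linux/ioctl.h>
	#include <linux/types.h>

	#ifndef F2FS_IOC_GET_COMPRESS_BLOCKS
	#define F2FS_IOCTL_MAGIC		0xf5
	#define F2FS_IOC_GET_COMPRESS_BLOCKS	_IOR(F2FS_IOCTL_MAGIC, 17, __u64)
	#endif

	int main(int argc, char **argv)
	{
		__u64 blocks = 0;
		int fd;

		if (argc != 2)
			return 1;
		fd = open(argv[1], O_RDONLY);
		if (fd < 0)
			return 1;
		/* -EOPNOTSUPP without the compression feature, -EINVAL for non-compressed files */
		if (ioctl(fd, F2FS_IOC_GET_COMPRESS_BLOCKS, &blocks) < 0) {
			close(fd);
			return 1;
		}
		printf("%s: %llu compressed blocks\n", argv[1],
		       (unsigned long long)blocks);
		close(fd);
		return 0;
	}
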
long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
{
if (unlikely(f2fs_cp_error(F2FS_I_SB(file_inode(filp)))))
@@ -3481,6 +3499,8 @@ long f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
return f2fs_get_volume_name(filp, arg);
case F2FS_IOC_SET_VOLUME_NAME:
return f2fs_set_volume_name(filp, arg);
+ case F2FS_IOC_GET_COMPRESS_BLOCKS:
+ return f2fs_get_compress_blocks(filp, arg);
default:
return -ENOTTY;
}
@@ -3508,8 +3528,10 @@ static ssize_t f2fs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
goto out;
}
- if (!f2fs_is_compress_backend_ready(inode))
- return -EOPNOTSUPP;
+ if (!f2fs_is_compress_backend_ready(inode)) {
+ ret = -EOPNOTSUPP;
+ goto out;
+ }
if (iocb->ki_flags & IOCB_NOWAIT) {
if (!inode_trylock(inode)) {
@@ -3639,6 +3661,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
case FS_IOC_MEASURE_VERITY:
case F2FS_IOC_GET_VOLUME_NAME:
case F2FS_IOC_SET_VOLUME_NAME:
+ case F2FS_IOC_GET_COMPRESS_BLOCKS:
break;
default:
return -ENOIOCTLCMD;
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index db8725d473b5..26248c8936db 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -31,6 +31,8 @@ static int gc_thread_func(void *data)
set_freezable();
do {
+ bool sync_mode;
+
wait_event_interruptible_timeout(*wq,
kthread_should_stop() || freezing(current) ||
gc_th->gc_wake,
@@ -101,15 +103,17 @@ static int gc_thread_func(void *data)
do_gc:
stat_inc_bggc_count(sbi->stat_info);
+ sync_mode = F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC;
+
/* if return value is not zero, no victim was selected */
- if (f2fs_gc(sbi, test_opt(sbi, FORCE_FG_GC), true, NULL_SEGNO))
+ if (f2fs_gc(sbi, sync_mode, true, NULL_SEGNO))
wait_ms = gc_th->no_gc_sleep_time;
trace_f2fs_background_gc(sbi->sb, wait_ms,
prefree_segments(sbi), free_segments(sbi));
/* balancing f2fs's metadata periodically */
- f2fs_balance_fs_bg(sbi);
+ f2fs_balance_fs_bg(sbi, true);
next:
sb_end_write(sbi->sb);
@@ -192,7 +196,10 @@ static void select_policy(struct f2fs_sb_info *sbi, int gc_type,
p->ofs_unit = sbi->segs_per_sec;
}
- /* we need to check every dirty segments in the FG_GC case */
+ /*
+ * adjust the candidate range: all dirty segments should be selected
+ * for the foreground GC and urgent GC cases.
+ */
if (gc_type != FG_GC &&
(sbi->gc_mode != GC_URGENT) &&
p->max_search > sbi->max_victim_search)
@@ -634,7 +641,7 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum,
}
*nofs = ofs_of_node(node_page);
- source_blkaddr = datablock_addr(NULL, node_page, ofs_in_node);
+ source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node);
f2fs_put_page(node_page, 1);
if (source_blkaddr != blkaddr) {
@@ -762,7 +769,7 @@ static int move_data_block(struct inode *inode, block_t bidx,
struct page *page, *mpage;
block_t newaddr;
int err = 0;
- bool lfs_mode = test_opt(fio.sbi, LFS);
+ bool lfs_mode = f2fs_lfs_mode(fio.sbi);
/* do not read out */
page = f2fs_grab_cache_page(inode->i_mapping, bidx, false);
@@ -970,7 +977,8 @@ retry:
if (err) {
clear_cold_data(page);
if (err == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC,
+ DEFAULT_IO_TIMEOUT);
goto retry;
}
if (is_dirty)
@@ -1018,8 +1026,8 @@ next_step:
* race condition along with SSR block allocation.
*/
if ((gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) ||
- get_valid_blocks(sbi, segno, false) ==
- sbi->blocks_per_seg)
+ get_valid_blocks(sbi, segno, true) ==
+ BLKS_PER_SEC(sbi))
return submitted;
if (check_valid_map(sbi, segno, off) == 0)
@@ -1203,7 +1211,7 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
if (get_valid_blocks(sbi, segno, false) == 0)
goto freed;
- if (__is_large_section(sbi) &&
+ if (gc_type == BG_GC && __is_large_section(sbi) &&
migrated >= sbi->migration_granularity)
goto skip;
if (!PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi)))
@@ -1233,12 +1241,12 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi,
segno, gc_type);
stat_inc_seg_count(sbi, type, gc_type);
+ migrated++;
freed:
if (gc_type == FG_GC &&
get_valid_blocks(sbi, segno, false) == 0)
seg_freed++;
- migrated++;
if (__is_large_section(sbi) && segno + 1 < end_segno)
sbi->next_victim_seg[gc_type] = segno + 1;
@@ -1434,12 +1442,19 @@ static int free_segment_range(struct f2fs_sb_info *sbi, unsigned int start,
static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
{
struct f2fs_super_block *raw_sb = F2FS_RAW_SUPER(sbi);
- int section_count = le32_to_cpu(raw_sb->section_count);
- int segment_count = le32_to_cpu(raw_sb->segment_count);
- int segment_count_main = le32_to_cpu(raw_sb->segment_count_main);
- long long block_count = le64_to_cpu(raw_sb->block_count);
+ int section_count;
+ int segment_count;
+ int segment_count_main;
+ long long block_count;
int segs = secs * sbi->segs_per_sec;
+ down_write(&sbi->sb_lock);
+
+ section_count = le32_to_cpu(raw_sb->section_count);
+ segment_count = le32_to_cpu(raw_sb->segment_count);
+ segment_count_main = le32_to_cpu(raw_sb->segment_count_main);
+ block_count = le64_to_cpu(raw_sb->block_count);
+
raw_sb->section_count = cpu_to_le32(section_count + secs);
raw_sb->segment_count = cpu_to_le32(segment_count + segs);
raw_sb->segment_count_main = cpu_to_le32(segment_count_main + segs);
@@ -1453,6 +1468,8 @@ static void update_sb_metadata(struct f2fs_sb_info *sbi, int secs)
raw_sb->devs[last_dev].total_segments =
cpu_to_le32(dev_segs + segs);
}
+
+ up_write(&sbi->sb_lock);
}
static void update_fs_metadata(struct f2fs_sb_info *sbi, int secs)
@@ -1570,11 +1587,17 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count)
goto out;
}
+ mutex_lock(&sbi->cp_mutex);
update_fs_metadata(sbi, -secs);
clear_sbi_flag(sbi, SBI_IS_RESIZEFS);
+ set_sbi_flag(sbi, SBI_IS_DIRTY);
+ mutex_unlock(&sbi->cp_mutex);
+
err = f2fs_sync_fs(sbi->sb, 1);
if (err) {
+ mutex_lock(&sbi->cp_mutex);
update_fs_metadata(sbi, secs);
+ mutex_unlock(&sbi->cp_mutex);
update_sb_metadata(sbi, secs);
f2fs_commit_super(sbi, false);
}
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index 78c3f1d70f1d..44582a4db513 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -291,13 +291,30 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page)
fi->i_flags & F2FS_COMPR_FL &&
F2FS_FITS_IN_INODE(ri, fi->i_extra_isize,
i_log_cluster_size)) {
- if (ri->i_compress_algorithm >= COMPRESS_MAX)
+ if (ri->i_compress_algorithm >= COMPRESS_MAX) {
+ f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported "
+ "compress algorithm: %u, run fsck to fix",
+ __func__, inode->i_ino,
+ ri->i_compress_algorithm);
return false;
- if (le64_to_cpu(ri->i_compr_blocks) > inode->i_blocks)
+ }
+ if (le64_to_cpu(ri->i_compr_blocks) >
+ SECTOR_TO_BLOCK(inode->i_blocks)) {
+ f2fs_warn(sbi, "%s: inode (ino=%lx) has inconsistent "
+ "i_compr_blocks:%llu, i_blocks:%llu, run fsck to fix",
+ __func__, inode->i_ino,
+ le64_to_cpu(ri->i_compr_blocks),
+ SECTOR_TO_BLOCK(inode->i_blocks));
return false;
+ }
if (ri->i_log_cluster_size < MIN_COMPRESS_LOG_SIZE ||
- ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE)
+ ri->i_log_cluster_size > MAX_COMPRESS_LOG_SIZE) {
+ f2fs_warn(sbi, "%s: inode (ino=%lx) has unsupported "
+ "log cluster size: %u, run fsck to fix",
+ __func__, inode->i_ino,
+ ri->i_log_cluster_size);
return false;
+ }
}
return true;
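The i_compr_blocks check compares a count of filesystem blocks against inode->i_blocks, which is kept in 512-byte sectors, so the value must be scaled with SECTOR_TO_BLOCK() first. A quick sketch of that conversion, assuming 4 KiB blocks (eight sectors per block); the macro definitions here are illustrative stand-ins:

/* assumed definitions for 4 KiB blocks; f2fs keeps an equivalent shift */
#define LOG_SECTORS_PER_BLOCK		3	/* 4096 / 512 == 1 << 3 */
#define SECTOR_TO_BLOCK(sectors)	((sectors) >> LOG_SECTORS_PER_BLOCK)

static bool compr_blocks_sane(u64 i_compr_blocks, u64 i_blocks_in_sectors)
{
	/* an inode cannot hold more compressed blocks than blocks overall */
	return i_compr_blocks <= SECTOR_TO_BLOCK(i_blocks_in_sectors);
}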
@@ -345,7 +362,7 @@ static int do_read_inode(struct inode *inode)
fi->i_flags = le32_to_cpu(ri->i_flags);
if (S_ISREG(inode->i_mode))
fi->i_flags &= ~F2FS_PROJINHERIT_FL;
- fi->flags = 0;
+ bitmap_zero(fi->flags, FI_MAX);
fi->i_advise = ri->i_advise;
fi->i_pino = le32_to_cpu(ri->i_pino);
fi->i_dir_level = ri->i_dir_level;
@@ -518,7 +535,7 @@ retry:
inode = f2fs_iget(sb, ino);
if (IS_ERR(inode)) {
if (PTR_ERR(inode) == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
goto retry;
}
}
@@ -759,7 +776,7 @@ no_delete:
else
f2fs_inode_synced(inode);
- /* ino == 0, if f2fs_new_inode() was failed t*/
+ /* if f2fs_new_inode() failed, i_ino is zero; skip it */
if (inode->i_ino)
invalidate_mapping_pages(NODE_MAPPING(sbi), inode->i_ino,
inode->i_ino);
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 2aa035422c0f..f54119da2217 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -75,9 +75,7 @@ static struct inode *f2fs_new_inode(struct inode *dir, umode_t mode)
set_inode_flag(inode, FI_NEW_INODE);
- /* If the directory encrypted, then we should encrypt the inode. */
- if ((IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) &&
- f2fs_may_encrypt(inode))
+ if (f2fs_may_encrypt(dir, inode))
f2fs_set_encrypted_inode(inode);
if (f2fs_sb_has_extra_attr(sbi)) {
@@ -177,7 +175,7 @@ static inline int is_extension_exist(const unsigned char *s, const char *sub)
}
/*
- * Set multimedia files as cold files for hot/cold data separation
+ * Set file's temperature for hot/cold data separation
*/
static inline void set_file_temperature(struct f2fs_sb_info *sbi, struct inode *inode,
const unsigned char *name)
@@ -876,12 +874,6 @@ static int f2fs_tmpfile(struct inode *dir, struct dentry *dentry, umode_t mode)
if (!f2fs_is_checkpoint_ready(sbi))
return -ENOSPC;
- if (IS_ENCRYPTED(dir) || DUMMY_ENCRYPTION_ENABLED(sbi)) {
- int err = fscrypt_get_encryption_info(dir);
- if (err)
- return err;
- }
-
return __f2fs_tmpfile(dir, dentry, mode, NULL);
}
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 9d02cdcdbb07..ecbd6bd14a49 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -510,9 +510,6 @@ int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink)
return nr - nr_shrink;
}
-/*
- * This function always returns success
- */
int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid,
struct node_info *ni)
{
@@ -716,8 +713,7 @@ got:
/*
* Caller should call f2fs_put_dnode(dn).
* Also, it should grab and release a rwsem by calling f2fs_lock_op() and
- * f2fs_unlock_op() only if ro is not set RDONLY_NODE.
- * In the case of RDONLY_NODE, we don't need to care about mutex.
+ * f2fs_unlock_op() only if mode is set to ALLOC_NODE.
*/
int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
{
@@ -809,8 +805,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode)
dn->nid = nids[level];
dn->ofs_in_node = offset[level];
dn->node_page = npage[level];
- dn->data_blkaddr = datablock_addr(dn->inode,
- dn->node_page, dn->ofs_in_node);
+ dn->data_blkaddr = f2fs_data_blkaddr(dn);
return 0;
release_pages:
@@ -1188,8 +1183,9 @@ int f2fs_remove_inode_page(struct inode *inode)
}
if (unlikely(inode->i_blocks != 0 && inode->i_blocks != 8)) {
- f2fs_warn(F2FS_I_SB(inode), "Inconsistent i_blocks, ino:%lu, iblocks:%llu",
- inode->i_ino, (unsigned long long)inode->i_blocks);
+ f2fs_warn(F2FS_I_SB(inode),
+ "f2fs_remove_inode_page: inconsistent i_blocks, ino:%lu, iblocks:%llu",
+ inode->i_ino, (unsigned long long)inode->i_blocks);
set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK);
}
@@ -1562,15 +1558,16 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted,
if (atomic && !test_opt(sbi, NOBARRIER))
fio.op_flags |= REQ_PREFLUSH | REQ_FUA;
- set_page_writeback(page);
- ClearPageError(page);
-
+ /* should add to global list before clearing PAGECACHE status */
if (f2fs_in_warm_node_list(sbi, page)) {
seq = f2fs_add_fsync_node_entry(sbi, page);
if (seq_id)
*seq_id = seq;
}
+ set_page_writeback(page);
+ ClearPageError(page);
+
fio.old_blkaddr = ni.blk_addr;
f2fs_do_write_node_page(nid, &fio);
set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page));
@@ -1979,7 +1976,7 @@ static int f2fs_write_node_pages(struct address_space *mapping,
goto skip_write;
/* balancing f2fs's metadata in background */
- f2fs_balance_fs_bg(sbi);
+ f2fs_balance_fs_bg(sbi, true);
/* collect a number of dirty node pages and write together */
if (wbc->sync_mode != WB_SYNC_ALL &&
@@ -2602,7 +2599,7 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page)
retry:
ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false);
if (!ipage) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
goto retry;
}
@@ -3193,22 +3190,22 @@ void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi)
int __init f2fs_create_node_manager_caches(void)
{
- nat_entry_slab = f2fs_kmem_cache_create("nat_entry",
+ nat_entry_slab = f2fs_kmem_cache_create("f2fs_nat_entry",
sizeof(struct nat_entry));
if (!nat_entry_slab)
goto fail;
- free_nid_slab = f2fs_kmem_cache_create("free_nid",
+ free_nid_slab = f2fs_kmem_cache_create("f2fs_free_nid",
sizeof(struct free_nid));
if (!free_nid_slab)
goto destroy_nat_entry;
- nat_entry_set_slab = f2fs_kmem_cache_create("nat_entry_set",
+ nat_entry_set_slab = f2fs_kmem_cache_create("f2fs_nat_entry_set",
sizeof(struct nat_entry_set));
if (!nat_entry_set_slab)
goto destroy_free_nid;
- fsync_node_entry_slab = f2fs_kmem_cache_create("fsync_node_entry",
+ fsync_node_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_node_entry",
sizeof(struct fsync_node_entry));
if (!fsync_node_entry_slab)
goto destroy_nat_entry_set;
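Renaming the node-manager caches only changes the string handed to the slab allocator, so entries in /proc/slabinfo (and merged slabs) are clearly attributable to f2fs. A sketch of the underlying call, assuming f2fs_kmem_cache_create() passes the name straight through to kmem_cache_create(); the flags shown are illustrative:

#include <linux/slab.h>

static struct kmem_cache *nat_entry_slab;

static int __init create_caches(void)
{
	/* the "f2fs_" prefix is what shows up in /proc/slabinfo */
	nat_entry_slab = kmem_cache_create("f2fs_nat_entry",
					   sizeof(struct nat_entry), 0,
					   SLAB_RECLAIM_ACCOUNT, NULL);
	return nat_entry_slab ? 0 : -ENOMEM;
}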
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 763d5c0951d1..dd804c07eeb0 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -496,8 +496,7 @@ out:
return 0;
truncate_out:
- if (datablock_addr(tdn.inode, tdn.node_page,
- tdn.ofs_in_node) == blkaddr)
+ if (f2fs_data_blkaddr(&tdn) == blkaddr)
f2fs_truncate_data_blocks_range(&tdn, 1);
if (dn->inode->i_ino == nid && !dn->inode_page_locked)
unlock_page(dn->inode_page);
@@ -535,7 +534,7 @@ retry_dn:
err = f2fs_get_dnode_of_data(&dn, start, ALLOC_NODE);
if (err) {
if (err == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
goto retry_dn;
}
goto out;
@@ -560,8 +559,8 @@ retry_dn:
for (; start < end; start++, dn.ofs_in_node++) {
block_t src, dest;
- src = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node);
- dest = datablock_addr(dn.inode, page, dn.ofs_in_node);
+ src = f2fs_data_blkaddr(&dn);
+ dest = data_blkaddr(dn.inode, page, dn.ofs_in_node);
if (__is_valid_data_blkaddr(src) &&
!f2fs_is_valid_blkaddr(sbi, src, META_POR)) {
@@ -618,7 +617,8 @@ retry_prev:
err = check_index_in_prev_nodes(sbi, dest, &dn);
if (err) {
if (err == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC,
+ DEFAULT_IO_TIMEOUT);
goto retry_prev;
}
goto err;
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index cf0eb002cfd4..b7a9421472a7 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -172,7 +172,7 @@ bool f2fs_need_SSR(struct f2fs_sb_info *sbi)
int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA);
- if (test_opt(sbi, LFS))
+ if (f2fs_lfs_mode(sbi))
return false;
if (sbi->gc_mode == GC_URGENT)
return true;
@@ -245,7 +245,8 @@ retry:
LOOKUP_NODE);
if (err) {
if (err == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC,
+ DEFAULT_IO_TIMEOUT);
cond_resched();
goto retry;
}
@@ -312,7 +313,7 @@ next:
skip:
iput(inode);
}
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
cond_resched();
if (gc_failure) {
if (++looped >= count)
@@ -415,7 +416,8 @@ retry:
err = f2fs_do_write_data_page(&fio);
if (err) {
if (err == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC,
+ DEFAULT_IO_TIMEOUT);
cond_resched();
goto retry;
}
@@ -494,7 +496,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
/* balance_fs_bg is able to be pending */
if (need && excess_cached_nats(sbi))
- f2fs_balance_fs_bg(sbi);
+ f2fs_balance_fs_bg(sbi, false);
if (!f2fs_is_checkpoint_ready(sbi))
return;
@@ -509,7 +511,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need)
}
}
-void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
+void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi, bool from_bg)
{
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
return;
@@ -538,7 +540,7 @@ void f2fs_balance_fs_bg(struct f2fs_sb_info *sbi)
excess_dirty_nats(sbi) ||
excess_dirty_nodes(sbi) ||
f2fs_time_over(sbi, CP_TIME)) {
- if (test_opt(sbi, DATA_FLUSH)) {
+ if (test_opt(sbi, DATA_FLUSH) && from_bg) {
struct blk_plug plug;
mutex_lock(&sbi->flush_lock);
@@ -1078,7 +1080,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
dpolicy->max_requests = DEF_MAX_DISCARD_REQUEST;
dpolicy->io_aware_gran = MAX_PLIST_NUM;
- dpolicy->timeout = 0;
+ dpolicy->timeout = false;
if (discard_type == DPOLICY_BG) {
dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME;
@@ -1103,6 +1105,7 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi,
dpolicy->io_aware = false;
/* we need to issue all to keep CP_TRIMMED_FLAG */
dpolicy->granularity = 1;
+ dpolicy->timeout = true;
}
}
@@ -1471,12 +1474,12 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi,
int i, issued = 0;
bool io_interrupted = false;
- if (dpolicy->timeout != 0)
- f2fs_update_time(sbi, dpolicy->timeout);
+ if (dpolicy->timeout)
+ f2fs_update_time(sbi, UMOUNT_DISCARD_TIMEOUT);
for (i = MAX_PLIST_NUM - 1; i >= 0; i--) {
- if (dpolicy->timeout != 0 &&
- f2fs_time_over(sbi, dpolicy->timeout))
+ if (dpolicy->timeout &&
+ f2fs_time_over(sbi, UMOUNT_DISCARD_TIMEOUT))
break;
if (i + 1 < dpolicy->granularity)
@@ -1497,8 +1500,8 @@ static int __issue_discard_cmd(struct f2fs_sb_info *sbi,
list_for_each_entry_safe(dc, tmp, pend_list, list) {
f2fs_bug_on(sbi, dc->state != D_PREP);
- if (dpolicy->timeout != 0 &&
- f2fs_time_over(sbi, dpolicy->timeout))
+ if (dpolicy->timeout &&
+ f2fs_time_over(sbi, UMOUNT_DISCARD_TIMEOUT))
break;
if (dpolicy->io_aware && i < dpolicy->io_aware_gran &&
@@ -1677,7 +1680,6 @@ bool f2fs_issue_discard_timeout(struct f2fs_sb_info *sbi)
__init_discard_policy(sbi, &dpolicy, DPOLICY_UMOUNT,
dcc->discard_granularity);
- dpolicy.timeout = UMOUNT_DISCARD_TIMEOUT;
__issue_discard_cmd(sbi, &dpolicy);
dropped = __drop_discard_cmd(sbi);
@@ -1940,7 +1942,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
unsigned int start = 0, end = -1;
unsigned int secno, start_segno;
bool force = (cpc->reason & CP_DISCARD);
- bool need_align = test_opt(sbi, LFS) && __is_large_section(sbi);
+ bool need_align = f2fs_lfs_mode(sbi) && __is_large_section(sbi);
mutex_lock(&dirty_i->seglist_lock);
@@ -1972,7 +1974,7 @@ void f2fs_clear_prefree_segments(struct f2fs_sb_info *sbi,
(end - 1) <= cpc->trim_end)
continue;
- if (!test_opt(sbi, LFS) || !__is_large_section(sbi)) {
+ if (!f2fs_lfs_mode(sbi) || !__is_large_section(sbi)) {
f2fs_issue_discard(sbi, START_BLOCK(sbi, start),
(end - start) << sbi->log_blocks_per_seg);
continue;
@@ -2801,7 +2803,7 @@ next:
blk_finish_plug(&plug);
mutex_unlock(&dcc->cmd_lock);
trimmed += __wait_all_discard_cmd(sbi, NULL);
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC, DEFAULT_IO_TIMEOUT);
goto next;
}
skip:
@@ -2830,7 +2832,7 @@ int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range)
struct discard_policy dpolicy;
unsigned long long trimmed = 0;
int err = 0;
- bool need_align = test_opt(sbi, LFS) && __is_large_section(sbi);
+ bool need_align = f2fs_lfs_mode(sbi) && __is_large_section(sbi);
if (start >= MAX_BLKADDR(sbi) || range->len < sbi->blocksize)
return -EINVAL;
@@ -3193,7 +3195,7 @@ static void update_device_state(struct f2fs_io_info *fio)
static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio)
{
int type = __get_segment_type(fio);
- bool keep_order = (test_opt(fio->sbi, LFS) && type == CURSEG_COLD_DATA);
+ bool keep_order = (f2fs_lfs_mode(fio->sbi) && type == CURSEG_COLD_DATA);
if (keep_order)
down_read(&fio->sbi->io_order_lock);
@@ -4071,7 +4073,7 @@ static int build_sit_info(struct f2fs_sb_info *sbi)
sit_i->dirty_sentries = 0;
sit_i->sents_per_block = SIT_ENTRY_PER_BLOCK;
sit_i->elapsed_time = le64_to_cpu(sbi->ckpt->elapsed_time);
- sit_i->mounted_time = ktime_get_real_seconds();
+ sit_i->mounted_time = ktime_get_boottime_seconds();
init_rwsem(&sit_i->sentry_lock);
return 0;
}
@@ -4678,7 +4680,7 @@ int f2fs_build_segment_manager(struct f2fs_sb_info *sbi)
if (sm_info->rec_prefree_segments > DEF_MAX_RECLAIM_PREFREE_SEGMENTS)
sm_info->rec_prefree_segments = DEF_MAX_RECLAIM_PREFREE_SEGMENTS;
- if (!test_opt(sbi, LFS))
+ if (!f2fs_lfs_mode(sbi))
sm_info->ipu_policy = 1 << F2FS_IPU_FSYNC;
sm_info->min_ipu_util = DEF_MIN_IPU_UTIL;
sm_info->min_fsync_blocks = DEF_MIN_FSYNC_BLOCKS;
@@ -4830,22 +4832,22 @@ void f2fs_destroy_segment_manager(struct f2fs_sb_info *sbi)
int __init f2fs_create_segment_manager_caches(void)
{
- discard_entry_slab = f2fs_kmem_cache_create("discard_entry",
+ discard_entry_slab = f2fs_kmem_cache_create("f2fs_discard_entry",
sizeof(struct discard_entry));
if (!discard_entry_slab)
goto fail;
- discard_cmd_slab = f2fs_kmem_cache_create("discard_cmd",
+ discard_cmd_slab = f2fs_kmem_cache_create("f2fs_discard_cmd",
sizeof(struct discard_cmd));
if (!discard_cmd_slab)
goto destroy_discard_entry;
- sit_entry_set_slab = f2fs_kmem_cache_create("sit_entry_set",
+ sit_entry_set_slab = f2fs_kmem_cache_create("f2fs_sit_entry_set",
sizeof(struct sit_entry_set));
if (!sit_entry_set_slab)
goto destroy_discard_cmd;
- inmem_entry_slab = f2fs_kmem_cache_create("inmem_page_entry",
+ inmem_entry_slab = f2fs_kmem_cache_create("f2fs_inmem_page_entry",
sizeof(struct inmem_pages));
if (!inmem_entry_slab)
goto destroy_sit_entry_set;
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index 459dc3901a57..7a83bd530812 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -756,7 +756,7 @@ static inline unsigned long long get_mtime(struct f2fs_sb_info *sbi,
bool base_time)
{
struct sit_info *sit_i = SIT_I(sbi);
- time64_t diff, now = ktime_get_real_seconds();
+ time64_t diff, now = ktime_get_boottime_seconds();
if (now >= sit_i->mounted_time)
return sit_i->elapsed_time + now - sit_i->mounted_time;
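mounted_time and get_mtime() now read ktime_get_boottime_seconds(), which advances monotonically (including across suspend) and is immune to settimeofday()/NTP jumps, so segment mtimes can no longer move backwards when the wall clock is adjusted. A small sketch of the elapsed-time calculation under that clock:

#include <linux/timekeeping.h>

static time64_t mounted_time;		/* recorded once at mount */

static void record_mount_time(void)
{
	mounted_time = ktime_get_boottime_seconds();
}

/* seconds elapsed since mount; never decreases on wall-clock changes */
static u64 seconds_since_mount(u64 elapsed_at_mount)
{
	time64_t now = ktime_get_boottime_seconds();

	if (now >= mounted_time)
		return elapsed_at_mount + now - mounted_time;
	return elapsed_at_mount;
}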
diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c
index a467aca29cfe..d66de5999a26 100644
--- a/fs/f2fs/shrinker.c
+++ b/fs/f2fs/shrinker.c
@@ -58,7 +58,7 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink,
/* count extent cache entries */
count += __count_extent_cache(sbi);
- /* shrink clean nat cache entries */
+ /* count clean nat cache entries */
count += __count_nat_entries(sbi);
/* count free nids cache entries */
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index d398b2d90c6c..f2dfc21c6abb 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -428,14 +428,11 @@ static int parse_options(struct super_block *sb, char *options)
if (!name)
return -ENOMEM;
if (strlen(name) == 2 && !strncmp(name, "on", 2)) {
- set_opt(sbi, BG_GC);
- clear_opt(sbi, FORCE_FG_GC);
+ F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON;
} else if (strlen(name) == 3 && !strncmp(name, "off", 3)) {
- clear_opt(sbi, BG_GC);
- clear_opt(sbi, FORCE_FG_GC);
+ F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_OFF;
} else if (strlen(name) == 4 && !strncmp(name, "sync", 4)) {
- set_opt(sbi, BG_GC);
- set_opt(sbi, FORCE_FG_GC);
+ F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_SYNC;
} else {
kvfree(name);
return -EINVAL;
@@ -447,7 +444,7 @@ static int parse_options(struct super_block *sb, char *options)
break;
case Opt_norecovery:
/* this option mounts f2fs with ro */
- set_opt(sbi, DISABLE_ROLL_FORWARD);
+ set_opt(sbi, NORECOVERY);
if (!f2fs_readonly(sb))
return -EINVAL;
break;
@@ -601,10 +598,10 @@ static int parse_options(struct super_block *sb, char *options)
kvfree(name);
return -EINVAL;
}
- set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE);
+ F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE;
} else if (strlen(name) == 3 &&
!strncmp(name, "lfs", 3)) {
- set_opt_mode(sbi, F2FS_MOUNT_LFS);
+ F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS;
} else {
kvfree(name);
return -EINVAL;
@@ -833,6 +830,10 @@ static int parse_options(struct super_block *sb, char *options)
!strcmp(name, "lz4")) {
F2FS_OPTION(sbi).compress_algorithm =
COMPRESS_LZ4;
+ } else if (strlen(name) == 4 &&
+ !strcmp(name, "zstd")) {
+ F2FS_OPTION(sbi).compress_algorithm =
+ COMPRESS_ZSTD;
} else {
kfree(name);
return -EINVAL;
@@ -905,7 +906,7 @@ static int parse_options(struct super_block *sb, char *options)
}
#endif
- if (F2FS_IO_SIZE_BITS(sbi) && !test_opt(sbi, LFS)) {
+ if (F2FS_IO_SIZE_BITS(sbi) && !f2fs_lfs_mode(sbi)) {
f2fs_err(sbi, "Should set mode=lfs with %uKB-sized IO",
F2FS_IO_SIZE_KB(sbi));
return -EINVAL;
@@ -935,7 +936,7 @@ static int parse_options(struct super_block *sb, char *options)
}
}
- if (test_opt(sbi, DISABLE_CHECKPOINT) && test_opt(sbi, LFS)) {
+ if (test_opt(sbi, DISABLE_CHECKPOINT) && f2fs_lfs_mode(sbi)) {
f2fs_err(sbi, "LFS not compatible with checkpoint=disable\n");
return -EINVAL;
}
@@ -961,6 +962,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb)
/* Initialize f2fs-specific inode info */
atomic_set(&fi->dirty_pages, 0);
init_rwsem(&fi->i_sem);
+ spin_lock_init(&fi->i_size_lock);
INIT_LIST_HEAD(&fi->dirty_list);
INIT_LIST_HEAD(&fi->gdirty_list);
INIT_LIST_HEAD(&fi->inmem_ilist);
@@ -1173,7 +1175,7 @@ static void f2fs_put_super(struct super_block *sb)
/* our cp_error case, we can wait for any writeback page */
f2fs_flush_merged_writes(sbi);
- f2fs_wait_on_all_pages_writeback(sbi);
+ f2fs_wait_on_all_pages(sbi, F2FS_WB_CP_DATA);
f2fs_bug_on(sbi, sbi->fsync_node_num);
@@ -1205,6 +1207,7 @@ static void f2fs_put_super(struct super_block *sb)
kvfree(sbi->raw_super);
destroy_device_list(sbi);
+ f2fs_destroy_xattr_caches(sbi);
mempool_destroy(sbi->write_io_dummy);
#ifdef CONFIG_QUOTA
for (i = 0; i < MAXQUOTAS; i++)
@@ -1421,6 +1424,9 @@ static inline void f2fs_show_compress_options(struct seq_file *seq,
case COMPRESS_LZ4:
algtype = "lz4";
break;
+ case COMPRESS_ZSTD:
+ algtype = "zstd";
+ break;
}
seq_printf(seq, ",compress_algorithm=%s", algtype);
@@ -1437,16 +1443,17 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
{
struct f2fs_sb_info *sbi = F2FS_SB(root->d_sb);
- if (!f2fs_readonly(sbi->sb) && test_opt(sbi, BG_GC)) {
- if (test_opt(sbi, FORCE_FG_GC))
- seq_printf(seq, ",background_gc=%s", "sync");
- else
- seq_printf(seq, ",background_gc=%s", "on");
- } else {
+ if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC)
+ seq_printf(seq, ",background_gc=%s", "sync");
+ else if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_ON)
+ seq_printf(seq, ",background_gc=%s", "on");
+ else if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF)
seq_printf(seq, ",background_gc=%s", "off");
- }
+
if (test_opt(sbi, DISABLE_ROLL_FORWARD))
seq_puts(seq, ",disable_roll_forward");
+ if (test_opt(sbi, NORECOVERY))
+ seq_puts(seq, ",norecovery");
if (test_opt(sbi, DISCARD))
seq_puts(seq, ",discard");
else
@@ -1498,9 +1505,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root)
seq_puts(seq, ",data_flush");
seq_puts(seq, ",mode=");
- if (test_opt(sbi, ADAPTIVE))
+ if (F2FS_OPTION(sbi).fs_mode == FS_MODE_ADAPTIVE)
seq_puts(seq, "adaptive");
- else if (test_opt(sbi, LFS))
+ else if (F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS)
seq_puts(seq, "lfs");
seq_printf(seq, ",active_logs=%u", F2FS_OPTION(sbi).active_logs);
if (test_opt(sbi, RESERVE_ROOT))
@@ -1571,11 +1578,11 @@ static void default_options(struct f2fs_sb_info *sbi)
F2FS_OPTION(sbi).test_dummy_encryption = false;
F2FS_OPTION(sbi).s_resuid = make_kuid(&init_user_ns, F2FS_DEF_RESUID);
F2FS_OPTION(sbi).s_resgid = make_kgid(&init_user_ns, F2FS_DEF_RESGID);
- F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZO;
+ F2FS_OPTION(sbi).compress_algorithm = COMPRESS_LZ4;
F2FS_OPTION(sbi).compress_log_size = MIN_COMPRESS_LOG_SIZE;
F2FS_OPTION(sbi).compress_ext_cnt = 0;
+ F2FS_OPTION(sbi).bggc_mode = BGGC_MODE_ON;
- set_opt(sbi, BG_GC);
set_opt(sbi, INLINE_XATTR);
set_opt(sbi, INLINE_DATA);
set_opt(sbi, INLINE_DENTRY);
@@ -1587,9 +1594,9 @@ static void default_options(struct f2fs_sb_info *sbi)
set_opt(sbi, FLUSH_MERGE);
set_opt(sbi, DISCARD);
if (f2fs_sb_has_blkzoned(sbi))
- set_opt_mode(sbi, F2FS_MOUNT_LFS);
+ F2FS_OPTION(sbi).fs_mode = FS_MODE_LFS;
else
- set_opt_mode(sbi, F2FS_MOUNT_ADAPTIVE);
+ F2FS_OPTION(sbi).fs_mode = FS_MODE_ADAPTIVE;
#ifdef CONFIG_F2FS_FS_XATTR
set_opt(sbi, XATTR_USER);
@@ -1658,7 +1665,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi)
out_unlock:
up_write(&sbi->gc_lock);
restore_flag:
- sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */
+ sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */
return err;
}
@@ -1781,7 +1788,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data)
* or if background_gc = off is passed in mount
* option. Also sync the filesystem.
*/
- if ((*flags & SB_RDONLY) || !test_opt(sbi, BG_GC)) {
+ if ((*flags & SB_RDONLY) ||
+ F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF) {
if (sbi->gc_thread) {
f2fs_stop_gc_thread(sbi);
need_restart_gc = true;
@@ -1886,7 +1894,8 @@ repeat:
page = read_cache_page_gfp(mapping, blkidx, GFP_NOFS);
if (IS_ERR(page)) {
if (PTR_ERR(page) == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC,
+ DEFAULT_IO_TIMEOUT);
goto repeat;
}
set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
@@ -1928,6 +1937,7 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type,
int offset = off & (sb->s_blocksize - 1);
size_t towrite = len;
struct page *page;
+ void *fsdata = NULL;
char *kaddr;
int err = 0;
int tocopy;
@@ -1937,10 +1947,11 @@ static ssize_t f2fs_quota_write(struct super_block *sb, int type,
towrite);
retry:
err = a_ops->write_begin(NULL, mapping, off, tocopy, 0,
- &page, NULL);
+ &page, &fsdata);
if (unlikely(err)) {
if (err == -ENOMEM) {
- congestion_wait(BLK_RW_ASYNC, HZ/50);
+ congestion_wait(BLK_RW_ASYNC,
+ DEFAULT_IO_TIMEOUT);
goto retry;
}
set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR);
@@ -1953,7 +1964,7 @@ retry:
flush_dcache_page(page);
a_ops->write_end(NULL, mapping, off, tocopy, tocopy,
- page, NULL);
+ page, fsdata);
offset = 0;
towrite -= tocopy;
off += tocopy;
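f2fs_quota_write() now threads a real fsdata cookie from ->write_begin() to ->write_end() instead of passing NULL, so any per-call state the filesystem returns from write_begin is handed back to write_end. A generic sketch of that address_space_operations contract as it exists in this kernel generation; the helper name is made up:

static int copy_into_mapping(struct address_space *mapping, loff_t off,
			     const void *src, unsigned int len)
{
	const struct address_space_operations *a_ops = mapping->a_ops;
	struct page *page;
	void *fsdata = NULL;	/* opaque cookie owned by the filesystem */
	char *kaddr;
	int err;

	err = a_ops->write_begin(NULL, mapping, off, len, 0, &page, &fsdata);
	if (err)
		return err;

	kaddr = kmap_atomic(page);
	memcpy(kaddr + (off & (PAGE_SIZE - 1)), src, len);
	kunmap_atomic(kaddr);
	flush_dcache_page(page);

	/* hand the same cookie back so begin/end state stays paired */
	a_ops->write_end(NULL, mapping, off, len, len, page, fsdata);
	return 0;
}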
@@ -3457,12 +3468,17 @@ try_onemore:
}
}
+ /* init per sbi slab cache */
+ err = f2fs_init_xattr_caches(sbi);
+ if (err)
+ goto free_io_dummy;
+
/* get an inode for meta space */
sbi->meta_inode = f2fs_iget(sb, F2FS_META_INO(sbi));
if (IS_ERR(sbi->meta_inode)) {
f2fs_err(sbi, "Failed to read F2FS meta data inode");
err = PTR_ERR(sbi->meta_inode);
- goto free_io_dummy;
+ goto free_xattr_cache;
}
err = f2fs_get_valid_checkpoint(sbi);
@@ -3590,7 +3606,7 @@ try_onemore:
f2fs_err(sbi, "Cannot turn on quotas: error %d", err);
}
#endif
- /* if there are nt orphan nodes free them */
+ /* if there are any orphan inodes, free them */
err = f2fs_recover_orphan_inodes(sbi);
if (err)
goto free_meta;
@@ -3599,7 +3615,8 @@ try_onemore:
goto reset_checkpoint;
/* recover fsynced data */
- if (!test_opt(sbi, DISABLE_ROLL_FORWARD)) {
+ if (!test_opt(sbi, DISABLE_ROLL_FORWARD) &&
+ !test_opt(sbi, NORECOVERY)) {
/*
* mount should be failed, when device has readonly mode, and
* previous checkpoint was not done by clean system shutdown.
@@ -3665,7 +3682,7 @@ reset_checkpoint:
* If filesystem is not mounted as read-only then
* do start the gc_thread.
*/
- if (test_opt(sbi, BG_GC) && !f2fs_readonly(sb)) {
+ if (F2FS_OPTION(sbi).bggc_mode != BGGC_MODE_OFF && !f2fs_readonly(sb)) {
/* After POR, we can run background GC thread.*/
err = f2fs_start_gc_thread(sbi);
if (err)
@@ -3734,6 +3751,8 @@ free_meta_inode:
make_bad_inode(sbi->meta_inode);
iput(sbi->meta_inode);
sbi->meta_inode = NULL;
+free_xattr_cache:
+ f2fs_destroy_xattr_caches(sbi);
free_io_dummy:
mempool_destroy(sbi->write_io_dummy);
free_percpu:
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 91d649790b1b..e3bbbef9b4f0 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -109,47 +109,47 @@ static ssize_t features_show(struct f2fs_attr *a,
return sprintf(buf, "0\n");
if (f2fs_sb_has_encrypt(sbi))
- len += snprintf(buf, PAGE_SIZE - len, "%s",
+ len += scnprintf(buf, PAGE_SIZE - len, "%s",
"encryption");
if (f2fs_sb_has_blkzoned(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "blkzoned");
if (f2fs_sb_has_extra_attr(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "extra_attr");
if (f2fs_sb_has_project_quota(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "projquota");
if (f2fs_sb_has_inode_chksum(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "inode_checksum");
if (f2fs_sb_has_flexible_inline_xattr(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "flexible_inline_xattr");
if (f2fs_sb_has_quota_ino(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "quota_ino");
if (f2fs_sb_has_inode_crtime(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "inode_crtime");
if (f2fs_sb_has_lost_found(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "lost_found");
if (f2fs_sb_has_verity(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "verity");
if (f2fs_sb_has_sb_chksum(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "sb_checksum");
if (f2fs_sb_has_casefold(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "casefold");
if (f2fs_sb_has_compression(sbi))
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "compression");
- len += snprintf(buf + len, PAGE_SIZE - len, "%s%s",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
len ? ", " : "", "pin_file");
- len += snprintf(buf + len, PAGE_SIZE - len, "\n");
+ len += scnprintf(buf + len, PAGE_SIZE - len, "\n");
return len;
}
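snprintf() returns the length the output would have had, so once the page fills up the running "len" can exceed PAGE_SIZE and "buf + len" walks past the buffer; scnprintf() returns only what was actually stored (excluding the terminating NUL), which keeps the offset valid. A minimal sketch of the append pattern these sysfs show routines rely on:

static ssize_t show_feature_names(char *buf)
{
	ssize_t len = 0;

	/* scnprintf() never reports more than it wrote, so buf + len
	 * stays inside the PAGE_SIZE buffer even when output truncates */
	len += scnprintf(buf + len, PAGE_SIZE - len, "%s", "encryption");
	len += scnprintf(buf + len, PAGE_SIZE - len, "%s%s",
			 len ? ", " : "", "compression");
	len += scnprintf(buf + len, PAGE_SIZE - len, "\n");
	return len;
}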
@@ -185,6 +185,12 @@ static ssize_t encoding_show(struct f2fs_attr *a,
return sprintf(buf, "(none)");
}
+static ssize_t mounted_time_sec_show(struct f2fs_attr *a,
+ struct f2fs_sb_info *sbi, char *buf)
+{
+ return sprintf(buf, "%llu", SIT_I(sbi)->mounted_time);
+}
+
#ifdef CONFIG_F2FS_STAT_FS
static ssize_t moved_blocks_foreground_show(struct f2fs_attr *a,
struct f2fs_sb_info *sbi, char *buf)
@@ -233,16 +239,16 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a,
int hot_count = sbi->raw_super->hot_ext_count;
int len = 0, i;
- len += snprintf(buf + len, PAGE_SIZE - len,
+ len += scnprintf(buf + len, PAGE_SIZE - len,
"cold file extension:\n");
for (i = 0; i < cold_count; i++)
- len += snprintf(buf + len, PAGE_SIZE - len, "%s\n",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s\n",
extlist[i]);
- len += snprintf(buf + len, PAGE_SIZE - len,
+ len += scnprintf(buf + len, PAGE_SIZE - len,
"hot file extension:\n");
for (i = cold_count; i < cold_count + hot_count; i++)
- len += snprintf(buf + len, PAGE_SIZE - len, "%s\n",
+ len += scnprintf(buf + len, PAGE_SIZE - len, "%s\n",
extlist[i]);
return len;
}
@@ -544,6 +550,7 @@ F2FS_GENERAL_RO_ATTR(features);
F2FS_GENERAL_RO_ATTR(current_reserved_blocks);
F2FS_GENERAL_RO_ATTR(unusable);
F2FS_GENERAL_RO_ATTR(encoding);
+F2FS_GENERAL_RO_ATTR(mounted_time_sec);
#ifdef CONFIG_F2FS_STAT_FS
F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_foreground_calls, cp_count);
F2FS_STAT_ATTR(STAT_INFO, f2fs_stat_info, cp_background_calls, bg_cp_count);
@@ -573,7 +580,9 @@ F2FS_FEATURE_RO_ATTR(verity, FEAT_VERITY);
#endif
F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM);
F2FS_FEATURE_RO_ATTR(casefold, FEAT_CASEFOLD);
+#ifdef CONFIG_F2FS_FS_COMPRESSION
F2FS_FEATURE_RO_ATTR(compression, FEAT_COMPRESSION);
+#endif
#define ATTR_LIST(name) (&f2fs_attr_##name.attr)
static struct attribute *f2fs_attrs[] = {
@@ -621,6 +630,7 @@ static struct attribute *f2fs_attrs[] = {
ATTR_LIST(reserved_blocks),
ATTR_LIST(current_reserved_blocks),
ATTR_LIST(encoding),
+ ATTR_LIST(mounted_time_sec),
#ifdef CONFIG_F2FS_STAT_FS
ATTR_LIST(cp_foreground_calls),
ATTR_LIST(cp_background_calls),
@@ -654,7 +664,9 @@ static struct attribute *f2fs_feat_attrs[] = {
#endif
ATTR_LIST(sb_checksum),
ATTR_LIST(casefold),
+#ifdef CONFIG_F2FS_FS_COMPRESSION
ATTR_LIST(compression),
+#endif
NULL,
};
ATTRIBUTE_GROUPS(f2fs_feat);
diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c
index 296b3189448a..4f6582ef7ee3 100644
--- a/fs/f2fs/xattr.c
+++ b/fs/f2fs/xattr.c
@@ -23,6 +23,25 @@
#include "xattr.h"
#include "segment.h"
+static void *xattr_alloc(struct f2fs_sb_info *sbi, int size, bool *is_inline)
+{
+ if (likely(size == sbi->inline_xattr_slab_size)) {
+ *is_inline = true;
+ return kmem_cache_zalloc(sbi->inline_xattr_slab, GFP_NOFS);
+ }
+ *is_inline = false;
+ return f2fs_kzalloc(sbi, size, GFP_NOFS);
+}
+
+static void xattr_free(struct f2fs_sb_info *sbi, void *xattr_addr,
+ bool is_inline)
+{
+ if (is_inline)
+ kmem_cache_free(sbi->inline_xattr_slab, xattr_addr);
+ else
+ kvfree(xattr_addr);
+}
+
static int f2fs_xattr_generic_get(const struct xattr_handler *handler,
struct dentry *unused, struct inode *inode,
const char *name, void *buffer, size_t size)
@@ -301,7 +320,8 @@ static int read_xattr_block(struct inode *inode, void *txattr_addr)
static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
unsigned int index, unsigned int len,
const char *name, struct f2fs_xattr_entry **xe,
- void **base_addr, int *base_size)
+ void **base_addr, int *base_size,
+ bool *is_inline)
{
void *cur_addr, *txattr_addr, *last_txattr_addr;
void *last_addr = NULL;
@@ -312,12 +332,12 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage,
if (!xnid && !inline_size)
return -ENODATA;
- *base_size = XATTR_SIZE(xnid, inode) + XATTR_PADDING_SIZE;
- txattr_addr = f2fs_kzalloc(F2FS_I_SB(inode), *base_size, GFP_NOFS);
+ *base_size = XATTR_SIZE(inode) + XATTR_PADDING_SIZE;
+ txattr_addr = xattr_alloc(F2FS_I_SB(inode), *base_size, is_inline);
if (!txattr_addr)
return -ENOMEM;
- last_txattr_addr = (void *)txattr_addr + XATTR_SIZE(xnid, inode);
+ last_txattr_addr = (void *)txattr_addr + XATTR_SIZE(inode);
/* read from inline xattr */
if (inline_size) {
@@ -362,7 +382,7 @@ check:
*base_addr = txattr_addr;
return 0;
out:
- kvfree(txattr_addr);
+ xattr_free(F2FS_I_SB(inode), txattr_addr, *is_inline);
return err;
}
@@ -499,6 +519,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
unsigned int size, len;
void *base_addr = NULL;
int base_size;
+ bool is_inline;
if (name == NULL)
return -EINVAL;
@@ -509,7 +530,7 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
down_read(&F2FS_I(inode)->i_xattr_sem);
error = lookup_all_xattrs(inode, ipage, index, len, name,
- &entry, &base_addr, &base_size);
+ &entry, &base_addr, &base_size, &is_inline);
up_read(&F2FS_I(inode)->i_xattr_sem);
if (error)
return error;
@@ -532,14 +553,13 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name,
}
error = size;
out:
- kvfree(base_addr);
+ xattr_free(F2FS_I_SB(inode), base_addr, is_inline);
return error;
}
ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
{
struct inode *inode = d_inode(dentry);
- nid_t xnid = F2FS_I(inode)->i_xattr_nid;
struct f2fs_xattr_entry *entry;
void *base_addr, *last_base_addr;
int error = 0;
@@ -551,7 +571,7 @@ ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer, size_t buffer_size)
if (error)
return error;
- last_base_addr = (void *)base_addr + XATTR_SIZE(xnid, inode);
+ last_base_addr = (void *)base_addr + XATTR_SIZE(inode);
list_for_each_xattr(entry, base_addr) {
const struct xattr_handler *handler =
@@ -609,7 +629,6 @@ static int __f2fs_setxattr(struct inode *inode, int index,
{
struct f2fs_xattr_entry *here, *last;
void *base_addr, *last_base_addr;
- nid_t xnid = F2FS_I(inode)->i_xattr_nid;
int found, newsize;
size_t len;
__u32 new_hsize;
@@ -633,7 +652,7 @@ static int __f2fs_setxattr(struct inode *inode, int index,
if (error)
return error;
- last_base_addr = (void *)base_addr + XATTR_SIZE(xnid, inode);
+ last_base_addr = (void *)base_addr + XATTR_SIZE(inode);
/* find entry with wanted name. */
here = __find_xattr(base_addr, last_base_addr, index, len, name);
@@ -758,14 +777,34 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name,
f2fs_balance_fs(sbi, true);
f2fs_lock_op(sbi);
- /* protect xattr_ver */
- down_write(&F2FS_I(inode)->i_sem);
down_write(&F2FS_I(inode)->i_xattr_sem);
err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags);
up_write(&F2FS_I(inode)->i_xattr_sem);
- up_write(&F2FS_I(inode)->i_sem);
f2fs_unlock_op(sbi);
f2fs_update_time(sbi, REQ_TIME);
return err;
}
+
+int f2fs_init_xattr_caches(struct f2fs_sb_info *sbi)
+{
+ dev_t dev = sbi->sb->s_bdev->bd_dev;
+ char slab_name[32];
+
+ sprintf(slab_name, "f2fs_xattr_entry-%u:%u", MAJOR(dev), MINOR(dev));
+
+ sbi->inline_xattr_slab_size = F2FS_OPTION(sbi).inline_xattr_size *
+ sizeof(__le32) + XATTR_PADDING_SIZE;
+
+ sbi->inline_xattr_slab = f2fs_kmem_cache_create(slab_name,
+ sbi->inline_xattr_slab_size);
+ if (!sbi->inline_xattr_slab)
+ return -ENOMEM;
+
+ return 0;
+}
+
+void f2fs_destroy_xattr_caches(struct f2fs_sb_info *sbi)
+{
+ kmem_cache_destroy(sbi->inline_xattr_slab);
+}
diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h
index de0c600b9cab..938fcd20565d 100644
--- a/fs/f2fs/xattr.h
+++ b/fs/f2fs/xattr.h
@@ -49,7 +49,7 @@ struct f2fs_xattr_entry {
__u8 e_name_index;
__u8 e_name_len;
__le16 e_value_size; /* size of attribute value */
- char e_name[0]; /* attribute name */
+ char e_name[]; /* attribute name */
};
#define XATTR_HDR(ptr) ((struct f2fs_xattr_header *)(ptr))
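char e_name[0] is the old GCC zero-length-array idiom; char e_name[] is the C99 flexible array member, which lets the compiler and bounds-checking tooling see that the structure is followed by a variable-length payload. Allocations for such structures are typically sized with struct_size(). A sketch with a made-up entry type:

#include <linux/overflow.h>
#include <linux/slab.h>
#include <linux/string.h>

struct name_entry {
	unsigned char	name_len;
	char		name[];		/* flexible array member */
};

static struct name_entry *alloc_entry(const char *src, unsigned char len)
{
	/* struct_size() checks the header + payload sizing for overflow */
	struct name_entry *e = kmalloc(struct_size(e, name, len), GFP_KERNEL);

	if (!e)
		return NULL;
	e->name_len = len;
	memcpy(e->name, src, len);
	return e;
}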
@@ -73,7 +73,8 @@ struct f2fs_xattr_entry {
entry = XATTR_NEXT_ENTRY(entry))
#define VALID_XATTR_BLOCK_SIZE (PAGE_SIZE - sizeof(struct node_footer))
#define XATTR_PADDING_SIZE (sizeof(__u32))
-#define XATTR_SIZE(x,i) (((x) ? VALID_XATTR_BLOCK_SIZE : 0) + \
+#define XATTR_SIZE(i) ((F2FS_I(i)->i_xattr_nid ? \
+ VALID_XATTR_BLOCK_SIZE : 0) + \
(inline_xattr_size(i)))
#define MIN_OFFSET(i) XATTR_ALIGN(inline_xattr_size(i) + \
VALID_XATTR_BLOCK_SIZE)
@@ -130,6 +131,8 @@ extern int f2fs_setxattr(struct inode *, int, const char *,
extern int f2fs_getxattr(struct inode *, int, const char *, void *,
size_t, struct page *);
extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t);
+extern int f2fs_init_xattr_caches(struct f2fs_sb_info *);
+extern void f2fs_destroy_xattr_caches(struct f2fs_sb_info *);
#else
#define f2fs_xattr_handlers NULL
@@ -150,6 +153,8 @@ static inline ssize_t f2fs_listxattr(struct dentry *dentry, char *buffer,
{
return -EOPNOTSUPP;
}
+static inline int f2fs_init_xattr_caches(struct f2fs_sb_info *sbi) { return 0; }
+static inline void f2fs_destroy_xattr_caches(struct f2fs_sb_info *sbi) { }
#endif
#ifdef CONFIG_F2FS_FS_SECURITY
diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c
index e6b8c49076bb..c070c0d8e3e9 100644
--- a/fs/hostfs/hostfs_kern.c
+++ b/fs/hostfs/hostfs_kern.c
@@ -139,8 +139,8 @@ static char *inode_name(struct inode *ino)
static char *follow_link(char *link)
{
- int len, n;
char *name, *resolved, *end;
+ int n;
name = __getname();
if (!name) {
@@ -164,15 +164,13 @@ static char *follow_link(char *link)
return name;
*(end + 1) = '\0';
- len = strlen(link) + strlen(name) + 1;
- resolved = kmalloc(len, GFP_KERNEL);
+ resolved = kasprintf(GFP_KERNEL, "%s%s", link, name);
if (resolved == NULL) {
n = -ENOMEM;
goto out_free;
}
- sprintf(resolved, "%s%s", link, name);
__putname(name);
kfree(link);
return resolved;
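kasprintf() measures, allocates, and formats in a single call, which removes the manual strlen()+kmalloc()+sprintf() sequence and its length arithmetic. A minimal sketch of the replacement pattern; the caller still frees the result with kfree():

#include <linux/slab.h>
#include <linux/string.h>

static char *join_path(const char *dir, const char *name)
{
	/* one allocation, sized exactly for "dir/name" plus the NUL */
	return kasprintf(GFP_KERNEL, "%s/%s", dir, name);
}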
@@ -921,18 +919,16 @@ static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
sb->s_d_op = &simple_dentry_operations;
sb->s_maxbytes = MAX_LFS_FILESIZE;
- /* NULL is printed as <NULL> by sprintf: avoid that. */
+ /* NULL is printed as '(null)' by printf(): avoid that. */
if (req_root == NULL)
req_root = "";
err = -ENOMEM;
sb->s_fs_info = host_root_path =
- kmalloc(strlen(root_ino) + strlen(req_root) + 2, GFP_KERNEL);
+ kasprintf(GFP_KERNEL, "%s/%s", root_ino, req_root);
if (host_root_path == NULL)
goto out;
- sprintf(host_root_path, "%s/%s", root_ino, req_root);
-
root_inode = new_inode(sb);
if (!root_inode)
goto out;
diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c
index 690221747b47..d1a0e2c8b1b4 100644
--- a/fs/nfs/blocklayout/blocklayout.c
+++ b/fs/nfs/blocklayout/blocklayout.c
@@ -476,7 +476,7 @@ static void bl_free_layout_hdr(struct pnfs_layout_hdr *lo)
err = ext_tree_remove(bl, true, 0, LLONG_MAX);
WARN_ON(err);
- kfree(bl);
+ kfree_rcu(bl, bl_layout.plh_rcu);
}
static struct pnfs_layout_hdr *__bl_alloc_layout_hdr(struct inode *inode,
diff --git a/fs/nfs/callback.h b/fs/nfs/callback.h
index 549350259840..6a2033131c06 100644
--- a/fs/nfs/callback.h
+++ b/fs/nfs/callback.h
@@ -127,7 +127,9 @@ extern __be32 nfs4_callback_sequence(void *argp, void *resp,
#define RCA4_TYPE_MASK_OBJ_LAYOUT_MAX 9
#define RCA4_TYPE_MASK_OTHER_LAYOUT_MIN 12
#define RCA4_TYPE_MASK_OTHER_LAYOUT_MAX 15
-#define RCA4_TYPE_MASK_ALL 0xf31f
+#define PNFS_FF_RCA4_TYPE_MASK_READ 16
+#define PNFS_FF_RCA4_TYPE_MASK_RW 17
+#define RCA4_TYPE_MASK_ALL 0x3f31f
struct cb_recallanyargs {
uint32_t craa_objs_to_keep;
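The two new flexible-files recall types sit at bits 16 and 17, so the accept-everything mask grows from 0xf31f by BIT(16) | BIT(17) = 0x30000, giving 0x3f31f. Spelled out with the kernel's BIT() helper:

#include <linux/bits.h>

#define PNFS_FF_RCA4_TYPE_MASK_READ	16
#define PNFS_FF_RCA4_TYPE_MASK_RW	17

/* 0xf31f | 0x10000 | 0x20000 == 0x3f31f */
#define RCA4_TYPE_MASK_ALL \
	(0xf31f | BIT(PNFS_FF_RCA4_TYPE_MASK_READ) | \
		  BIT(PNFS_FF_RCA4_TYPE_MASK_RW))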
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index cd4c6bc81cae..e61dbc9b86ae 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -121,31 +121,31 @@ out:
*/
static struct inode *nfs_layout_find_inode_by_stateid(struct nfs_client *clp,
const nfs4_stateid *stateid)
+ __must_hold(RCU)
{
struct nfs_server *server;
struct inode *inode;
struct pnfs_layout_hdr *lo;
+ rcu_read_lock();
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
- list_for_each_entry(lo, &server->layouts, plh_layouts) {
+ list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) {
+ if (!pnfs_layout_is_valid(lo))
+ continue;
if (stateid != NULL &&
!nfs4_stateid_match_other(stateid, &lo->plh_stateid))
continue;
+ if (!nfs_sb_active(server->super))
+ continue;
inode = igrab(lo->plh_inode);
- if (!inode)
- return ERR_PTR(-EAGAIN);
- if (!nfs_sb_active(inode->i_sb)) {
- rcu_read_unlock();
- spin_unlock(&clp->cl_lock);
- iput(inode);
- spin_lock(&clp->cl_lock);
- rcu_read_lock();
- return ERR_PTR(-EAGAIN);
- }
- return inode;
+ rcu_read_unlock();
+ if (inode)
+ return inode;
+ nfs_sb_deactive(server->super);
+ return ERR_PTR(-EAGAIN);
}
}
-
+ rcu_read_unlock();
return ERR_PTR(-ENOENT);
}
@@ -163,28 +163,25 @@ static struct inode *nfs_layout_find_inode_by_fh(struct nfs_client *clp,
struct inode *inode;
struct pnfs_layout_hdr *lo;
+ rcu_read_lock();
list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
- list_for_each_entry(lo, &server->layouts, plh_layouts) {
+ list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) {
nfsi = NFS_I(lo->plh_inode);
if (nfs_compare_fh(fh, &nfsi->fh))
continue;
if (nfsi->layout != lo)
continue;
+ if (!nfs_sb_active(server->super))
+ continue;
inode = igrab(lo->plh_inode);
- if (!inode)
- return ERR_PTR(-EAGAIN);
- if (!nfs_sb_active(inode->i_sb)) {
- rcu_read_unlock();
- spin_unlock(&clp->cl_lock);
- iput(inode);
- spin_lock(&clp->cl_lock);
- rcu_read_lock();
- return ERR_PTR(-EAGAIN);
- }
- return inode;
+ rcu_read_unlock();
+ if (inode)
+ return inode;
+ nfs_sb_deactive(server->super);
+ return ERR_PTR(-EAGAIN);
}
}
-
+ rcu_read_unlock();
return ERR_PTR(-ENOENT);
}
@@ -194,14 +191,9 @@ static struct inode *nfs_layout_find_inode(struct nfs_client *clp,
{
struct inode *inode;
- spin_lock(&clp->cl_lock);
- rcu_read_lock();
inode = nfs_layout_find_inode_by_stateid(clp, stateid);
if (inode == ERR_PTR(-ENOENT))
inode = nfs_layout_find_inode_by_fh(clp, fh);
- rcu_read_unlock();
- spin_unlock(&clp->cl_lock);
-
return inode;
}
@@ -280,7 +272,7 @@ static u32 initiate_file_draining(struct nfs_client *clp,
goto unlock;
}
- pnfs_set_layout_stateid(lo, &args->cbl_stateid, true);
+ pnfs_set_layout_stateid(lo, &args->cbl_stateid, NULL, true);
switch (pnfs_mark_matching_lsegs_return(lo, &free_me_list,
&args->cbl_range,
be32_to_cpu(args->cbl_stateid.seqid))) {
@@ -605,6 +597,7 @@ __be32 nfs4_callback_recallany(void *argp, void *resp,
struct cb_recallanyargs *args = argp;
__be32 status;
fmode_t flags = 0;
+ bool schedule_manager = false;
status = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
if (!cps->clp) /* set in cb_sequence */
@@ -627,6 +620,18 @@ __be32 nfs4_callback_recallany(void *argp, void *resp,
if (args->craa_type_mask & BIT(RCA4_TYPE_MASK_FILE_LAYOUT))
pnfs_recall_all_layouts(cps->clp);
+
+ if (args->craa_type_mask & BIT(PNFS_FF_RCA4_TYPE_MASK_READ)) {
+ set_bit(NFS4CLNT_RECALL_ANY_LAYOUT_READ, &cps->clp->cl_state);
+ schedule_manager = true;
+ }
+ if (args->craa_type_mask & BIT(PNFS_FF_RCA4_TYPE_MASK_RW)) {
+ set_bit(NFS4CLNT_RECALL_ANY_LAYOUT_RW, &cps->clp->cl_state);
+ schedule_manager = true;
+ }
+ if (schedule_manager)
+ nfs4_schedule_state_manager(cps->clp);
+
out:
dprintk("%s: exit with status = %d\n", __func__, ntohl(status));
return status;
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 1865322de142..816e1427f17e 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -378,6 +378,18 @@ nfs_inode_detach_delegation(struct inode *inode)
}
static void
+nfs_update_delegation_cred(struct nfs_delegation *delegation,
+ const struct cred *cred)
+{
+ const struct cred *old;
+
+ if (cred_fscmp(delegation->cred, cred) != 0) {
+ old = xchg(&delegation->cred, get_cred(cred));
+ put_cred(old);
+ }
+}
+
+static void
nfs_update_inplace_delegation(struct nfs_delegation *delegation,
const struct nfs_delegation *update)
{
@@ -385,8 +397,14 @@ nfs_update_inplace_delegation(struct nfs_delegation *delegation,
delegation->stateid.seqid = update->stateid.seqid;
smp_wmb();
delegation->type = update->type;
- if (test_and_clear_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
+ delegation->pagemod_limit = update->pagemod_limit;
+ if (test_bit(NFS_DELEGATION_REVOKED, &delegation->flags)) {
+ delegation->change_attr = update->change_attr;
+ nfs_update_delegation_cred(delegation, update->cred);
+ /* smp_mb__before_atomic() is implicit due to xchg() */
+ clear_bit(NFS_DELEGATION_REVOKED, &delegation->flags);
atomic_long_inc(&nfs_active_delegations);
+ }
}
}
@@ -545,21 +563,11 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation)
return ret;
}
-/**
- * nfs_client_return_marked_delegations - return previously marked delegations
- * @clp: nfs_client to process
- *
- * Note that this function is designed to be called by the state
- * manager thread. For this reason, it cannot flush the dirty data,
- * since that could deadlock in case of a state recovery error.
- *
- * Returns zero on success, or a negative errno value.
- */
-int nfs_client_return_marked_delegations(struct nfs_client *clp)
+static int nfs_server_return_marked_delegations(struct nfs_server *server,
+ void __always_unused *data)
{
struct nfs_delegation *delegation;
struct nfs_delegation *prev;
- struct nfs_server *server;
struct inode *inode;
struct inode *place_holder = NULL;
struct nfs_delegation *place_holder_deleg = NULL;
@@ -569,78 +577,79 @@ restart:
/*
* To avoid quadratic looping we hold a reference
* to an inode place_holder. Each time we restart, we
- * list nfs_servers from the server of that inode, and
- * delegation in the server from the delegations of that
- * inode.
+ * walk this server's delegation list starting from the
+ * delegation of that inode.
* prev is an RCU-protected pointer to a delegation which
* wasn't marked for return and might be a good choice for
* the next place_holder.
*/
- rcu_read_lock();
prev = NULL;
+ delegation = NULL;
+ rcu_read_lock();
if (place_holder)
- server = NFS_SERVER(place_holder);
- else
- server = list_entry_rcu(clp->cl_superblocks.next,
- struct nfs_server, client_link);
- list_for_each_entry_from_rcu(server, &clp->cl_superblocks, client_link) {
- delegation = NULL;
- if (place_holder && server == NFS_SERVER(place_holder))
- delegation = rcu_dereference(NFS_I(place_holder)->delegation);
- if (!delegation || delegation != place_holder_deleg)
- delegation = list_entry_rcu(server->delegations.next,
- struct nfs_delegation, super_list);
- list_for_each_entry_from_rcu(delegation, &server->delegations, super_list) {
- struct inode *to_put = NULL;
-
- if (!nfs_delegation_need_return(delegation)) {
+ delegation = rcu_dereference(NFS_I(place_holder)->delegation);
+ if (!delegation || delegation != place_holder_deleg)
+ delegation = list_entry_rcu(server->delegations.next,
+ struct nfs_delegation, super_list);
+ list_for_each_entry_from_rcu(delegation, &server->delegations, super_list) {
+ struct inode *to_put = NULL;
+
+ if (test_bit(NFS_DELEGATION_INODE_FREEING, &delegation->flags))
+ continue;
+ if (!nfs_delegation_need_return(delegation)) {
+ if (nfs4_is_valid_delegation(delegation, 0))
prev = delegation;
- continue;
- }
- if (!nfs_sb_active(server->super))
- break; /* continue in outer loop */
-
- if (prev) {
- struct inode *tmp;
-
- tmp = nfs_delegation_grab_inode(prev);
- if (tmp) {
- to_put = place_holder;
- place_holder = tmp;
- place_holder_deleg = prev;
- }
- }
+ continue;
+ }
- inode = nfs_delegation_grab_inode(delegation);
- if (inode == NULL) {
- rcu_read_unlock();
- if (to_put)
- iput(to_put);
- nfs_sb_deactive(server->super);
- goto restart;
+ if (prev) {
+ struct inode *tmp = nfs_delegation_grab_inode(prev);
+ if (tmp) {
+ to_put = place_holder;
+ place_holder = tmp;
+ place_holder_deleg = prev;
}
- delegation = nfs_start_delegation_return_locked(NFS_I(inode));
+ }
+
+ inode = nfs_delegation_grab_inode(delegation);
+ if (inode == NULL) {
rcu_read_unlock();
+ iput(to_put);
+ goto restart;
+ }
+ delegation = nfs_start_delegation_return_locked(NFS_I(inode));
+ rcu_read_unlock();
- if (to_put)
- iput(to_put);
+ iput(to_put);
- err = nfs_end_delegation_return(inode, delegation, 0);
- iput(inode);
- nfs_sb_deactive(server->super);
- cond_resched();
- if (!err)
- goto restart;
- set_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state);
- if (place_holder)
- iput(place_holder);
- return err;
- }
+ err = nfs_end_delegation_return(inode, delegation, 0);
+ iput(inode);
+ cond_resched();
+ if (!err)
+ goto restart;
+ set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
+ goto out;
}
rcu_read_unlock();
- if (place_holder)
- iput(place_holder);
- return 0;
+out:
+ iput(place_holder);
+ return err;
+}
+
+/**
+ * nfs_client_return_marked_delegations - return previously marked delegations
+ * @clp: nfs_client to process
+ *
+ * Note that this function is designed to be called by the state
+ * manager thread. For this reason, it cannot flush the dirty data,
+ * since that could deadlock in case of a state recovery error.
+ *
+ * Returns zero on success, or a negative errno value.
+ */
+int nfs_client_return_marked_delegations(struct nfs_client *clp)
+{
+ return nfs_client_for_each_server(clp,
+ nfs_server_return_marked_delegations, NULL);
}
/**
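The delegation walkers are recast as per-server callbacks driven by nfs_client_for_each_server(). From the call sites in this patch, the helper takes the nfs_client, a function of (struct nfs_server *, void *) returning int, and an opaque data pointer, and it appears to stop iterating once a callback returns non-zero (which is how the errno from nfs_server_return_marked_delegations() propagates). A hedged sketch of a callback written against that convention; the counting helper is invented for illustration:

/* prototype assumed from the call sites in this patch */
int nfs_client_for_each_server(struct nfs_client *clp,
			       int (*fn)(struct nfs_server *, void *),
			       void *data);

static int count_server_delegations(struct nfs_server *server, void *data)
{
	struct nfs_delegation *d;
	unsigned long *count = data;	/* caller-supplied accumulator */

	rcu_read_lock();
	list_for_each_entry_rcu(d, &server->delegations, super_list)
		(*count)++;
	rcu_read_unlock();
	return 0;			/* zero: keep iterating */
}

static unsigned long count_client_delegations(struct nfs_client *clp)
{
	unsigned long count = 0;

	nfs_client_for_each_server(clp, count_server_delegations, &count);
	return count;
}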
@@ -1083,53 +1092,51 @@ void nfs_delegation_mark_reclaim(struct nfs_client *clp)
rcu_read_unlock();
}
-/**
- * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done
- * @clp: nfs_client to process
- *
- */
-void nfs_delegation_reap_unclaimed(struct nfs_client *clp)
+static int nfs_server_reap_unclaimed_delegations(struct nfs_server *server,
+ void __always_unused *data)
{
struct nfs_delegation *delegation;
- struct nfs_server *server;
struct inode *inode;
-
restart:
rcu_read_lock();
- list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
- list_for_each_entry_rcu(delegation, &server->delegations,
- super_list) {
- if (test_bit(NFS_DELEGATION_INODE_FREEING,
- &delegation->flags) ||
- test_bit(NFS_DELEGATION_RETURNING,
- &delegation->flags) ||
- test_bit(NFS_DELEGATION_NEED_RECLAIM,
- &delegation->flags) == 0)
- continue;
- if (!nfs_sb_active(server->super))
- break; /* continue in outer loop */
- inode = nfs_delegation_grab_inode(delegation);
- if (inode == NULL) {
- rcu_read_unlock();
- nfs_sb_deactive(server->super);
- goto restart;
- }
- delegation = nfs_start_delegation_return_locked(NFS_I(inode));
- rcu_read_unlock();
- if (delegation != NULL) {
- if (nfs_detach_delegation(NFS_I(inode), delegation,
- server) != NULL)
- nfs_free_delegation(delegation);
- /* Match nfs_start_delegation_return_locked */
- nfs_put_delegation(delegation);
- }
- iput(inode);
- nfs_sb_deactive(server->super);
- cond_resched();
- goto restart;
+restart_locked:
+ list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+ if (test_bit(NFS_DELEGATION_INODE_FREEING,
+ &delegation->flags) ||
+ test_bit(NFS_DELEGATION_RETURNING,
+ &delegation->flags) ||
+ test_bit(NFS_DELEGATION_NEED_RECLAIM,
+ &delegation->flags) == 0)
+ continue;
+ inode = nfs_delegation_grab_inode(delegation);
+ if (inode == NULL)
+ goto restart_locked;
+ delegation = nfs_start_delegation_return_locked(NFS_I(inode));
+ rcu_read_unlock();
+ if (delegation != NULL) {
+ if (nfs_detach_delegation(NFS_I(inode), delegation,
+ server) != NULL)
+ nfs_free_delegation(delegation);
+ /* Match nfs_start_delegation_return_locked */
+ nfs_put_delegation(delegation);
}
+ iput(inode);
+ cond_resched();
+ goto restart;
}
rcu_read_unlock();
+ return 0;
+}
+
+/**
+ * nfs_delegation_reap_unclaimed - reap unclaimed delegations after reboot recovery is done
+ * @clp: nfs_client to process
+ *
+ */
+void nfs_delegation_reap_unclaimed(struct nfs_client *clp)
+{
+ nfs_client_for_each_server(clp, nfs_server_reap_unclaimed_delegations,
+ NULL);
}
static inline bool nfs4_server_rebooted(const struct nfs_client *clp)
@@ -1215,62 +1222,61 @@ nfs_delegation_test_free_expired(struct inode *inode,
nfs_remove_bad_delegation(inode, stateid);
}
-/**
- * nfs_reap_expired_delegations - reap expired delegations
- * @clp: nfs_client to process
- *
- * Iterates through all the delegations associated with this server and
- * checks if they have may have been revoked. This function is usually
- * expected to be called in cases where the server may have lost its
- * lease.
- */
-void nfs_reap_expired_delegations(struct nfs_client *clp)
+static int nfs_server_reap_expired_delegations(struct nfs_server *server,
+ void __always_unused *data)
{
struct nfs_delegation *delegation;
- struct nfs_server *server;
struct inode *inode;
const struct cred *cred;
nfs4_stateid stateid;
-
restart:
rcu_read_lock();
- list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) {
- list_for_each_entry_rcu(delegation, &server->delegations,
- super_list) {
- if (test_bit(NFS_DELEGATION_INODE_FREEING,
- &delegation->flags) ||
- test_bit(NFS_DELEGATION_RETURNING,
- &delegation->flags) ||
- test_bit(NFS_DELEGATION_TEST_EXPIRED,
- &delegation->flags) == 0)
- continue;
- if (!nfs_sb_active(server->super))
- break; /* continue in outer loop */
- inode = nfs_delegation_grab_inode(delegation);
- if (inode == NULL) {
- rcu_read_unlock();
- nfs_sb_deactive(server->super);
- goto restart;
- }
- cred = get_cred_rcu(delegation->cred);
- nfs4_stateid_copy(&stateid, &delegation->stateid);
- clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags);
- rcu_read_unlock();
- nfs_delegation_test_free_expired(inode, &stateid, cred);
- put_cred(cred);
- if (nfs4_server_rebooted(clp)) {
- nfs_inode_mark_test_expired_delegation(server,inode);
- iput(inode);
- nfs_sb_deactive(server->super);
- return;
- }
+restart_locked:
+ list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
+ if (test_bit(NFS_DELEGATION_INODE_FREEING,
+ &delegation->flags) ||
+ test_bit(NFS_DELEGATION_RETURNING,
+ &delegation->flags) ||
+ test_bit(NFS_DELEGATION_TEST_EXPIRED,
+ &delegation->flags) == 0)
+ continue;
+ inode = nfs_delegation_grab_inode(delegation);
+ if (inode == NULL)
+ goto restart_locked;
+ spin_lock(&delegation->lock);
+ cred = get_cred_rcu(delegation->cred);
+ nfs4_stateid_copy(&stateid, &delegation->stateid);
+ spin_unlock(&delegation->lock);
+ clear_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags);
+ rcu_read_unlock();
+ nfs_delegation_test_free_expired(inode, &stateid, cred);
+ put_cred(cred);
+ if (!nfs4_server_rebooted(server->nfs_client)) {
iput(inode);
- nfs_sb_deactive(server->super);
cond_resched();
goto restart;
}
+ nfs_inode_mark_test_expired_delegation(server,inode);
+ iput(inode);
+ return -EAGAIN;
}
rcu_read_unlock();
+ return 0;
+}
+
+/**
+ * nfs_reap_expired_delegations - reap expired delegations
+ * @clp: nfs_client to process
+ *
+ * Iterates through all the delegations associated with this server and
+ * checks whether they may have been revoked. This function is usually
+ * expected to be called in cases where the server may have lost its
+ * lease.
+ */
+void nfs_reap_expired_delegations(struct nfs_client *clp)
+{
+ nfs_client_for_each_server(clp, nfs_server_reap_expired_delegations,
+ NULL);
}
void nfs_inode_find_delegation_state_and_recover(struct inode *inode,
@@ -1359,11 +1365,14 @@ bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags,
{
struct nfs_inode *nfsi = NFS_I(inode);
struct nfs_delegation *delegation;
- bool ret;
+ bool ret = false;
flags &= FMODE_READ|FMODE_WRITE;
rcu_read_lock();
delegation = rcu_dereference(nfsi->delegation);
+ if (!delegation)
+ goto out;
+ spin_lock(&delegation->lock);
ret = nfs4_is_valid_delegation(delegation, flags);
if (ret) {
nfs4_stateid_copy(dst, &delegation->stateid);
@@ -1371,6 +1380,8 @@ bool nfs4_copy_delegation_stateid(struct inode *inode, fmode_t flags,
if (cred)
*cred = get_cred(delegation->cred);
}
+ spin_unlock(&delegation->lock);
+out:
rcu_read_unlock();
return ret;
}
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index d4b839b6cf89..5a331da5f55a 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -141,10 +141,9 @@ struct nfs_cache_array {
int size;
int eof_index;
u64 last_cookie;
- struct nfs_cache_array_entry array[0];
+ struct nfs_cache_array_entry array[];
};
-typedef int (*decode_dirent_t)(struct xdr_stream *, struct nfs_entry *, bool);
typedef struct {
struct file *file;
struct page *page;
@@ -153,7 +152,7 @@ typedef struct {
u64 *dir_cookie;
u64 last_cookie;
loff_t current_index;
- decode_dirent_t decode;
+ loff_t prev_index;
unsigned long dir_verifier;
unsigned long timestamp;
@@ -240,6 +239,25 @@ out:
return ret;
}
+static inline
+int is_32bit_api(void)
+{
+#ifdef CONFIG_COMPAT
+ return in_compat_syscall();
+#else
+ return (BITS_PER_LONG == 32);
+#endif
+}
+
+static
+bool nfs_readdir_use_cookie(const struct file *filp)
+{
+ if ((filp->f_mode & FMODE_32BITHASH) ||
+ (!(filp->f_mode & FMODE_64BITHASH) && is_32bit_api()))
+ return false;
+ return true;
+}
+
static
int nfs_readdir_search_for_pos(struct nfs_cache_array *array, nfs_readdir_descriptor_t *desc)
{
@@ -289,7 +307,7 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
!nfs_readdir_inode_mapping_valid(nfsi)) {
ctx->duped = 0;
ctx->attr_gencount = nfsi->attr_gencount;
- } else if (new_pos < desc->ctx->pos) {
+ } else if (new_pos < desc->prev_index) {
if (ctx->duped > 0
&& ctx->dup_cookie == *desc->dir_cookie) {
if (printk_ratelimit()) {
@@ -305,7 +323,11 @@ int nfs_readdir_search_for_cookie(struct nfs_cache_array *array, nfs_readdir_des
ctx->dup_cookie = *desc->dir_cookie;
ctx->duped = -1;
}
- desc->ctx->pos = new_pos;
+ if (nfs_readdir_use_cookie(desc->file))
+ desc->ctx->pos = *desc->dir_cookie;
+ else
+ desc->ctx->pos = new_pos;
+ desc->prev_index = new_pos;
desc->cache_entry_index = i;
return 0;
}
@@ -376,9 +398,10 @@ error:
static int xdr_decode(nfs_readdir_descriptor_t *desc,
struct nfs_entry *entry, struct xdr_stream *xdr)
{
+ struct inode *inode = file_inode(desc->file);
int error;
- error = desc->decode(xdr, entry, desc->plus);
+ error = NFS_PROTO(inode)->decode_dirent(xdr, entry, desc->plus);
if (error)
return error;
entry->fattr->time_start = desc->timestamp;
@@ -756,6 +779,7 @@ int readdir_search_pagecache(nfs_readdir_descriptor_t *desc)
if (desc->page_index == 0) {
desc->current_index = 0;
+ desc->prev_index = 0;
desc->last_cookie = 0;
}
do {
@@ -786,11 +810,14 @@ int nfs_do_filldir(nfs_readdir_descriptor_t *desc)
desc->eof = true;
break;
}
- desc->ctx->pos++;
if (i < (array->size-1))
*desc->dir_cookie = array->array[i+1].cookie;
else
*desc->dir_cookie = array->last_cookie;
+ if (nfs_readdir_use_cookie(file))
+ desc->ctx->pos = *desc->dir_cookie;
+ else
+ desc->ctx->pos++;
if (ctx->duped != 0)
ctx->duped = 1;
}
@@ -860,9 +887,14 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
{
struct dentry *dentry = file_dentry(file);
struct inode *inode = d_inode(dentry);
- nfs_readdir_descriptor_t my_desc,
- *desc = &my_desc;
struct nfs_open_dir_context *dir_ctx = file->private_data;
+ nfs_readdir_descriptor_t my_desc = {
+ .file = file,
+ .ctx = ctx,
+ .dir_cookie = &dir_ctx->dir_cookie,
+ .plus = nfs_use_readdirplus(inode, ctx),
+ },
+ *desc = &my_desc;
int res = 0;
dfprintk(FILE, "NFS: readdir(%pD2) starting at cookie %llu\n",
@@ -875,14 +907,6 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx)
* to either find the entry with the appropriate number or
* revalidate the cookie.
*/
- memset(desc, 0, sizeof(*desc));
-
- desc->file = file;
- desc->ctx = ctx;
- desc->dir_cookie = &dir_ctx->dir_cookie;
- desc->decode = NFS_PROTO(inode)->decode_dirent;
- desc->plus = nfs_use_readdirplus(inode, ctx);
-
if (ctx->pos == 0 || nfs_attribute_cache_expired(inode))
res = nfs_revalidate_mapping(inode, file->f_mapping);
if (res < 0)
@@ -954,7 +978,10 @@ static loff_t nfs_llseek_dir(struct file *filp, loff_t offset, int whence)
}
if (offset != filp->f_pos) {
filp->f_pos = offset;
- dir_ctx->dir_cookie = 0;
+ if (nfs_readdir_use_cookie(filp))
+ dir_ctx->dir_cookie = offset;
+ else
+ dir_ctx->dir_cookie = 0;
dir_ctx->duped = 0;
}
inode_unlock(inode);
@@ -2282,7 +2309,7 @@ static DEFINE_SPINLOCK(nfs_access_lru_lock);
static LIST_HEAD(nfs_access_lru_list);
static atomic_long_t nfs_access_nr_entries;
-static unsigned long nfs_access_max_cachesize = ULONG_MAX;
+static unsigned long nfs_access_max_cachesize = 4*1024*1024;
module_param(nfs_access_max_cachesize, ulong, 0644);
MODULE_PARM_DESC(nfs_access_max_cachesize, "NFS access maximum total cache length");
@@ -2642,9 +2669,10 @@ static int nfs_do_access(struct inode *inode, const struct cred *cred, int mask)
status = NFS_PROTO(inode)->access(inode, &cache);
if (status != 0) {
if (status == -ESTALE) {
- nfs_zap_caches(inode);
if (!S_ISDIR(inode->i_mode))
- set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
+ nfs_set_inode_stale(inode);
+ else
+ nfs_zap_caches(inode);
}
goto out;
}
@@ -2732,14 +2760,7 @@ force_lookup:
if (!NFS_PROTO(inode)->access)
goto out_notsup;
- /* Always try fast lookups first */
- rcu_read_lock();
- res = nfs_do_access(inode, cred, mask|MAY_NOT_BLOCK);
- rcu_read_unlock();
- if (res == -ECHILD && !(mask & MAY_NOT_BLOCK)) {
- /* Fast lookup failed, try the slow way */
- res = nfs_do_access(inode, cred, mask);
- }
+ res = nfs_do_access(inode, cred, mask);
out:
if (!res && (mask & MAY_EXEC))
res = nfs_execute_ok(inode, mask);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index b768a0b42e82..a57e7c72c7f4 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -94,7 +94,7 @@ struct nfs_direct_req {
#define NFS_ODIRECT_RESCHED_WRITES (2) /* write verification failed */
/* for read */
#define NFS_ODIRECT_SHOULD_DIRTY (3) /* dirty user-space page after read */
- struct nfs_writeverf verf; /* unstable write verifier */
+#define NFS_ODIRECT_DONE INT_MAX /* request complete; do not reschedule */
};
static const struct nfs_pgio_completion_ops nfs_direct_write_completion_ops;
@@ -151,106 +151,6 @@ nfs_direct_count_bytes(struct nfs_direct_req *dreq,
dreq->count = dreq_len;
}
-/*
- * nfs_direct_select_verf - select the right verifier
- * @dreq - direct request possibly spanning multiple servers
- * @ds_clp - nfs_client of data server or NULL if MDS / non-pnfs
- * @commit_idx - commit bucket index for the DS
- *
- * returns the correct verifier to use given the role of the server
- */
-static struct nfs_writeverf *
-nfs_direct_select_verf(struct nfs_direct_req *dreq,
- struct nfs_client *ds_clp,
- int commit_idx)
-{
- struct nfs_writeverf *verfp = &dreq->verf;
-
-#ifdef CONFIG_NFS_V4_1
- /*
- * pNFS is in use, use the DS verf except commit_through_mds is set
- * for layout segment where nbuckets is zero.
- */
- if (ds_clp && dreq->ds_cinfo.nbuckets > 0) {
- if (commit_idx >= 0 && commit_idx < dreq->ds_cinfo.nbuckets)
- verfp = &dreq->ds_cinfo.buckets[commit_idx].direct_verf;
- else
- WARN_ON_ONCE(1);
- }
-#endif
- return verfp;
-}
-
-
-/*
- * nfs_direct_set_hdr_verf - set the write/commit verifier
- * @dreq - direct request possibly spanning multiple servers
- * @hdr - pageio header to validate against previously seen verfs
- *
- * Set the server's (MDS or DS) "seen" verifier
- */
-static void nfs_direct_set_hdr_verf(struct nfs_direct_req *dreq,
- struct nfs_pgio_header *hdr)
-{
- struct nfs_writeverf *verfp;
-
- verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx);
- WARN_ON_ONCE(verfp->committed >= 0);
- memcpy(verfp, &hdr->verf, sizeof(struct nfs_writeverf));
- WARN_ON_ONCE(verfp->committed < 0);
-}
-
-static int nfs_direct_cmp_verf(const struct nfs_writeverf *v1,
- const struct nfs_writeverf *v2)
-{
- return nfs_write_verifier_cmp(&v1->verifier, &v2->verifier);
-}
-
-/*
- * nfs_direct_cmp_hdr_verf - compare verifier for pgio header
- * @dreq - direct request possibly spanning multiple servers
- * @hdr - pageio header to validate against previously seen verf
- *
- * set the server's "seen" verf if not initialized.
- * returns result of comparison between @hdr->verf and the "seen"
- * verf of the server used by @hdr (DS or MDS)
- */
-static int nfs_direct_set_or_cmp_hdr_verf(struct nfs_direct_req *dreq,
- struct nfs_pgio_header *hdr)
-{
- struct nfs_writeverf *verfp;
-
- verfp = nfs_direct_select_verf(dreq, hdr->ds_clp, hdr->ds_commit_idx);
- if (verfp->committed < 0) {
- nfs_direct_set_hdr_verf(dreq, hdr);
- return 0;
- }
- return nfs_direct_cmp_verf(verfp, &hdr->verf);
-}
-
-/*
- * nfs_direct_cmp_commit_data_verf - compare verifier for commit data
- * @dreq - direct request possibly spanning multiple servers
- * @data - commit data to validate against previously seen verf
- *
- * returns result of comparison between @data->verf and the verf of
- * the server used by @data (DS or MDS)
- */
-static int nfs_direct_cmp_commit_data_verf(struct nfs_direct_req *dreq,
- struct nfs_commit_data *data)
-{
- struct nfs_writeverf *verfp;
-
- verfp = nfs_direct_select_verf(dreq, data->ds_clp,
- data->ds_commit_index);
-
- /* verifier not set so always fail */
- if (verfp->committed < 0 || data->res.verf->committed <= NFS_UNSTABLE)
- return 1;
-
- return nfs_direct_cmp_verf(verfp, data->res.verf);
-}
-
/**
* nfs_direct_IO - NFS address space operation for direct I/O
* @iocb: target I/O control block
@@ -305,7 +205,7 @@ static inline struct nfs_direct_req *nfs_direct_req_alloc(void)
kref_get(&dreq->kref);
init_completion(&dreq->completion);
INIT_LIST_HEAD(&dreq->mds_cinfo.list);
- dreq->verf.committed = NFS_INVALID_STABLE_HOW; /* not set yet */
+ pnfs_init_ds_commit_info(&dreq->ds_cinfo);
INIT_WORK(&dreq->work, nfs_direct_write_schedule_work);
spin_lock_init(&dreq->lock);
@@ -316,7 +216,7 @@ static void nfs_direct_req_free(struct kref *kref)
{
struct nfs_direct_req *dreq = container_of(kref, struct nfs_direct_req, kref);
- nfs_free_pnfs_ds_cinfo(&dreq->ds_cinfo);
+ pnfs_release_ds_info(&dreq->ds_cinfo, dreq->inode);
if (dreq->l_ctx != NULL)
nfs_put_lock_context(dreq->l_ctx);
if (dreq->ctx != NULL)
@@ -571,6 +471,7 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
l_ctx = nfs_get_lock_context(dreq->ctx);
if (IS_ERR(l_ctx)) {
result = PTR_ERR(l_ctx);
+ nfs_direct_req_release(dreq);
goto out_release;
}
dreq->l_ctx = l_ctx;
@@ -605,15 +506,30 @@ out:
}
static void
+nfs_direct_join_group(struct list_head *list, struct inode *inode)
+{
+ struct nfs_page *req, *next;
+
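+ /* Merge each page group's sub-requests back into the head request */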
+ list_for_each_entry(req, list, wb_list) {
+ if (req->wb_head != req || req->wb_this_page == req)
+ continue;
+ for (next = req->wb_this_page;
+ next != req->wb_head;
+ next = next->wb_this_page) {
+ nfs_list_remove_request(next);
+ nfs_release_request(next);
+ }
+ nfs_join_page_group(req, inode);
+ }
+}
+
+static void
nfs_direct_write_scan_commit_list(struct inode *inode,
struct list_head *list,
struct nfs_commit_info *cinfo)
{
mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
-#ifdef CONFIG_NFS_V4_1
- if (cinfo->ds != NULL && cinfo->ds->nwritten != 0)
- NFS_SERVER(inode)->pnfs_curr_ld->recover_commit_reqs(list, cinfo);
-#endif
+ pnfs_recover_commit_reqs(list, cinfo);
nfs_scan_commit_list(&cinfo->mds->list, list, cinfo, 0);
mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
}
@@ -629,11 +545,12 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
nfs_init_cinfo_from_dreq(&cinfo, dreq);
nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
+ nfs_direct_join_group(&reqs, dreq->inode);
+
dreq->count = 0;
dreq->max_count = 0;
list_for_each_entry(req, &reqs, wb_list)
dreq->max_count += req->wb_bytes;
- dreq->verf.committed = NFS_INVALID_STABLE_HOW;
nfs_clear_pnfs_ds_commit_verifiers(&dreq->ds_cinfo);
get_dreq(dreq);
@@ -670,27 +587,35 @@ static void nfs_direct_write_reschedule(struct nfs_direct_req *dreq)
static void nfs_direct_commit_complete(struct nfs_commit_data *data)
{
+ const struct nfs_writeverf *verf = data->res.verf;
struct nfs_direct_req *dreq = data->dreq;
struct nfs_commit_info cinfo;
struct nfs_page *req;
int status = data->task.tk_status;
+ if (status < 0) {
+ /* Errors in commit are fatal */
+ dreq->error = status;
+ dreq->max_count = 0;
+ dreq->count = 0;
+ dreq->flags = NFS_ODIRECT_DONE;
+ } else if (dreq->flags == NFS_ODIRECT_DONE)
+ status = dreq->error;
+
nfs_init_cinfo_from_dreq(&cinfo, dreq);
- if (status < 0 || nfs_direct_cmp_commit_data_verf(dreq, data))
- dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
while (!list_empty(&data->pages)) {
req = nfs_list_entry(data->pages.next);
nfs_list_remove_request(req);
- if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES) {
+ if (status >= 0 && !nfs_write_match_verf(verf, req)) {
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
/*
* Despite the reboot, the write was successful,
* so reset wb_nio.
*/
req->wb_nio = 0;
- /* Note the rewrite will go through mds */
nfs_mark_request_commit(req, NULL, &cinfo, 0);
- } else
+ } else /* Error or match */
nfs_release_request(req);
nfs_unlock_and_release_request(req);
}
@@ -705,7 +630,8 @@ static void nfs_direct_resched_write(struct nfs_commit_info *cinfo,
struct nfs_direct_req *dreq = cinfo->dreq;
spin_lock(&dreq->lock);
- dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
+ if (dreq->flags != NFS_ODIRECT_DONE)
+ dreq->flags = NFS_ODIRECT_RESCHED_WRITES;
spin_unlock(&dreq->lock);
nfs_mark_request_commit(req, NULL, cinfo, 0);
}
@@ -728,6 +654,23 @@ static void nfs_direct_commit_schedule(struct nfs_direct_req *dreq)
nfs_direct_write_reschedule(dreq);
}
+static void nfs_direct_write_clear_reqs(struct nfs_direct_req *dreq)
+{
+ struct nfs_commit_info cinfo;
+ struct nfs_page *req;
+ LIST_HEAD(reqs);
+
+ nfs_init_cinfo_from_dreq(&cinfo, dreq);
+ nfs_direct_write_scan_commit_list(dreq->inode, &reqs, &cinfo);
+
+ while (!list_empty(&reqs)) {
+ req = nfs_list_entry(reqs.next);
+ nfs_list_remove_request(req);
+ nfs_release_request(req);
+ nfs_unlock_and_release_request(req);
+ }
+}
+
static void nfs_direct_write_schedule_work(struct work_struct *work)
{
struct nfs_direct_req *dreq = container_of(work, struct nfs_direct_req, work);
@@ -742,6 +685,7 @@ static void nfs_direct_write_schedule_work(struct work_struct *work)
nfs_direct_write_reschedule(dreq);
break;
default:
+ nfs_direct_write_clear_reqs(dreq);
nfs_zap_mapping(dreq->inode, dreq->inode->i_mapping);
nfs_direct_complete(dreq);
}
@@ -768,20 +712,15 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
}
nfs_direct_count_bytes(dreq, hdr);
- if (hdr->good_bytes != 0) {
- if (nfs_write_need_commit(hdr)) {
- if (dreq->flags == NFS_ODIRECT_RESCHED_WRITES)
- request_commit = true;
- else if (dreq->flags == 0) {
- nfs_direct_set_hdr_verf(dreq, hdr);
- request_commit = true;
- dreq->flags = NFS_ODIRECT_DO_COMMIT;
- } else if (dreq->flags == NFS_ODIRECT_DO_COMMIT) {
- request_commit = true;
- if (nfs_direct_set_or_cmp_hdr_verf(dreq, hdr))
- dreq->flags =
- NFS_ODIRECT_RESCHED_WRITES;
- }
+ if (hdr->good_bytes != 0 && nfs_write_need_commit(hdr)) {
+ switch (dreq->flags) {
+ case 0:
+ dreq->flags = NFS_ODIRECT_DO_COMMIT;
+ request_commit = true;
+ break;
+ case NFS_ODIRECT_RESCHED_WRITES:
+ case NFS_ODIRECT_DO_COMMIT:
+ request_commit = true;
}
}
spin_unlock(&dreq->lock);
@@ -990,11 +929,13 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
l_ctx = nfs_get_lock_context(dreq->ctx);
if (IS_ERR(l_ctx)) {
result = PTR_ERR(l_ctx);
+ nfs_direct_req_release(dreq);
goto out_release;
}
dreq->l_ctx = l_ctx;
if (!is_sync_kiocb(iocb))
dreq->iocb = iocb;
+ pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode);
nfs_start_io_direct(inode);
diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c
index c9b605f6c9cb..a13e69009f19 100644
--- a/fs/nfs/filelayout/filelayout.c
+++ b/fs/nfs/filelayout/filelayout.c
@@ -49,6 +49,7 @@ MODULE_AUTHOR("Dean Hildebrand <dhildebz@umich.edu>");
MODULE_DESCRIPTION("The NFSv4 file layout driver");
#define FILELAYOUT_POLL_RETRY_MAX (15*HZ)
+static const struct pnfs_commit_ops filelayout_commit_ops;
static loff_t
filelayout_get_dense_offset(struct nfs4_filelayout_segment *flseg,
@@ -750,72 +751,17 @@ filelayout_free_lseg(struct pnfs_layout_segment *lseg)
/* This assumes a single RW lseg */
if (lseg->pls_range.iomode == IOMODE_RW) {
struct nfs4_filelayout *flo;
+ struct inode *inode;
flo = FILELAYOUT_FROM_HDR(lseg->pls_layout);
- flo->commit_info.nbuckets = 0;
- kfree(flo->commit_info.buckets);
- flo->commit_info.buckets = NULL;
+ inode = flo->generic_hdr.plh_inode;
+ spin_lock(&inode->i_lock);
+ pnfs_generic_ds_cinfo_release_lseg(&flo->commit_info, lseg);
+ spin_unlock(&inode->i_lock);
}
_filelayout_free_lseg(fl);
}
-static int
-filelayout_alloc_commit_info(struct pnfs_layout_segment *lseg,
- struct nfs_commit_info *cinfo,
- gfp_t gfp_flags)
-{
- struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
- struct pnfs_commit_bucket *buckets;
- int size, i;
-
- if (fl->commit_through_mds)
- return 0;
-
- size = (fl->stripe_type == STRIPE_SPARSE) ?
- fl->dsaddr->ds_num : fl->dsaddr->stripe_count;
-
- if (cinfo->ds->nbuckets >= size) {
- /* This assumes there is only one IOMODE_RW lseg. What
- * we really want to do is have a layout_hdr level
- * dictionary of <multipath_list4, fh> keys, each
- * associated with a struct list_head, populated by calls
- * to filelayout_write_pagelist().
- * */
- return 0;
- }
-
- buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket),
- gfp_flags);
- if (!buckets)
- return -ENOMEM;
- for (i = 0; i < size; i++) {
- INIT_LIST_HEAD(&buckets[i].written);
- INIT_LIST_HEAD(&buckets[i].committing);
- /* mark direct verifier as unset */
- buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW;
- }
-
- spin_lock(&cinfo->inode->i_lock);
- if (cinfo->ds->nbuckets >= size)
- goto out;
- for (i = 0; i < cinfo->ds->nbuckets; i++) {
- list_splice(&cinfo->ds->buckets[i].written,
- &buckets[i].written);
- list_splice(&cinfo->ds->buckets[i].committing,
- &buckets[i].committing);
- buckets[i].direct_verf.committed =
- cinfo->ds->buckets[i].direct_verf.committed;
- buckets[i].wlseg = cinfo->ds->buckets[i].wlseg;
- buckets[i].clseg = cinfo->ds->buckets[i].clseg;
- }
- swap(cinfo->ds->buckets, buckets);
- cinfo->ds->nbuckets = size;
-out:
- spin_unlock(&cinfo->inode->i_lock);
- kfree(buckets);
- return 0;
-}
-
static struct pnfs_layout_segment *
filelayout_alloc_lseg(struct pnfs_layout_hdr *layoutid,
struct nfs4_layoutget_res *lgr,
@@ -938,9 +884,6 @@ static void
filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
- struct nfs_commit_info cinfo;
- int status;
-
pnfs_generic_pg_check_layout(pgio);
if (!pgio->pg_lseg) {
pgio->pg_lseg = fl_pnfs_update_layout(pgio->pg_inode,
@@ -959,17 +902,7 @@ filelayout_pg_init_write(struct nfs_pageio_descriptor *pgio,
/* If no lseg, fall back to write through mds */
if (pgio->pg_lseg == NULL)
- goto out_mds;
- nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq);
- status = filelayout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS);
- if (status < 0) {
- pnfs_put_lseg(pgio->pg_lseg);
- pgio->pg_lseg = NULL;
- goto out_mds;
- }
- return;
-out_mds:
- nfs_pageio_reset_write_mds(pgio);
+ nfs_pageio_reset_write_mds(pgio);
}
static const struct nfs_pageio_ops filelayout_pg_read_ops = {
@@ -1078,36 +1011,6 @@ out_err:
return -EAGAIN;
}
-/* filelayout_search_commit_reqs - Search lists in @cinfo for the head reqest
- * for @page
- * @cinfo - commit info for current inode
- * @page - page to search for matching head request
- *
- * Returns a the head request if one is found, otherwise returns NULL.
- */
-static struct nfs_page *
-filelayout_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page)
-{
- struct nfs_page *freq, *t;
- struct pnfs_commit_bucket *b;
- int i;
-
- /* Linearly search the commit lists for each bucket until a matching
- * request is found */
- for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
- list_for_each_entry_safe(freq, t, &b->written, wb_list) {
- if (freq->wb_page == page)
- return freq->wb_head;
- }
- list_for_each_entry_safe(freq, t, &b->committing, wb_list) {
- if (freq->wb_page == page)
- return freq->wb_head;
- }
- }
-
- return NULL;
-}
-
static int
filelayout_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
int how, struct nfs_commit_info *cinfo)
@@ -1140,13 +1043,17 @@ filelayout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
struct nfs4_filelayout *flo;
flo = kzalloc(sizeof(*flo), gfp_flags);
- return flo != NULL ? &flo->generic_hdr : NULL;
+ if (flo == NULL)
+ return NULL;
+ pnfs_init_ds_commit_info(&flo->commit_info);
+ flo->commit_info.ops = &filelayout_commit_ops;
+ return &flo->generic_hdr;
}
static void
filelayout_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
- kfree(FILELAYOUT_FROM_HDR(lo));
+ kfree_rcu(FILELAYOUT_FROM_HDR(lo), generic_hdr.plh_rcu);
}
static struct pnfs_ds_commit_info *
@@ -1160,6 +1067,46 @@ filelayout_get_ds_info(struct inode *inode)
return &FILELAYOUT_FROM_HDR(layout)->commit_info;
}
+static void
+filelayout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_layout_segment *lseg)
+{
+ struct nfs4_filelayout_segment *fl = FILELAYOUT_LSEG(lseg);
+ struct inode *inode = lseg->pls_layout->plh_inode;
+ struct pnfs_commit_array *array, *new;
+ unsigned int size = (fl->stripe_type == STRIPE_SPARSE) ?
+ fl->dsaddr->ds_num : fl->dsaddr->stripe_count;
+
+ new = pnfs_alloc_commit_array(size, GFP_NOIO);
+ if (new) {
+ spin_lock(&inode->i_lock);
+ array = pnfs_add_commit_array(fl_cinfo, new, lseg);
+ spin_unlock(&inode->i_lock);
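+ /* An array may already cover this lseg; drop our duplicate */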
+ if (array != new)
+ pnfs_free_commit_array(new);
+ }
+}
+
+static void
+filelayout_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
+ struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ pnfs_generic_ds_cinfo_destroy(fl_cinfo);
+ spin_unlock(&inode->i_lock);
+}
+
+static const struct pnfs_commit_ops filelayout_commit_ops = {
+ .setup_ds_info = filelayout_setup_ds_info,
+ .release_ds_info = filelayout_release_ds_info,
+ .mark_request_commit = filelayout_mark_request_commit,
+ .clear_request_commit = pnfs_generic_clear_request_commit,
+ .scan_commit_lists = pnfs_generic_scan_commit_lists,
+ .recover_commit_reqs = pnfs_generic_recover_commit_reqs,
+ .search_commit_reqs = pnfs_generic_search_commit_reqs,
+ .commit_pagelist = filelayout_commit_pagelist,
+};
+
static struct pnfs_layoutdriver_type filelayout_type = {
.id = LAYOUT_NFSV4_1_FILES,
.name = "LAYOUT_NFSV4_1_FILES",
@@ -1173,12 +1120,6 @@ static struct pnfs_layoutdriver_type filelayout_type = {
.pg_read_ops = &filelayout_pg_read_ops,
.pg_write_ops = &filelayout_pg_write_ops,
.get_ds_info = &filelayout_get_ds_info,
- .mark_request_commit = filelayout_mark_request_commit,
- .clear_request_commit = pnfs_generic_clear_request_commit,
- .scan_commit_lists = pnfs_generic_scan_commit_lists,
- .recover_commit_reqs = pnfs_generic_recover_commit_reqs,
- .search_commit_reqs = filelayout_search_commit_reqs,
- .commit_pagelist = filelayout_commit_pagelist,
.read_pagelist = filelayout_read_pagelist,
.write_pagelist = filelayout_write_pagelist,
.alloc_deviceid_node = filelayout_alloc_deviceid_node,
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index bb9148b83166..7d399f72ebbb 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -32,6 +32,7 @@
static unsigned short io_maxretrans;
+static const struct pnfs_commit_ops ff_layout_commit_ops;
static void ff_layout_read_record_layoutstats_done(struct rpc_task *task,
struct nfs_pgio_header *hdr);
static int ff_layout_mirror_prepare_stats(struct pnfs_layout_hdr *lo,
@@ -48,9 +49,11 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
ffl = kzalloc(sizeof(*ffl), gfp_flags);
if (ffl) {
+ pnfs_init_ds_commit_info(&ffl->commit_info);
INIT_LIST_HEAD(&ffl->error_list);
INIT_LIST_HEAD(&ffl->mirrors);
ffl->last_report_time = ktime_get();
+ ffl->commit_info.ops = &ff_layout_commit_ops;
return &ffl->generic_hdr;
} else
return NULL;
@@ -59,14 +62,14 @@ ff_layout_alloc_layout_hdr(struct inode *inode, gfp_t gfp_flags)
static void
ff_layout_free_layout_hdr(struct pnfs_layout_hdr *lo)
{
+ struct nfs4_flexfile_layout *ffl = FF_LAYOUT_FROM_HDR(lo);
struct nfs4_ff_layout_ds_err *err, *n;
- list_for_each_entry_safe(err, n, &FF_LAYOUT_FROM_HDR(lo)->error_list,
- list) {
+ list_for_each_entry_safe(err, n, &ffl->error_list, list) {
list_del(&err->list);
kfree(err);
}
- kfree(FF_LAYOUT_FROM_HDR(lo));
+ kfree_rcu(ffl, generic_hdr.plh_rcu);
}
static int decode_pnfs_stateid(struct xdr_stream *xdr, nfs4_stateid *stateid)
@@ -248,36 +251,10 @@ static void ff_layout_put_mirror(struct nfs4_ff_layout_mirror *mirror)
static void ff_layout_free_mirror_array(struct nfs4_ff_layout_segment *fls)
{
- int i;
-
- if (fls->mirror_array) {
- for (i = 0; i < fls->mirror_array_cnt; i++) {
- /* normally mirror_ds is freed in
- * .free_deviceid_node but we still do it here
- * for .alloc_lseg error path */
- ff_layout_put_mirror(fls->mirror_array[i]);
- }
- kfree(fls->mirror_array);
- fls->mirror_array = NULL;
- }
-}
-
-static int ff_layout_check_layout(struct nfs4_layoutget_res *lgr)
-{
- int ret = 0;
+ u32 i;
- dprintk("--> %s\n", __func__);
-
- /* FIXME: remove this check when layout segment support is added */
- if (lgr->range.offset != 0 ||
- lgr->range.length != NFS4_MAX_UINT64) {
- dprintk("%s Only whole file layouts supported. Use MDS i/o\n",
- __func__);
- ret = -EINVAL;
- }
-
- dprintk("--> %s returns %d\n", __func__, ret);
- return ret;
+ for (i = 0; i < fls->mirror_array_cnt; i++)
+ ff_layout_put_mirror(fls->mirror_array[i]);
}
static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
@@ -289,6 +266,23 @@ static void _ff_layout_free_lseg(struct nfs4_ff_layout_segment *fls)
}
static bool
+ff_lseg_match_mirrors(struct pnfs_layout_segment *l1,
+ struct pnfs_layout_segment *l2)
+{
+ const struct nfs4_ff_layout_segment *fl1 = FF_LAYOUT_LSEG(l1);
+ const struct nfs4_ff_layout_segment *fl2 = FF_LAYOUT_LSEG(l2);
+ u32 i;
+
+ if (fl1->mirror_array_cnt != fl2->mirror_array_cnt)
+ return false;
+ for (i = 0; i < fl1->mirror_array_cnt; i++) {
+ if (fl1->mirror_array[i] != fl2->mirror_array[i])
+ return false;
+ }
+ return true;
+}
+
+static bool
ff_lseg_range_is_after(const struct pnfs_layout_range *l1,
const struct pnfs_layout_range *l2)
{
@@ -323,6 +317,8 @@ ff_lseg_merge(struct pnfs_layout_segment *new,
new->pls_range.length);
if (new_end < old->pls_range.offset)
return false;
+ if (!ff_lseg_match_mirrors(new, old))
+ return false;
/* Mergeable: copy info from 'old' to 'new' */
if (new_end < old_end)
@@ -400,16 +396,13 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
goto out_err_free;
rc = -ENOMEM;
- fls = kzalloc(sizeof(*fls), gfp_flags);
+ fls = kzalloc(struct_size(fls, mirror_array, mirror_array_cnt),
+ gfp_flags);
if (!fls)
goto out_err_free;
fls->mirror_array_cnt = mirror_array_cnt;
fls->stripe_unit = stripe_unit;
- fls->mirror_array = kcalloc(fls->mirror_array_cnt,
- sizeof(fls->mirror_array[0]), gfp_flags);
- if (fls->mirror_array == NULL)
- goto out_err_free;
for (i = 0; i < fls->mirror_array_cnt; i++) {
struct nfs4_ff_layout_mirror *mirror;
@@ -545,9 +538,6 @@ ff_layout_alloc_lseg(struct pnfs_layout_hdr *lh,
out_sort_mirrors:
ff_layout_sort_mirrors(fls);
- rc = ff_layout_check_layout(lgr);
- if (rc)
- goto out_err_free;
ret = &fls->generic_hdr;
dprintk("<-- %s (success)\n", __func__);
out_free_page:
@@ -560,17 +550,6 @@ out_err_free:
goto out_free_page;
}
-static bool ff_layout_has_rw_segments(struct pnfs_layout_hdr *layout)
-{
- struct pnfs_layout_segment *lseg;
-
- list_for_each_entry(lseg, &layout->plh_segs, pls_list)
- if (lseg->pls_range.iomode == IOMODE_RW)
- return true;
-
- return false;
-}
-
static void
ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
{
@@ -585,23 +564,12 @@ ff_layout_free_lseg(struct pnfs_layout_segment *lseg)
ffl = FF_LAYOUT_FROM_HDR(lseg->pls_layout);
inode = ffl->generic_hdr.plh_inode;
spin_lock(&inode->i_lock);
- if (!ff_layout_has_rw_segments(lseg->pls_layout)) {
- ffl->commit_info.nbuckets = 0;
- kfree(ffl->commit_info.buckets);
- ffl->commit_info.buckets = NULL;
- }
+ pnfs_generic_ds_cinfo_release_lseg(&ffl->commit_info, lseg);
spin_unlock(&inode->i_lock);
}
_ff_layout_free_lseg(fls);
}
-/* Return 1 until we have multiple lsegs support */
-static int
-ff_layout_get_lseg_count(struct nfs4_ff_layout_segment *fls)
-{
- return 1;
-}
-
static void
nfs4_ff_start_busy_timer(struct nfs4_ff_busy_timer *timer, ktime_t now)
{
@@ -746,52 +714,6 @@ nfs4_ff_layout_stat_io_end_write(struct rpc_task *task,
spin_unlock(&mirror->lock);
}
-static int
-ff_layout_alloc_commit_info(struct pnfs_layout_segment *lseg,
- struct nfs_commit_info *cinfo,
- gfp_t gfp_flags)
-{
- struct nfs4_ff_layout_segment *fls = FF_LAYOUT_LSEG(lseg);
- struct pnfs_commit_bucket *buckets;
- int size;
-
- if (cinfo->ds->nbuckets != 0) {
- /* This assumes there is only one RW lseg per file.
- * To support multiple lseg per file, we need to
- * change struct pnfs_commit_bucket to allow dynamic
- * increasing nbuckets.
- */
- return 0;
- }
-
- size = ff_layout_get_lseg_count(fls) * FF_LAYOUT_MIRROR_COUNT(lseg);
-
- buckets = kcalloc(size, sizeof(struct pnfs_commit_bucket),
- gfp_flags);
- if (!buckets)
- return -ENOMEM;
- else {
- int i;
-
- spin_lock(&cinfo->inode->i_lock);
- if (cinfo->ds->nbuckets != 0)
- kfree(buckets);
- else {
- cinfo->ds->buckets = buckets;
- cinfo->ds->nbuckets = size;
- for (i = 0; i < size; i++) {
- INIT_LIST_HEAD(&buckets[i].written);
- INIT_LIST_HEAD(&buckets[i].committing);
- /* mark direct verifier as unset */
- buckets[i].direct_verf.committed =
- NFS_INVALID_STABLE_HOW;
- }
- }
- spin_unlock(&cinfo->inode->i_lock);
- return 0;
- }
-}
-
static void
ff_layout_mark_ds_unreachable(struct pnfs_layout_segment *lseg, int idx)
{
@@ -876,8 +798,8 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
pnfs_put_lseg(pgio->pg_lseg);
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
nfs_req_openctx(req),
- 0,
- NFS4_MAX_UINT64,
+ req_offset(req),
+ req->wb_bytes,
IOMODE_READ,
strict_iomode,
GFP_KERNEL);
@@ -888,6 +810,14 @@ ff_layout_pg_get_read(struct nfs_pageio_descriptor *pgio,
}
static void
+ff_layout_pg_check_layout(struct nfs_pageio_descriptor *pgio,
+ struct nfs_page *req)
+{
+ pnfs_generic_pg_check_layout(pgio);
+ pnfs_generic_pg_check_range(pgio, req);
+}
+
+static void
ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
struct nfs_page *req)
{
@@ -897,7 +827,7 @@ ff_layout_pg_init_read(struct nfs_pageio_descriptor *pgio,
int ds_idx;
retry:
- pnfs_generic_pg_check_layout(pgio);
+ ff_layout_pg_check_layout(pgio, req);
/* Use full layout for now */
if (!pgio->pg_lseg) {
ff_layout_pg_get_read(pgio, req, false);
@@ -953,18 +883,16 @@ ff_layout_pg_init_write(struct nfs_pageio_descriptor *pgio,
{
struct nfs4_ff_layout_mirror *mirror;
struct nfs_pgio_mirror *pgm;
- struct nfs_commit_info cinfo;
struct nfs4_pnfs_ds *ds;
int i;
- int status;
retry:
- pnfs_generic_pg_check_layout(pgio);
+ ff_layout_pg_check_layout(pgio, req);
if (!pgio->pg_lseg) {
pgio->pg_lseg = pnfs_update_layout(pgio->pg_inode,
nfs_req_openctx(req),
- 0,
- NFS4_MAX_UINT64,
+ req_offset(req),
+ req->wb_bytes,
IOMODE_RW,
false,
GFP_NOFS);
@@ -978,11 +906,6 @@ retry:
if (pgio->pg_lseg == NULL)
goto out_mds;
- nfs_init_cinfo(&cinfo, pgio->pg_inode, pgio->pg_dreq);
- status = ff_layout_alloc_commit_info(pgio->pg_lseg, &cinfo, GFP_NOFS);
- if (status < 0)
- goto out_mds;
-
/* Use a direct mapping of ds_idx to pgio mirror_idx */
if (WARN_ON_ONCE(pgio->pg_mirror_count !=
FF_LAYOUT_MIRROR_COUNT(pgio->pg_lseg)))
@@ -1297,21 +1220,23 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
}
}
+ mirror = FF_LAYOUT_COMP(lseg, idx);
+ err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
+ mirror, offset, length, status, opnum,
+ GFP_NOIO);
+
switch (status) {
case NFS4ERR_DELAY:
case NFS4ERR_GRACE:
- return;
- default:
break;
+ case NFS4ERR_NXIO:
+ ff_layout_mark_ds_unreachable(lseg, idx);
+ /* Fallthrough */
+ default:
+ pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode,
+ lseg);
}
- mirror = FF_LAYOUT_COMP(lseg, idx);
- err = ff_layout_track_ds_error(FF_LAYOUT_FROM_HDR(lseg->pls_layout),
- mirror, offset, length, status, opnum,
- GFP_NOIO);
- if (status == NFS4ERR_NXIO)
- ff_layout_mark_ds_unreachable(lseg, idx);
- pnfs_error_mark_layout_for_return(lseg->pls_layout->plh_inode, lseg);
dprintk("%s: err %d op %d status %u\n", __func__, err, opnum, status);
}
@@ -2012,6 +1937,33 @@ ff_layout_get_ds_info(struct inode *inode)
}
static void
+ff_layout_setup_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_layout_segment *lseg)
+{
+ struct nfs4_ff_layout_segment *flseg = FF_LAYOUT_LSEG(lseg);
+ struct inode *inode = lseg->pls_layout->plh_inode;
+ struct pnfs_commit_array *array, *new;
+
+ new = pnfs_alloc_commit_array(flseg->mirror_array_cnt, GFP_NOIO);
+ if (new) {
+ spin_lock(&inode->i_lock);
+ array = pnfs_add_commit_array(fl_cinfo, new, lseg);
+ spin_unlock(&inode->i_lock);
+ if (array != new)
+ pnfs_free_commit_array(new);
+ }
+}
+
+static void
+ff_layout_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo,
+ struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ pnfs_generic_ds_cinfo_destroy(fl_cinfo);
+ spin_unlock(&inode->i_lock);
+}
+
+static void
ff_layout_free_deviceid_node(struct nfs4_deviceid_node *d)
{
nfs4_ff_layout_free_deviceid(container_of(d, struct nfs4_ff_layout_ds,
@@ -2496,6 +2448,16 @@ ff_layout_set_layoutdriver(struct nfs_server *server,
return 0;
}
+static const struct pnfs_commit_ops ff_layout_commit_ops = {
+ .setup_ds_info = ff_layout_setup_ds_info,
+ .release_ds_info = ff_layout_release_ds_info,
+ .mark_request_commit = pnfs_layout_mark_request_commit,
+ .clear_request_commit = pnfs_generic_clear_request_commit,
+ .scan_commit_lists = pnfs_generic_scan_commit_lists,
+ .recover_commit_reqs = pnfs_generic_recover_commit_reqs,
+ .commit_pagelist = ff_layout_commit_pagelist,
+};
+
static struct pnfs_layoutdriver_type flexfilelayout_type = {
.id = LAYOUT_FLEX_FILES,
.name = "LAYOUT_FLEX_FILES",
@@ -2512,11 +2474,6 @@ static struct pnfs_layoutdriver_type flexfilelayout_type = {
.pg_write_ops = &ff_layout_pg_write_ops,
.get_ds_info = ff_layout_get_ds_info,
.free_deviceid_node = ff_layout_free_deviceid_node,
- .mark_request_commit = pnfs_layout_mark_request_commit,
- .clear_request_commit = pnfs_generic_clear_request_commit,
- .scan_commit_lists = pnfs_generic_scan_commit_lists,
- .recover_commit_reqs = pnfs_generic_recover_commit_reqs,
- .commit_pagelist = ff_layout_commit_pagelist,
.read_pagelist = ff_layout_read_pagelist,
.write_pagelist = ff_layout_write_pagelist,
.alloc_deviceid_node = ff_layout_alloc_deviceid_node,
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.h b/fs/nfs/flexfilelayout/flexfilelayout.h
index 2f369966abf7..354a031c69b1 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.h
+++ b/fs/nfs/flexfilelayout/flexfilelayout.h
@@ -99,7 +99,7 @@ struct nfs4_ff_layout_segment {
u64 stripe_unit;
u32 flags;
u32 mirror_array_cnt;
- struct nfs4_ff_layout_mirror **mirror_array;
+ struct nfs4_ff_layout_mirror *mirror_array[];
};
struct nfs4_flexfile_layout {
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index e113fcb4bb4c..ccc88be88d6a 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -190,6 +190,7 @@ static const struct constant_table nfs_vers_tokens[] = {
{ "4.0", Opt_vers_4_0 },
{ "4.1", Opt_vers_4_1 },
{ "4.2", Opt_vers_4_2 },
+ {}
};
enum {
@@ -202,13 +203,14 @@ enum {
nr__Opt_xprt
};
-static const struct constant_table nfs_xprt_protocol_tokens[nr__Opt_xprt] = {
+static const struct constant_table nfs_xprt_protocol_tokens[] = {
{ "rdma", Opt_xprt_rdma },
{ "rdma6", Opt_xprt_rdma6 },
{ "tcp", Opt_xprt_tcp },
{ "tcp6", Opt_xprt_tcp6 },
{ "udp", Opt_xprt_udp },
{ "udp6", Opt_xprt_udp6 },
+ {}
};
enum {
@@ -239,6 +241,7 @@ static const struct constant_table nfs_secflavor_tokens[] = {
{ "spkm3i", Opt_sec_spkmi },
{ "spkm3p", Opt_sec_spkmp },
{ "sys", Opt_sec_sys },
+ {}
};
/*
@@ -1135,7 +1138,7 @@ out_no_address:
return nfs_invalf(fc, "NFS4: mount program didn't pass remote address");
out_invalid_transport_udp:
- return nfs_invalf(fc, "NFSv4: Unsupported transport protocol udp");
+ return nfs_invalf(fc, "NFS: Unsupported transport protocol udp");
}
#endif
@@ -1257,7 +1260,7 @@ out_v4_not_compiled:
nfs_errorf(fc, "NFS: NFSv4 is not compiled into kernel");
return -EPROTONOSUPPORT;
out_invalid_transport_udp:
- return nfs_invalf(fc, "NFSv4: Unsupported transport protocol udp");
+ return nfs_invalf(fc, "NFS: Unsupported transport protocol udp");
out_no_address:
return nfs_invalf(fc, "NFS: mount program didn't pass remote address");
out_mountproto_mismatch:
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 11bf15800ac9..b9d0921cb4fe 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -62,7 +62,6 @@
/* Default is to see 64-bit inode numbers */
static bool enable_ino64 = NFS_64_BIT_INODE_NUMBERS_ENABLED;
-static void nfs_invalidate_inode(struct inode *);
static int nfs_update_inode(struct inode *, struct nfs_fattr *);
static struct kmem_cache * nfs_inode_cachep;
@@ -284,10 +283,18 @@ EXPORT_SYMBOL_GPL(nfs_invalidate_atime);
* Invalidate, but do not unhash, the inode.
* NB: must be called with inode->i_lock held!
*/
-static void nfs_invalidate_inode(struct inode *inode)
+static void nfs_set_inode_stale_locked(struct inode *inode)
{
set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
nfs_zap_caches_locked(inode);
+ trace_nfs_set_inode_stale(inode);
+}
+
+void nfs_set_inode_stale(struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ nfs_set_inode_stale_locked(inode);
+ spin_unlock(&inode->i_lock);
}
struct nfs_find_desc {
@@ -959,16 +966,16 @@ struct nfs_open_context *alloc_nfs_open_context(struct dentry *dentry,
struct file *filp)
{
struct nfs_open_context *ctx;
- const struct cred *cred = get_current_cred();
ctx = kmalloc(sizeof(*ctx), GFP_KERNEL);
- if (!ctx) {
- put_cred(cred);
+ if (!ctx)
return ERR_PTR(-ENOMEM);
- }
nfs_sb_active(dentry->d_sb);
ctx->dentry = dget(dentry);
- ctx->cred = cred;
+ if (filp)
+ ctx->cred = get_cred(filp->f_cred);
+ else
+ ctx->cred = get_current_cred();
ctx->ll_cred = NULL;
ctx->state = NULL;
ctx->mode = f_mode;
@@ -1163,9 +1170,10 @@ __nfs_revalidate_inode(struct nfs_server *server, struct inode *inode)
status = 0;
break;
case -ESTALE:
- nfs_zap_caches(inode);
if (!S_ISDIR(inode->i_mode))
- set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
+ nfs_set_inode_stale(inode);
+ else
+ nfs_zap_caches(inode);
}
goto err_out;
}
@@ -2064,7 +2072,7 @@ static int nfs_update_inode(struct inode *inode, struct nfs_fattr *fattr)
* lookup validation will know that the inode is bad.
* (But we fall through to invalidate the caches.)
*/
- nfs_invalidate_inode(inode);
+ nfs_set_inode_stale_locked(inode);
return -ESTALE;
}
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index f80c47d5ff27..1f32a9fbfdaf 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -274,12 +274,6 @@ void nfs_free_request(struct nfs_page *req);
struct nfs_pgio_mirror *
nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc);
-static inline bool nfs_pgio_has_mirroring(struct nfs_pageio_descriptor *desc)
-{
- WARN_ON_ONCE(desc->pg_mirror_count < 1);
- return desc->pg_mirror_count > 1;
-}
-
static inline bool nfs_match_open_context(const struct nfs_open_context *ctx1,
const struct nfs_open_context *ctx2)
{
@@ -417,7 +411,9 @@ extern int __init register_nfs_fs(void);
extern void __exit unregister_nfs_fs(void);
extern bool nfs_sb_active(struct super_block *sb);
extern void nfs_sb_deactive(struct super_block *sb);
-
+extern int nfs_client_for_each_server(struct nfs_client *clp,
+ int (*fn)(struct nfs_server *, void *),
+ void *data);
/* io.c */
extern void nfs_start_io_read(struct inode *inode);
extern void nfs_end_io_read(struct inode *inode);
@@ -515,13 +511,25 @@ int nfs_filemap_write_and_wait_range(struct address_space *mapping,
loff_t lstart, loff_t lend);
#ifdef CONFIG_NFS_V4_1
+static inline void
+pnfs_bucket_clear_pnfs_ds_commit_verifiers(struct pnfs_commit_bucket *buckets,
+ unsigned int nbuckets)
+{
+ unsigned int i;
+
+ for (i = 0; i < nbuckets; i++)
+ buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW;
+}
static inline
void nfs_clear_pnfs_ds_commit_verifiers(struct pnfs_ds_commit_info *cinfo)
{
- int i;
+ struct pnfs_commit_array *array;
- for (i = 0; i < cinfo->nbuckets; i++)
- cinfo->buckets[i].direct_verf.committed = NFS_INVALID_STABLE_HOW;
+ rcu_read_lock();
+ list_for_each_entry_rcu(array, &cinfo->commits, cinfo_list)
+ pnfs_bucket_clear_pnfs_ds_commit_verifiers(array->buckets,
+ array->nbuckets);
+ rcu_read_unlock();
}
#else
static inline
@@ -542,6 +550,14 @@ nfs_write_verifier_cmp(const struct nfs_write_verifier *v1,
return memcmp(v1->data, v2->data, sizeof(v1->data));
}
+static inline bool
+nfs_write_match_verf(const struct nfs_writeverf *verf,
+ struct nfs_page *req)
+{
+ return verf->committed > NFS_UNSTABLE &&
+ !nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier);
+}
+
/* unlink.c */
extern struct rpc_task *
nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index f3ece8ed3203..6b063227e34e 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -145,6 +145,7 @@ struct vfsmount *nfs_d_automount(struct path *path)
struct vfsmount *mnt = ERR_PTR(-ENOMEM);
struct nfs_server *server = NFS_SERVER(d_inode(path->dentry));
struct nfs_client *client = server->nfs_client;
+ int timeout = READ_ONCE(nfs_mountpoint_expiry_timeout);
int ret;
if (IS_ROOT(path->dentry))
@@ -190,12 +191,12 @@ struct vfsmount *nfs_d_automount(struct path *path)
if (IS_ERR(mnt))
goto out_fc;
- if (nfs_mountpoint_expiry_timeout < 0)
+ mntget(mnt); /* prevent immediate expiration */
+ if (timeout <= 0)
goto out_fc;
- mntget(mnt); /* prevent immediate expiration */
mnt_set_expiry(mnt, &nfs_automount_list);
- schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
+ schedule_delayed_work(&nfs_automount_task, timeout);
out_fc:
put_fs_context(fc);
@@ -233,10 +234,11 @@ const struct inode_operations nfs_referral_inode_operations = {
static void nfs_expire_automounts(struct work_struct *work)
{
struct list_head *list = &nfs_automount_list;
+ int timeout = READ_ONCE(nfs_mountpoint_expiry_timeout);
mark_mounts_for_expiry(list);
- if (!list_empty(list))
- schedule_delayed_work(&nfs_automount_task, nfs_mountpoint_expiry_timeout);
+ if (!list_empty(list) && timeout > 0)
+ schedule_delayed_work(&nfs_automount_task, timeout);
}
void nfs_release_automount_timer(void)
@@ -247,10 +249,7 @@ void nfs_release_automount_timer(void)
/**
* nfs_do_submount - set up mountpoint when crossing a filesystem boundary
- * @dentry: parent directory
- * @fh: filehandle for new root dentry
- * @fattr: attributes for new root inode
- * @authflavor: security flavor to use when performing the mount
+ * @fc: pointer to struct nfs_fs_context
*
*/
int nfs_do_submount(struct fs_context *fc)
@@ -312,3 +311,53 @@ int nfs_submount(struct fs_context *fc, struct nfs_server *server)
return nfs_do_submount(fc);
}
EXPORT_SYMBOL_GPL(nfs_submount);
+
+static int param_set_nfs_timeout(const char *val, const struct kernel_param *kp)
+{
+ long num;
+ int ret;
+
+ if (!val)
+ return -EINVAL;
+ ret = kstrtol(val, 0, &num);
+ if (ret)
+ return -EINVAL;
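+ /* Positive values are seconds; convert to jiffies, capping at INT_MAX */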
+ if (num > 0) {
+ if (num >= INT_MAX / HZ)
+ num = INT_MAX;
+ else
+ num *= HZ;
+ *((int *)kp->arg) = num;
+ if (!list_empty(&nfs_automount_list))
+ mod_delayed_work(system_wq, &nfs_automount_task, num);
+ } else {
+ *((int *)kp->arg) = -1*HZ;
+ cancel_delayed_work(&nfs_automount_task);
+ }
+ return 0;
+}
+
+static int param_get_nfs_timeout(char *buffer, const struct kernel_param *kp)
+{
+ long num = *((int *)kp->arg);
+
+ if (num > 0) {
+ if (num >= INT_MAX - (HZ - 1))
+ num = INT_MAX / HZ;
+ else
+ num = (num + (HZ - 1)) / HZ;
+ } else
+ num = -1;
+ return scnprintf(buffer, PAGE_SIZE, "%li\n", num);
+}
+
+static const struct kernel_param_ops param_ops_nfs_timeout = {
+ .set = param_set_nfs_timeout,
+ .get = param_get_nfs_timeout,
+};
+#define param_check_nfs_timeout(name, p) __param_check(name, p, int);
+
+module_param(nfs_mountpoint_expiry_timeout, nfs_timeout, 0644);
+MODULE_PARM_DESC(nfs_mountpoint_expiry_timeout,
+ "Set the NFS automounted mountpoint timeout value (seconds)."
+ "Values <= 0 turn expiration off.");
diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h
index 8be1ba7c62bb..2b7f6dcd2eb8 100644
--- a/fs/nfs/nfs4_fs.h
+++ b/fs/nfs/nfs4_fs.h
@@ -42,7 +42,9 @@ enum nfs4_client_state {
NFS4CLNT_LEASE_MOVED,
NFS4CLNT_DELEGATION_EXPIRED,
NFS4CLNT_RUN_MANAGER,
- NFS4CLNT_DELEGRETURN_RUNNING,
+ NFS4CLNT_RECALL_RUNNING,
+ NFS4CLNT_RECALL_ANY_LAYOUT_READ,
+ NFS4CLNT_RECALL_ANY_LAYOUT_RW,
};
#define NFS4_RENEW_TIMEOUT 0x01
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index 1297919e0fce..8e5d6223ddd3 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -252,6 +252,9 @@ static loff_t nfs42_remap_file_range(struct file *src_file, loff_t src_off,
if (remap_flags & ~REMAP_FILE_ADVISORY)
return -EINVAL;
+ if (IS_SWAPFILE(dst_inode) || IS_SWAPFILE(src_inode))
+ return -ETXTBSY;
+
/* check alignment w.r.t. clone_blksize */
ret = -EINVAL;
if (bs) {
diff --git a/fs/nfs/nfs4namespace.c b/fs/nfs/nfs4namespace.c
index 84026e7b8a5f..a3ab6e219061 100644
--- a/fs/nfs/nfs4namespace.c
+++ b/fs/nfs/nfs4namespace.c
@@ -354,7 +354,7 @@ static int try_location(struct fs_context *fc,
/**
* nfs_follow_referral - set up mountpoint when hitting a referral on moved error
- * @dentry: parent directory
+ * @fc: pointer to struct nfs_fs_context
* @locations: array of NFSv4 server location information
*
*/
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index cb34e840e4fb..512afb1c7867 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -2346,7 +2346,7 @@ static int _nfs4_proc_open_confirm(struct nfs4_opendata *data)
.callback_ops = &nfs4_open_confirm_ops,
.callback_data = data,
.workqueue = nfsiod_workqueue,
- .flags = RPC_TASK_ASYNC,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
};
int status;
@@ -2511,7 +2511,7 @@ static int nfs4_run_open_task(struct nfs4_opendata *data,
.callback_ops = &nfs4_open_ops,
.callback_data = data,
.workqueue = nfsiod_workqueue,
- .flags = RPC_TASK_ASYNC,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
};
int status;
@@ -2790,16 +2790,19 @@ static int nfs41_check_delegation_stateid(struct nfs4_state *state)
return NFS_OK;
}
+ spin_lock(&delegation->lock);
nfs4_stateid_copy(&stateid, &delegation->stateid);
if (!test_and_clear_bit(NFS_DELEGATION_TEST_EXPIRED,
&delegation->flags)) {
+ spin_unlock(&delegation->lock);
rcu_read_unlock();
return NFS_OK;
}
if (delegation->cred)
cred = get_cred(delegation->cred);
+ spin_unlock(&delegation->lock);
rcu_read_unlock();
status = nfs41_test_and_free_expired_stateid(server, &stateid, cred);
trace_nfs4_test_delegation_stateid(state, NULL, status);
@@ -3651,7 +3654,7 @@ int nfs4_do_close(struct nfs4_state *state, gfp_t gfp_mask, int wait)
.rpc_message = &msg,
.callback_ops = &nfs4_close_ops,
.workqueue = nfsiod_workqueue,
- .flags = RPC_TASK_ASYNC,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
};
int status = -ENOMEM;
@@ -5544,7 +5547,7 @@ unwind:
struct nfs4_cached_acl {
int cached;
size_t len;
- char data[0];
+ char data[];
};
static void nfs4_set_cached_acl(struct inode *inode, struct nfs4_cached_acl *acl)
@@ -6253,6 +6256,7 @@ static void nfs4_delegreturn_done(struct rpc_task *task, void *calldata)
/* Fallthrough */
case -NFS4ERR_BAD_STATEID:
case -NFS4ERR_STALE_STATEID:
+ case -ETIMEDOUT:
task->tk_status = 0;
break;
case -NFS4ERR_OLD_STATEID:
@@ -6343,7 +6347,7 @@ static int _nfs4_proc_delegreturn(struct inode *inode, const struct cred *cred,
.rpc_client = server->client,
.rpc_message = &msg,
.callback_ops = &nfs4_delegreturn_ops,
- .flags = RPC_TASK_ASYNC,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF | RPC_TASK_TIMEOUT,
};
int status = 0;
@@ -6926,7 +6930,7 @@ static int _nfs4_do_setlk(struct nfs4_state *state, int cmd, struct file_lock *f
.rpc_message = &msg,
.callback_ops = &nfs4_lock_ops,
.workqueue = nfsiod_workqueue,
- .flags = RPC_TASK_ASYNC,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
};
int ret;
@@ -9170,7 +9174,7 @@ nfs4_proc_layoutget(struct nfs4_layoutget *lgp, long *timeout)
.rpc_message = &msg,
.callback_ops = &nfs4_layoutget_call_ops,
.callback_data = lgp,
- .flags = RPC_TASK_ASYNC,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
};
struct pnfs_layout_segment *lseg = NULL;
struct nfs4_exception exception = {
@@ -9287,6 +9291,7 @@ static void nfs4_layoutreturn_release(void *calldata)
lrp->ld_private.ops->free(&lrp->ld_private);
pnfs_put_layout_hdr(lrp->args.layout);
nfs_iput_and_deactive(lrp->inode);
+ put_cred(lrp->cred);
kfree(calldata);
dprintk("<-- %s\n", __func__);
}
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index f7723d221945..ac93715c05a4 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -2524,6 +2524,21 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp)
}
return 0;
}
+
+static void nfs4_layoutreturn_any_run(struct nfs_client *clp)
+{
+ int iomode = 0;
+
+ if (test_and_clear_bit(NFS4CLNT_RECALL_ANY_LAYOUT_READ, &clp->cl_state))
+ iomode += IOMODE_READ;
+ if (test_and_clear_bit(NFS4CLNT_RECALL_ANY_LAYOUT_RW, &clp->cl_state))
+ iomode += IOMODE_RW;
+ /* Note: IOMODE_READ + IOMODE_RW == IOMODE_ANY */
+ if (iomode) {
+ pnfs_layout_return_unused_byclid(clp, iomode);
+ set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
+ }
+}
#else /* CONFIG_NFS_V4_1 */
static int nfs4_reset_session(struct nfs_client *clp) { return 0; }
@@ -2531,6 +2546,10 @@ static int nfs4_bind_conn_to_session(struct nfs_client *clp)
{
return 0;
}
+
+static void nfs4_layoutreturn_any_run(struct nfs_client *clp)
+{
+}
#endif /* CONFIG_NFS_V4_1 */
static void nfs4_state_manager(struct nfs_client *clp)
@@ -2635,12 +2654,13 @@ static void nfs4_state_manager(struct nfs_client *clp)
nfs4_end_drain_session(clp);
nfs4_clear_state_manager_bit(clp);
- if (!test_and_set_bit(NFS4CLNT_DELEGRETURN_RUNNING, &clp->cl_state)) {
+ if (!test_and_set_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state)) {
if (test_and_clear_bit(NFS4CLNT_DELEGRETURN, &clp->cl_state)) {
nfs_client_return_marked_delegations(clp);
set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
}
- clear_bit(NFS4CLNT_DELEGRETURN_RUNNING, &clp->cl_state);
+ nfs4_layoutreturn_any_run(clp);
+ clear_bit(NFS4CLNT_RECALL_RUNNING, &clp->cl_state);
}
/* Did we race with an attempt to give us more work? */
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 1e97e5e04cb4..543541173a3d 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -584,7 +584,9 @@ TRACE_DEFINE_ENUM(NFS4CLNT_MOVED);
TRACE_DEFINE_ENUM(NFS4CLNT_LEASE_MOVED);
TRACE_DEFINE_ENUM(NFS4CLNT_DELEGATION_EXPIRED);
TRACE_DEFINE_ENUM(NFS4CLNT_RUN_MANAGER);
-TRACE_DEFINE_ENUM(NFS4CLNT_DELEGRETURN_RUNNING);
+TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_RUNNING);
+TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_ANY_LAYOUT_READ);
+TRACE_DEFINE_ENUM(NFS4CLNT_RECALL_ANY_LAYOUT_RW);
#define show_nfs4_clp_state(state) \
__print_flags(state, "|", \
@@ -605,7 +607,9 @@ TRACE_DEFINE_ENUM(NFS4CLNT_DELEGRETURN_RUNNING);
{ NFS4CLNT_LEASE_MOVED, "LEASE_MOVED" }, \
{ NFS4CLNT_DELEGATION_EXPIRED, "DELEGATION_EXPIRED" }, \
{ NFS4CLNT_RUN_MANAGER, "RUN_MANAGER" }, \
- { NFS4CLNT_DELEGRETURN_RUNNING, "DELEGRETURN_RUNNING" })
+ { NFS4CLNT_RECALL_RUNNING, "RECALL_RUNNING" }, \
+ { NFS4CLNT_RECALL_ANY_LAYOUT_READ, "RECALL_ANY_LAYOUT_READ" }, \
+ { NFS4CLNT_RECALL_ANY_LAYOUT_RW, "RECALL_ANY_LAYOUT_RW" })
TRACE_EVENT(nfs4_state_mgr,
TP_PROTO(
diff --git a/fs/nfs/nfsroot.c b/fs/nfs/nfsroot.c
index effaa4247b91..8d3278805602 100644
--- a/fs/nfs/nfsroot.c
+++ b/fs/nfs/nfsroot.c
@@ -88,7 +88,7 @@
#define NFS_ROOT "/tftpboot/%s"
/* Default NFSROOT mount options. */
-#define NFS_DEF_OPTIONS "vers=2,udp,rsize=4096,wsize=4096"
+#define NFS_DEF_OPTIONS "vers=2,tcp,rsize=4096,wsize=4096"
/* Parameters passed from the kernel command line */
static char nfs_root_parms[NFS_MAXPATHLEN + 1] __initdata = "";
diff --git a/fs/nfs/nfstrace.h b/fs/nfs/nfstrace.h
index a9588d19a5ae..7e7a97ae21ed 100644
--- a/fs/nfs/nfstrace.h
+++ b/fs/nfs/nfstrace.h
@@ -181,6 +181,7 @@ DECLARE_EVENT_CLASS(nfs_inode_event_done,
int error \
), \
TP_ARGS(inode, error))
+DEFINE_NFS_INODE_EVENT(nfs_set_inode_stale);
DEFINE_NFS_INODE_EVENT(nfs_refresh_inode_enter);
DEFINE_NFS_INODE_EVENT_DONE(nfs_refresh_inode_exit);
DEFINE_NFS_INODE_EVENT(nfs_revalidate_inode_enter);
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 20b3717cd7ca..f61f96603df7 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -33,9 +33,7 @@ static const struct rpc_call_ops nfs_pgio_common_ops;
struct nfs_pgio_mirror *
nfs_pgio_current_mirror(struct nfs_pageio_descriptor *desc)
{
- return nfs_pgio_has_mirroring(desc) ?
- &desc->pg_mirrors[desc->pg_mirror_idx] :
- &desc->pg_mirrors[0];
+ return &desc->pg_mirrors[desc->pg_mirror_idx];
}
EXPORT_SYMBOL_GPL(nfs_pgio_current_mirror);
@@ -133,47 +131,166 @@ nfs_async_iocounter_wait(struct rpc_task *task, struct nfs_lock_context *l_ctx)
EXPORT_SYMBOL_GPL(nfs_async_iocounter_wait);
/*
- * nfs_page_group_lock - lock the head of the page group
- * @req - request in group that is to be locked
+ * nfs_page_lock_head_request - page lock the head of the page group
+ * @req: any member of the page group
+ */
+struct nfs_page *
+nfs_page_group_lock_head(struct nfs_page *req)
+{
+ struct nfs_page *head = req->wb_head;
+
+ while (!nfs_lock_request(head)) {
+ int ret = nfs_wait_on_request(head);
+ if (ret < 0)
+ return ERR_PTR(ret);
+ }
+ if (head != req)
+ kref_get(&head->wb_kref);
+ return head;
+}
+
+/*
+ * nfs_unroll_locks - unlock all newly locked reqs and wait on @req
+ * @head: head request of page group, must be holding head lock
+ * @req: request that couldn't lock and needs to wait on the req bit lock
*
- * this lock must be held when traversing or modifying the page
- * group list
+ * This is a helper function for nfs_lock_and_join_requests; it drops the
+ * locks already taken on earlier subrequests when locking fails partway.
+ */
+static void
+nfs_unroll_locks(struct nfs_page *head, struct nfs_page *req)
+{
+ struct nfs_page *tmp;
+
+ /* relinquish all the locks successfully grabbed this run */
+ for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) {
+ if (!kref_read(&tmp->wb_kref))
+ continue;
+ nfs_unlock_and_release_request(tmp);
+ }
+}
+
+/*
+ * nfs_page_group_lock_subreq - try to lock a subrequest
+ * @head: head request of page group
+ * @subreq: request to lock
*
- * return 0 on success, < 0 on error
+ * This is a helper function for nfs_lock_and_join_requests which
+ * must be called with the head request and page group both locked.
+ * On error, it returns with the page group unlocked.
*/
-int
-nfs_page_group_lock(struct nfs_page *req)
+static int
+nfs_page_group_lock_subreq(struct nfs_page *head, struct nfs_page *subreq)
{
- struct nfs_page *head = req->wb_head;
+ int ret;
+
+ if (!kref_get_unless_zero(&subreq->wb_kref))
+ return 0;
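+ /* Drop the group lock before waiting for the subrequest lock */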
+ while (!nfs_lock_request(subreq)) {
+ nfs_page_group_unlock(head);
+ ret = nfs_wait_on_request(subreq);
+ if (!ret)
+ ret = nfs_page_group_lock(head);
+ if (ret < 0) {
+ nfs_unroll_locks(head, subreq);
+ nfs_release_request(subreq);
+ return ret;
+ }
+ }
+ return 0;
+}
+
+/*
+ * nfs_page_group_lock_subrequests - try to lock the subrequests
+ * @head: head request of page group
+ *
+ * This is a helper function for nfs_lock_and_join_requests which
+ * must be called with the head request locked.
+ */
+int nfs_page_group_lock_subrequests(struct nfs_page *head)
+{
+ struct nfs_page *subreq;
+ int ret;
- WARN_ON_ONCE(head != head->wb_head);
+ ret = nfs_page_group_lock(head);
+ if (ret < 0)
+ return ret;
+ /* lock each request in the page group */
+ for (subreq = head->wb_this_page; subreq != head;
+ subreq = subreq->wb_this_page) {
+ ret = nfs_page_group_lock_subreq(head, subreq);
+ if (ret < 0)
+ return ret;
+ }
+ nfs_page_group_unlock(head);
+ return 0;
+}
- if (!test_and_set_bit(PG_HEADLOCK, &head->wb_flags))
+/*
+ * nfs_page_set_headlock - set the request PG_HEADLOCK
+ * @req: request that is to be locked
+ *
+ * this lock must be held when modifying req->wb_head
+ *
+ * return 0 on success, < 0 on error
+ */
+int
+nfs_page_set_headlock(struct nfs_page *req)
+{
+ if (!test_and_set_bit(PG_HEADLOCK, &req->wb_flags))
return 0;
- set_bit(PG_CONTENDED1, &head->wb_flags);
+ set_bit(PG_CONTENDED1, &req->wb_flags);
smp_mb__after_atomic();
- return wait_on_bit_lock(&head->wb_flags, PG_HEADLOCK,
+ return wait_on_bit_lock(&req->wb_flags, PG_HEADLOCK,
TASK_UNINTERRUPTIBLE);
}
/*
- * nfs_page_group_unlock - unlock the head of the page group
- * @req - request in group that is to be unlocked
+ * nfs_page_clear_headlock - clear the request PG_HEADLOCK
+ * @req: request that is to be locked
*/
void
-nfs_page_group_unlock(struct nfs_page *req)
+nfs_page_clear_headlock(struct nfs_page *req)
{
- struct nfs_page *head = req->wb_head;
-
- WARN_ON_ONCE(head != head->wb_head);
-
smp_mb__before_atomic();
- clear_bit(PG_HEADLOCK, &head->wb_flags);
+ clear_bit(PG_HEADLOCK, &req->wb_flags);
smp_mb__after_atomic();
- if (!test_bit(PG_CONTENDED1, &head->wb_flags))
+ if (!test_bit(PG_CONTENDED1, &req->wb_flags))
return;
- wake_up_bit(&head->wb_flags, PG_HEADLOCK);
+ wake_up_bit(&req->wb_flags, PG_HEADLOCK);
+}
+
+/*
+ * nfs_page_group_lock - lock the head of the page group
+ * @req: request in group that is to be locked
+ *
+ * this lock must be held when traversing or modifying the page
+ * group list
+ *
+ * return 0 on success, < 0 on error
+ */
+int
+nfs_page_group_lock(struct nfs_page *req)
+{
+ int ret;
+
+ ret = nfs_page_set_headlock(req);
+ if (ret || req->wb_head == req)
+ return ret;
+ return nfs_page_set_headlock(req->wb_head);
+}
+
+/*
+ * nfs_page_group_unlock - unlock the head of the page group
+ * @req: request in group that is to be unlocked
+ */
+void
+nfs_page_group_unlock(struct nfs_page *req)
+{
+ if (req != req->wb_head)
+ nfs_page_clear_headlock(req->wb_head);
+ nfs_page_clear_headlock(req);
}
/*
@@ -359,15 +476,23 @@ nfs_create_request(struct nfs_open_context *ctx, struct page *page,
}
static struct nfs_page *
-nfs_create_subreq(struct nfs_page *req, struct nfs_page *last,
- unsigned int pgbase, unsigned int offset,
+nfs_create_subreq(struct nfs_page *req,
+ unsigned int pgbase,
+ unsigned int offset,
unsigned int count)
{
+ struct nfs_page *last;
struct nfs_page *ret;
ret = __nfs_create_request(req->wb_lock_context, req->wb_page,
pgbase, offset, count);
if (!IS_ERR(ret)) {
+ /* find the last request */
+ for (last = req->wb_head;
+ last->wb_this_page != req->wb_head;
+ last = last->wb_this_page)
+ ;
+
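+ /* nfs_page_group_init() below links the new subrequest in after
+ * 'last', i.e. at the tail of the page group. */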
nfs_lock_request(ret);
ret->wb_index = req->wb_index;
nfs_page_group_init(ret, last);
@@ -627,9 +752,8 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
.callback_ops = call_ops,
.callback_data = hdr,
.workqueue = nfsiod_workqueue,
- .flags = RPC_TASK_ASYNC | flags,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF | flags,
};
- int ret = 0;
hdr->rw_ops->rw_initiate(hdr, &msg, rpc_ops, &task_setup_data, how);
@@ -641,18 +765,10 @@ int nfs_initiate_pgio(struct rpc_clnt *clnt, struct nfs_pgio_header *hdr,
(unsigned long long)hdr->args.offset);
task = rpc_run_task(&task_setup_data);
- if (IS_ERR(task)) {
- ret = PTR_ERR(task);
- goto out;
- }
- if (how & FLUSH_SYNC) {
- ret = rpc_wait_for_completion_task(task);
- if (ret == 0)
- ret = task->tk_status;
- }
+ if (IS_ERR(task))
+ return PTR_ERR(task);
rpc_put_task(task);
-out:
- return ret;
+ return 0;
}
EXPORT_SYMBOL_GPL(nfs_initiate_pgio);
@@ -886,15 +1002,6 @@ static void nfs_pageio_setup_mirroring(struct nfs_pageio_descriptor *pgio,
pgio->pg_mirror_count = mirror_count;
}
-/*
- * nfs_pageio_stop_mirroring - stop using mirroring (set mirror count to 1)
- */
-void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio)
-{
- pgio->pg_mirror_count = 1;
- pgio->pg_mirror_idx = 0;
-}
-
static void nfs_pageio_cleanup_mirroring(struct nfs_pageio_descriptor *pgio)
{
pgio->pg_mirror_count = 1;
@@ -911,7 +1018,7 @@ static bool nfs_match_lock_context(const struct nfs_lock_context *l1,
}
/**
- * nfs_can_coalesce_requests - test two requests for compatibility
+ * nfs_coalesce_size - test two requests for compatibility
* @prev: pointer to nfs_page
* @req: pointer to nfs_page
* @pgio: pointer to nfs_pagio_descriptor
@@ -920,41 +1027,36 @@ static bool nfs_match_lock_context(const struct nfs_lock_context *l1,
* page data area they describe is contiguous, and that their RPC
* credentials, NFSv4 open state, and lockowners are the same.
*
- * Return 'true' if this is the case, else return 'false'.
+ * Returns the size (in bytes) of the request that can be coalesced
*/
-static bool nfs_can_coalesce_requests(struct nfs_page *prev,
+static unsigned int nfs_coalesce_size(struct nfs_page *prev,
struct nfs_page *req,
struct nfs_pageio_descriptor *pgio)
{
- size_t size;
struct file_lock_context *flctx;
if (prev) {
if (!nfs_match_open_context(nfs_req_openctx(req), nfs_req_openctx(prev)))
- return false;
+ return 0;
flctx = d_inode(nfs_req_openctx(req)->dentry)->i_flctx;
if (flctx != NULL &&
!(list_empty_careful(&flctx->flc_posix) &&
list_empty_careful(&flctx->flc_flock)) &&
!nfs_match_lock_context(req->wb_lock_context,
prev->wb_lock_context))
- return false;
+ return 0;
if (req_offset(req) != req_offset(prev) + prev->wb_bytes)
- return false;
+ return 0;
if (req->wb_page == prev->wb_page) {
if (req->wb_pgbase != prev->wb_pgbase + prev->wb_bytes)
- return false;
+ return 0;
} else {
if (req->wb_pgbase != 0 ||
prev->wb_pgbase + prev->wb_bytes != PAGE_SIZE)
- return false;
+ return 0;
}
}
- size = pgio->pg_ops->pg_test(pgio, prev, req);
- WARN_ON_ONCE(size > req->wb_bytes);
- if (size && size < req->wb_bytes)
- req->wb_bytes = size;
- return size > 0;
+ return pgio->pg_ops->pg_test(pgio, prev, req);
}
/**
@@ -962,15 +1064,16 @@ static bool nfs_can_coalesce_requests(struct nfs_page *prev,
* @desc: destination io descriptor
* @req: request
*
- * Returns true if the request 'req' was successfully coalesced into the
- * existing list of pages 'desc'.
+ * If the request 'req' was successfully coalesced into the existing list
+ * of pages 'desc', it returns the size of req.
*/
-static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
- struct nfs_page *req)
+static unsigned int
+nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
+ struct nfs_page *req)
{
struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
-
struct nfs_page *prev = NULL;
+ unsigned int size;
if (mirror->pg_count != 0) {
prev = nfs_list_entry(mirror->pg_list.prev);
@@ -990,11 +1093,12 @@ static int nfs_pageio_do_add_request(struct nfs_pageio_descriptor *desc,
return 0;
}
- if (!nfs_can_coalesce_requests(prev, req, desc))
- return 0;
+ size = nfs_coalesce_size(prev, req, desc);
+ if (size < req->wb_bytes)
+ return size;
nfs_list_move_request(req, &mirror->pg_list);
mirror->pg_count += req->wb_bytes;
- return 1;
+ return req->wb_bytes;
}
/*
@@ -1034,7 +1138,8 @@ nfs_pageio_cleanup_request(struct nfs_pageio_descriptor *desc,
* @req: request
*
* This may split a request into subrequests which are all part of the
- * same page group.
+ * same page group. If so, it will submit @req as the last one, to ensure
+ * the pointer to @req is still valid in case of failure.
*
* Returns true if the request 'req' was successfully coalesced into the
* existing list of pages 'desc'.
@@ -1043,51 +1148,50 @@ static int __nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
struct nfs_page *req)
{
struct nfs_pgio_mirror *mirror = nfs_pgio_current_mirror(desc);
-
struct nfs_page *subreq;
- unsigned int bytes_left = 0;
- unsigned int offset, pgbase;
+ unsigned int size, subreq_size;
nfs_page_group_lock(req);
subreq = req;
- bytes_left = subreq->wb_bytes;
- offset = subreq->wb_offset;
- pgbase = subreq->wb_pgbase;
-
- do {
- if (!nfs_pageio_do_add_request(desc, subreq)) {
- /* make sure pg_test call(s) did nothing */
- WARN_ON_ONCE(subreq->wb_bytes != bytes_left);
- WARN_ON_ONCE(subreq->wb_offset != offset);
- WARN_ON_ONCE(subreq->wb_pgbase != pgbase);
-
+ subreq_size = subreq->wb_bytes;
+ for(;;) {
+ size = nfs_pageio_do_add_request(desc, subreq);
+ if (size == subreq_size) {
+ /* We successfully submitted a request */
+ if (subreq == req)
+ break;
+ req->wb_pgbase += size;
+ req->wb_bytes -= size;
+ req->wb_offset += size;
+ subreq_size = req->wb_bytes;
+ subreq = req;
+ continue;
+ }
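+ /* A subrequest we created should always be submitted in full;
+ * if not, clean it up and retry with the remainder of @req. */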
+ if (WARN_ON_ONCE(subreq != req)) {
+ nfs_page_group_unlock(req);
+ nfs_pageio_cleanup_request(desc, subreq);
+ subreq = req;
+ subreq_size = req->wb_bytes;
+ nfs_page_group_lock(req);
+ }
+ if (!size) {
+ /* Can't coalesce any more, so do I/O */
nfs_page_group_unlock(req);
desc->pg_moreio = 1;
nfs_pageio_doio(desc);
if (desc->pg_error < 0 || mirror->pg_recoalesce)
- goto out_cleanup_subreq;
+ return 0;
/* retry add_request for this subreq */
nfs_page_group_lock(req);
continue;
}
-
- /* check for buggy pg_test call(s) */
- WARN_ON_ONCE(subreq->wb_bytes + subreq->wb_pgbase > PAGE_SIZE);
- WARN_ON_ONCE(subreq->wb_bytes > bytes_left);
- WARN_ON_ONCE(subreq->wb_bytes == 0);
-
- bytes_left -= subreq->wb_bytes;
- offset += subreq->wb_bytes;
- pgbase += subreq->wb_bytes;
-
- if (bytes_left) {
- subreq = nfs_create_subreq(req, subreq, pgbase,
- offset, bytes_left);
- if (IS_ERR(subreq))
- goto err_ptr;
- }
- } while (bytes_left > 0);
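+ /* Only part of the request can be coalesced: split off a
+ * subrequest of that size and submit it first. */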
+ subreq = nfs_create_subreq(req, req->wb_pgbase,
+ req->wb_offset, size);
+ if (IS_ERR(subreq))
+ goto err_ptr;
+ subreq_size = size;
+ }
nfs_page_group_unlock(req);
return 1;
@@ -1095,10 +1199,6 @@ err_ptr:
desc->pg_error = PTR_ERR(subreq);
nfs_page_group_unlock(req);
return 0;
-out_cleanup_subreq:
- if (req != subreq)
- nfs_pageio_cleanup_request(desc, subreq);
- return 0;
}
static int nfs_do_recoalesce(struct nfs_pageio_descriptor *desc)
@@ -1167,7 +1267,7 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
{
u32 midx;
unsigned int pgbase, offset, bytes;
- struct nfs_page *dupreq, *lastreq;
+ struct nfs_page *dupreq;
pgbase = req->wb_pgbase;
offset = req->wb_offset;
@@ -1177,38 +1277,32 @@ int nfs_pageio_add_request(struct nfs_pageio_descriptor *desc,
if (desc->pg_error < 0)
goto out_failed;
- for (midx = 0; midx < desc->pg_mirror_count; midx++) {
- if (midx) {
- nfs_page_group_lock(req);
+ /* Create the mirror instances first, and fire them off */
+ for (midx = 1; midx < desc->pg_mirror_count; midx++) {
+ nfs_page_group_lock(req);
- /* find the last request */
- for (lastreq = req->wb_head;
- lastreq->wb_this_page != req->wb_head;
- lastreq = lastreq->wb_this_page)
- ;
+ dupreq = nfs_create_subreq(req,
+ pgbase, offset, bytes);
- dupreq = nfs_create_subreq(req, lastreq,
- pgbase, offset, bytes);
-
- nfs_page_group_unlock(req);
- if (IS_ERR(dupreq)) {
- desc->pg_error = PTR_ERR(dupreq);
- goto out_failed;
- }
- } else
- dupreq = req;
+ nfs_page_group_unlock(req);
+ if (IS_ERR(dupreq)) {
+ desc->pg_error = PTR_ERR(dupreq);
+ goto out_failed;
+ }
- if (nfs_pgio_has_mirroring(desc))
- desc->pg_mirror_idx = midx;
+ desc->pg_mirror_idx = midx;
if (!nfs_pageio_add_request_mirror(desc, dupreq))
goto out_cleanup_subreq;
}
+ desc->pg_mirror_idx = 0;
+ if (!nfs_pageio_add_request_mirror(desc, req))
+ goto out_failed;
+
return 1;
out_cleanup_subreq:
- if (req != dupreq)
- nfs_pageio_cleanup_request(desc, dupreq);
+ nfs_pageio_cleanup_request(desc, dupreq);
out_failed:
nfs_pageio_error_cleanup(desc);
return 0;
@@ -1226,8 +1320,7 @@ static void nfs_pageio_complete_mirror(struct nfs_pageio_descriptor *desc,
struct nfs_pgio_mirror *mirror = &desc->pg_mirrors[mirror_idx];
u32 restore_idx = desc->pg_mirror_idx;
- if (nfs_pgio_has_mirroring(desc))
- desc->pg_mirror_idx = mirror_idx;
+ desc->pg_mirror_idx = mirror_idx;
for (;;) {
nfs_pageio_doio(desc);
if (desc->pg_error < 0 || !mirror->pg_recoalesce)
@@ -1320,6 +1413,14 @@ void nfs_pageio_cond_complete(struct nfs_pageio_descriptor *desc, pgoff_t index)
}
}
+/*
+ * nfs_pageio_stop_mirroring - stop using mirroring (set mirror count to 1)
+ */
+void nfs_pageio_stop_mirroring(struct nfs_pageio_descriptor *pgio)
+{
+ nfs_pageio_complete(pgio);
+}
+
int __init nfs_init_nfspagecache(void)
{
nfs_page_cachep = kmem_cache_create("nfs_page",
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 542ea8dfd1bc..f2dc35c22964 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -268,11 +268,11 @@ pnfs_free_layout_hdr(struct pnfs_layout_hdr *lo)
struct nfs_server *server = NFS_SERVER(lo->plh_inode);
struct pnfs_layoutdriver_type *ld = server->pnfs_curr_ld;
- if (!list_empty(&lo->plh_layouts)) {
+ if (test_and_clear_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) {
struct nfs_client *clp = server->nfs_client;
spin_lock(&clp->cl_lock);
- list_del_init(&lo->plh_layouts);
+ list_del_rcu(&lo->plh_layouts);
spin_unlock(&clp->cl_lock);
}
put_cred(lo->plh_lc_cred);
@@ -309,6 +309,16 @@ pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo)
}
}
+static struct inode *
+pnfs_grab_inode_layout_hdr(struct pnfs_layout_hdr *lo)
+{
+ struct inode *inode = igrab(lo->plh_inode);
+ if (inode)
+ return inode;
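+ /* igrab() failed, so the inode is being freed; flag the layout
+ * so that it is skipped. */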
+ set_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags);
+ return NULL;
+}
+
static void
pnfs_set_plh_return_info(struct pnfs_layout_hdr *lo, enum pnfs_iomode iomode,
u32 seq)
@@ -496,6 +506,7 @@ pnfs_init_lseg(struct pnfs_layout_hdr *lo, struct pnfs_layout_segment *lseg,
{
INIT_LIST_HEAD(&lseg->pls_list);
INIT_LIST_HEAD(&lseg->pls_lc_list);
+ INIT_LIST_HEAD(&lseg->pls_commits);
refcount_set(&lseg->pls_refcount, 1);
set_bit(NFS_LSEG_VALID, &lseg->pls_flags);
lseg->pls_layout = lo;
@@ -782,9 +793,10 @@ pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
/* If the sb is being destroyed, just bail */
if (!nfs_sb_active(server->super))
break;
- inode = igrab(lo->plh_inode);
+ inode = pnfs_grab_inode_layout_hdr(lo);
if (inode != NULL) {
- list_del_init(&lo->plh_layouts);
+ if (test_and_clear_bit(NFS_LAYOUT_HASHED, &lo->plh_flags))
+ list_del_rcu(&lo->plh_layouts);
if (pnfs_layout_add_bulk_destroy_list(inode,
layout_list))
continue;
@@ -794,7 +806,6 @@ pnfs_layout_bulk_destroy_byserver_locked(struct nfs_client *clp,
} else {
rcu_read_unlock();
spin_unlock(&clp->cl_lock);
- set_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags);
}
nfs_sb_deactive(server->super);
spin_lock(&clp->cl_lock);
@@ -903,10 +914,21 @@ pnfs_destroy_all_layouts(struct nfs_client *clp)
pnfs_destroy_layouts_byclid(clp, false);
}
+static void
+pnfs_set_layout_cred(struct pnfs_layout_hdr *lo, const struct cred *cred)
+{
+ const struct cred *old;
+
+ if (cred && cred_fscmp(lo->plh_lc_cred, cred) != 0) {
+ old = xchg(&lo->plh_lc_cred, get_cred(cred));
+ put_cred(old);
+ }
+}
+
/* update lo->plh_stateid with new if is more recent */
void
pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
- bool update_barrier)
+ const struct cred *cred, bool update_barrier)
{
u32 oldseq, newseq, new_barrier = 0;
@@ -914,6 +936,7 @@ pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo, const nfs4_stateid *new,
newseq = be32_to_cpu(new->seqid);
if (!pnfs_layout_is_valid(lo)) {
+ pnfs_set_layout_cred(lo, cred);
nfs4_stateid_copy(&lo->plh_stateid, new);
lo->plh_barrier = newseq;
pnfs_clear_layoutreturn_info(lo);
@@ -1061,7 +1084,7 @@ pnfs_alloc_init_layoutget_args(struct inode *ino,
lgp->args.ctx = get_nfs_open_context(ctx);
nfs4_stateid_copy(&lgp->args.stateid, stateid);
lgp->gfp_flags = gfp_flags;
- lgp->cred = get_cred(ctx->cred);
+ lgp->cred = ctx->cred;
return lgp;
}
@@ -1072,7 +1095,6 @@ void pnfs_layoutget_free(struct nfs4_layoutget *lgp)
nfs4_free_pages(lgp->args.layout.pages, max_pages);
if (lgp->args.inode)
pnfs_put_layout_hdr(NFS_I(lgp->args.inode)->layout);
- put_cred(lgp->cred);
put_nfs_open_context(lgp->args.ctx);
kfree(lgp);
}
@@ -1109,7 +1131,7 @@ void pnfs_layoutreturn_free_lsegs(struct pnfs_layout_hdr *lo,
pnfs_mark_matching_lsegs_invalid(lo, &freeme, range, seq);
pnfs_free_returned_lsegs(lo, &freeme, range, seq);
- pnfs_set_layout_stateid(lo, stateid, true);
+ pnfs_set_layout_stateid(lo, stateid, NULL, true);
} else
pnfs_mark_layout_stateid_invalid(lo, &freeme);
out_unlock:
@@ -1122,6 +1144,7 @@ out_unlock:
static bool
pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
nfs4_stateid *stateid,
+ const struct cred **cred,
enum pnfs_iomode *iomode)
{
/* Serialise LAYOUTGET/LAYOUTRETURN */
@@ -1132,18 +1155,17 @@ pnfs_prepare_layoutreturn(struct pnfs_layout_hdr *lo,
set_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
pnfs_get_layout_hdr(lo);
if (test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags)) {
- if (stateid != NULL) {
- nfs4_stateid_copy(stateid, &lo->plh_stateid);
- if (lo->plh_return_seq != 0)
- stateid->seqid = cpu_to_be32(lo->plh_return_seq);
- }
+ nfs4_stateid_copy(stateid, &lo->plh_stateid);
+ *cred = get_cred(lo->plh_lc_cred);
+ if (lo->plh_return_seq != 0)
+ stateid->seqid = cpu_to_be32(lo->plh_return_seq);
if (iomode != NULL)
*iomode = lo->plh_return_iomode;
pnfs_clear_layoutreturn_info(lo);
return true;
}
- if (stateid != NULL)
- nfs4_stateid_copy(stateid, &lo->plh_stateid);
+ nfs4_stateid_copy(stateid, &lo->plh_stateid);
+ *cred = get_cred(lo->plh_lc_cred);
if (iomode != NULL)
*iomode = IOMODE_ANY;
return true;
@@ -1167,20 +1189,26 @@ pnfs_init_layoutreturn_args(struct nfs4_layoutreturn_args *args,
}
static int
-pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
- enum pnfs_iomode iomode, bool sync)
+pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo,
+ const nfs4_stateid *stateid,
+ const struct cred **pcred,
+ enum pnfs_iomode iomode,
+ bool sync)
{
struct inode *ino = lo->plh_inode;
struct pnfs_layoutdriver_type *ld = NFS_SERVER(ino)->pnfs_curr_ld;
struct nfs4_layoutreturn *lrp;
+ const struct cred *cred = *pcred;
int status = 0;
+ *pcred = NULL;
lrp = kzalloc(sizeof(*lrp), GFP_NOFS);
if (unlikely(lrp == NULL)) {
status = -ENOMEM;
spin_lock(&ino->i_lock);
pnfs_clear_layoutreturn_waitbit(lo);
spin_unlock(&ino->i_lock);
+ put_cred(cred);
pnfs_put_layout_hdr(lo);
goto out;
}
@@ -1188,7 +1216,7 @@ pnfs_send_layoutreturn(struct pnfs_layout_hdr *lo, const nfs4_stateid *stateid,
pnfs_init_layoutreturn_args(&lrp->args, lo, stateid, iomode);
lrp->args.ld_private = &lrp->ld_private;
lrp->clp = NFS_SERVER(ino)->nfs_client;
- lrp->cred = lo->plh_lc_cred;
+ lrp->cred = cred;
if (ld->prepare_layoutreturn)
ld->prepare_layoutreturn(&lrp->args);
@@ -1233,15 +1261,16 @@ static void pnfs_layoutreturn_before_put_layout_hdr(struct pnfs_layout_hdr *lo)
return;
spin_lock(&inode->i_lock);
if (pnfs_layout_need_return(lo)) {
+ const struct cred *cred;
nfs4_stateid stateid;
enum pnfs_iomode iomode;
bool send;
- send = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
+ send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode);
spin_unlock(&inode->i_lock);
if (send) {
/* Send an async layoutreturn so we dont deadlock */
- pnfs_send_layoutreturn(lo, &stateid, iomode, false);
+ pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false);
}
} else
spin_unlock(&inode->i_lock);
@@ -1261,6 +1290,7 @@ _pnfs_return_layout(struct inode *ino)
struct pnfs_layout_hdr *lo = NULL;
struct nfs_inode *nfsi = NFS_I(ino);
LIST_HEAD(tmp_list);
+ const struct cred *cred;
nfs4_stateid stateid;
int status = 0;
bool send, valid_layout;
@@ -1305,10 +1335,10 @@ _pnfs_return_layout(struct inode *ino)
goto out_put_layout_hdr;
}
- send = pnfs_prepare_layoutreturn(lo, &stateid, NULL);
+ send = pnfs_prepare_layoutreturn(lo, &stateid, &cred, NULL);
spin_unlock(&ino->i_lock);
if (send)
- status = pnfs_send_layoutreturn(lo, &stateid, IOMODE_ANY, true);
+ status = pnfs_send_layoutreturn(lo, &stateid, &cred, IOMODE_ANY, true);
out_put_layout_hdr:
pnfs_free_lseg_list(&tmp_list);
pnfs_put_layout_hdr(lo);
@@ -1354,6 +1384,7 @@ bool pnfs_roc(struct inode *ino,
struct nfs4_state *state;
struct pnfs_layout_hdr *lo;
struct pnfs_layout_segment *lseg, *next;
+ const struct cred *lc_cred;
nfs4_stateid stateid;
enum pnfs_iomode iomode = 0;
bool layoutreturn = false, roc = false;
@@ -1423,16 +1454,20 @@ retry:
* 2. we don't send layoutreturn
*/
/* lo ref dropped in pnfs_roc_release() */
- layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
+ layoutreturn = pnfs_prepare_layoutreturn(lo, &stateid, &lc_cred, &iomode);
/* If the creds don't match, we can't compound the layoutreturn */
- if (!layoutreturn || cred_fscmp(cred, lo->plh_lc_cred) != 0)
+ if (!layoutreturn)
goto out_noroc;
+ if (cred_fscmp(cred, lc_cred) != 0)
+ goto out_noroc_put_cred;
roc = layoutreturn;
pnfs_init_layoutreturn_args(args, lo, &stateid, iomode);
res->lrs_present = 0;
layoutreturn = false;
+out_noroc_put_cred:
+ put_cred(lc_cred);
out_noroc:
spin_unlock(&ino->i_lock);
rcu_read_unlock();
@@ -1445,7 +1480,7 @@ out_noroc:
return true;
}
if (layoutreturn)
- pnfs_send_layoutreturn(lo, &stateid, iomode, true);
+ pnfs_send_layoutreturn(lo, &stateid, &lc_cred, iomode, true);
pnfs_put_layout_hdr(lo);
return false;
}
@@ -1859,15 +1894,14 @@ static void pnfs_clear_first_layoutget(struct pnfs_layout_hdr *lo)
static void _add_to_server_list(struct pnfs_layout_hdr *lo,
struct nfs_server *server)
{
- if (list_empty(&lo->plh_layouts)) {
+ if (!test_and_set_bit(NFS_LAYOUT_HASHED, &lo->plh_flags)) {
struct nfs_client *clp = server->nfs_client;
/* The lo must be on the clp list if there is any
* chance of a CB_LAYOUTRECALL(FILE) coming in.
*/
spin_lock(&clp->cl_lock);
- if (list_empty(&lo->plh_layouts))
- list_add_tail(&lo->plh_layouts, &server->layouts);
+ list_add_tail_rcu(&lo->plh_layouts, &server->layouts);
spin_unlock(&clp->cl_lock);
}
}
@@ -2323,14 +2357,14 @@ pnfs_layout_process(struct nfs4_layoutget *lgp)
if (!pnfs_layout_is_valid(lo)) {
/* We have a completely new layout */
- pnfs_set_layout_stateid(lo, &res->stateid, true);
+ pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, true);
} else if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
/* existing state ID, make sure the sequence number matches. */
if (pnfs_layout_stateid_blocked(lo, &res->stateid)) {
dprintk("%s forget reply due to sequence\n", __func__);
goto out_forget;
}
- pnfs_set_layout_stateid(lo, &res->stateid, false);
+ pnfs_set_layout_stateid(lo, &res->stateid, lgp->cred, false);
} else {
/*
* We got an entirely new state ID. Mark all segments for the
@@ -2423,43 +2457,159 @@ pnfs_mark_matching_lsegs_return(struct pnfs_layout_hdr *lo,
return -ENOENT;
}
-void pnfs_error_mark_layout_for_return(struct inode *inode,
- struct pnfs_layout_segment *lseg)
+static void
+pnfs_mark_layout_for_return(struct inode *inode,
+ const struct pnfs_layout_range *range)
{
- struct pnfs_layout_hdr *lo = NFS_I(inode)->layout;
- struct pnfs_layout_range range = {
- .iomode = lseg->pls_range.iomode,
- .offset = 0,
- .length = NFS4_MAX_UINT64,
- };
+ struct pnfs_layout_hdr *lo;
bool return_now = false;
spin_lock(&inode->i_lock);
+ lo = NFS_I(inode)->layout;
if (!pnfs_layout_is_valid(lo)) {
spin_unlock(&inode->i_lock);
return;
}
- pnfs_set_plh_return_info(lo, range.iomode, 0);
+ pnfs_set_plh_return_info(lo, range->iomode, 0);
/*
* mark all matching lsegs so that we are sure to have no live
* segments at hand when sending layoutreturn. See pnfs_put_lseg()
* for how it works.
*/
- if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, &range, 0) != -EBUSY) {
+ if (pnfs_mark_matching_lsegs_return(lo, &lo->plh_return_segs, range, 0) != -EBUSY) {
+ const struct cred *cred;
nfs4_stateid stateid;
enum pnfs_iomode iomode;
- return_now = pnfs_prepare_layoutreturn(lo, &stateid, &iomode);
+ return_now = pnfs_prepare_layoutreturn(lo, &stateid, &cred, &iomode);
spin_unlock(&inode->i_lock);
if (return_now)
- pnfs_send_layoutreturn(lo, &stateid, iomode, false);
+ pnfs_send_layoutreturn(lo, &stateid, &cred, iomode, false);
} else {
spin_unlock(&inode->i_lock);
nfs_commit_inode(inode, 0);
}
}
+
+void pnfs_error_mark_layout_for_return(struct inode *inode,
+ struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_layout_range range = {
+ .iomode = lseg->pls_range.iomode,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
+
+ pnfs_mark_layout_for_return(inode, &range);
+}
EXPORT_SYMBOL_GPL(pnfs_error_mark_layout_for_return);
+static bool
+pnfs_layout_can_be_returned(struct pnfs_layout_hdr *lo)
+{
+ return pnfs_layout_is_valid(lo) &&
+ !test_bit(NFS_LAYOUT_INODE_FREEING, &lo->plh_flags) &&
+ !test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags);
+}
+
+static struct pnfs_layout_segment *
+pnfs_find_first_lseg(struct pnfs_layout_hdr *lo,
+ const struct pnfs_layout_range *range,
+ enum pnfs_iomode iomode)
+{
+ struct pnfs_layout_segment *lseg;
+
+ list_for_each_entry(lseg, &lo->plh_segs, pls_list) {
+ if (!test_bit(NFS_LSEG_VALID, &lseg->pls_flags))
+ continue;
+ if (test_bit(NFS_LSEG_LAYOUTRETURN, &lseg->pls_flags))
+ continue;
+ if (lseg->pls_range.iomode != iomode && iomode != IOMODE_ANY)
+ continue;
+ if (pnfs_lseg_range_intersecting(&lseg->pls_range, range))
+ return lseg;
+ }
+ return NULL;
+}
+
+/* Find open file states whose mode matches that of the range */
+static bool
+pnfs_should_return_unused_layout(struct pnfs_layout_hdr *lo,
+ const struct pnfs_layout_range *range)
+{
+ struct list_head *head;
+ struct nfs_open_context *ctx;
+ fmode_t mode = 0;
+
+ if (!pnfs_layout_can_be_returned(lo) ||
+ !pnfs_find_first_lseg(lo, range, range->iomode))
+ return false;
+
+ head = &NFS_I(lo->plh_inode)->open_files;
+ list_for_each_entry_rcu(ctx, head, list) {
+ if (ctx->state)
+ mode |= ctx->state->state & (FMODE_READ|FMODE_WRITE);
+ }
+
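+ /* Return the layout only if no open file still needs the I/O mode
+ * it serves: writers do not pin a READ layout, and readers do not
+ * pin an RW layout when a separate READ layout exists. */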
+ switch (range->iomode) {
+ default:
+ break;
+ case IOMODE_READ:
+ mode &= ~FMODE_WRITE;
+ break;
+ case IOMODE_RW:
+ if (pnfs_find_first_lseg(lo, range, IOMODE_READ))
+ mode &= ~FMODE_READ;
+ }
+ return mode == 0;
+}
+
+static int
+pnfs_layout_return_unused_byserver(struct nfs_server *server, void *data)
+{
+ const struct pnfs_layout_range *range = data;
+ struct pnfs_layout_hdr *lo;
+ struct inode *inode;
+restart:
+ rcu_read_lock();
+ list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) {
+ if (!pnfs_layout_can_be_returned(lo) ||
+ test_bit(NFS_LAYOUT_RETURN_REQUESTED, &lo->plh_flags))
+ continue;
+ inode = lo->plh_inode;
+ spin_lock(&inode->i_lock);
+ if (!pnfs_should_return_unused_layout(lo, range)) {
+ spin_unlock(&inode->i_lock);
+ continue;
+ }
+ spin_unlock(&inode->i_lock);
+ inode = pnfs_grab_inode_layout_hdr(lo);
+ if (!inode)
+ continue;
+ rcu_read_unlock();
+ pnfs_mark_layout_for_return(inode, range);
+ iput(inode);
+ cond_resched();
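+ /* The RCU read lock was dropped while returning the layout,
+ * so restart the walk of the server's layout list. */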
+ goto restart;
+ }
+ rcu_read_unlock();
+ return 0;
+}
+
+void
+pnfs_layout_return_unused_byclid(struct nfs_client *clp,
+ enum pnfs_iomode iomode)
+{
+ struct pnfs_layout_range range = {
+ .iomode = iomode,
+ .offset = 0,
+ .length = NFS4_MAX_UINT64,
+ };
+
+ nfs_client_for_each_server(clp, pnfs_layout_return_unused_byserver,
+ &range);
+}
+
void
pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio)
{
@@ -2475,7 +2625,7 @@ EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_layout);
* Check for any intersection between the request and the pgio->pg_lseg,
* and if none, put this pgio->pg_lseg away.
*/
-static void
+void
pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
{
if (pgio->pg_lseg && !pnfs_lseg_request_intersecting(pgio->pg_lseg, req)) {
@@ -2483,6 +2633,7 @@ pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page
pgio->pg_lseg = NULL;
}
}
+EXPORT_SYMBOL_GPL(pnfs_generic_pg_check_range);
void
pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *pgio, struct nfs_page *req)
@@ -3000,10 +3151,10 @@ pnfs_layoutcommit_inode(struct inode *inode, bool sync)
end_pos = nfsi->layout->plh_lwb;
nfs4_stateid_copy(&data->args.stateid, &nfsi->layout->plh_stateid);
+ data->cred = get_cred(nfsi->layout->plh_lc_cred);
spin_unlock(&inode->i_lock);
data->args.inode = inode;
- data->cred = get_cred(nfsi->layout->plh_lc_cred);
nfs_fattr_init(&data->fattr);
data->args.bitmask = NFS_SERVER(inode)->cache_consistency_bitmask;
data->res.fattr = &data->fattr;
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 0fafdadc9c8d..8e0ada581b92 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -66,6 +66,7 @@ struct nfs4_pnfs_ds {
struct pnfs_layout_segment {
struct list_head pls_list;
struct list_head pls_lc_list;
+ struct list_head pls_commits;
struct pnfs_layout_range pls_range;
refcount_t pls_refcount;
u32 pls_seq;
@@ -105,6 +106,7 @@ enum {
NFS_LAYOUT_INVALID_STID, /* layout stateid id is invalid */
NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */
NFS_LAYOUT_INODE_FREEING, /* The inode is being freed */
+ NFS_LAYOUT_HASHED, /* The layout is hashed on the server list */
};
enum layoutdriver_policy_flags {
@@ -148,22 +150,6 @@ struct pnfs_layoutdriver_type {
const struct nfs_pageio_ops *pg_write_ops;
struct pnfs_ds_commit_info *(*get_ds_info) (struct inode *inode);
- void (*mark_request_commit) (struct nfs_page *req,
- struct pnfs_layout_segment *lseg,
- struct nfs_commit_info *cinfo,
- u32 ds_commit_idx);
- void (*clear_request_commit) (struct nfs_page *req,
- struct nfs_commit_info *cinfo);
- int (*scan_commit_lists) (struct nfs_commit_info *cinfo,
- int max);
- void (*recover_commit_reqs) (struct list_head *list,
- struct nfs_commit_info *cinfo);
- struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo,
- struct page *page);
- int (*commit_pagelist)(struct inode *inode,
- struct list_head *mds_pages,
- int how,
- struct nfs_commit_info *cinfo);
int (*sync)(struct inode *inode, bool datasync);
@@ -186,6 +172,29 @@ struct pnfs_layoutdriver_type {
int (*prepare_layoutstats) (struct nfs42_layoutstat_args *args);
};
+struct pnfs_commit_ops {
+ void (*setup_ds_info)(struct pnfs_ds_commit_info *,
+ struct pnfs_layout_segment *);
+ void (*release_ds_info)(struct pnfs_ds_commit_info *,
+ struct inode *inode);
+ int (*commit_pagelist)(struct inode *inode,
+ struct list_head *mds_pages,
+ int how,
+ struct nfs_commit_info *cinfo);
+ void (*mark_request_commit) (struct nfs_page *req,
+ struct pnfs_layout_segment *lseg,
+ struct nfs_commit_info *cinfo,
+ u32 ds_commit_idx);
+ void (*clear_request_commit) (struct nfs_page *req,
+ struct nfs_commit_info *cinfo);
+ int (*scan_commit_lists) (struct nfs_commit_info *cinfo,
+ int max);
+ void (*recover_commit_reqs) (struct list_head *list,
+ struct nfs_commit_info *cinfo);
+ struct nfs_page * (*search_commit_reqs)(struct nfs_commit_info *cinfo,
+ struct page *page);
+};
+
struct pnfs_layout_hdr {
refcount_t plh_refcount;
atomic_t plh_outstanding; /* number of RPCs out */
@@ -203,6 +212,7 @@ struct pnfs_layout_hdr {
loff_t plh_lwb; /* last write byte for layoutcommit */
const struct cred *plh_lc_cred; /* layoutcommit cred */
struct inode *plh_inode;
+ struct rcu_head plh_rcu;
};
struct pnfs_device {
@@ -242,6 +252,7 @@ void pnfs_put_lseg(struct pnfs_layout_segment *lseg);
void set_pnfs_layoutdriver(struct nfs_server *, const struct nfs_fh *, struct nfs_fsinfo *);
void unset_pnfs_layoutdriver(struct nfs_server *);
void pnfs_generic_pg_check_layout(struct nfs_pageio_descriptor *pgio);
+void pnfs_generic_pg_check_range(struct nfs_pageio_descriptor *pgio, struct nfs_page *req);
void pnfs_generic_pg_init_read(struct nfs_pageio_descriptor *, struct nfs_page *);
int pnfs_generic_pg_readpages(struct nfs_pageio_descriptor *desc);
void pnfs_generic_pg_init_write(struct nfs_pageio_descriptor *pgio,
@@ -267,6 +278,7 @@ bool nfs4_layout_refresh_old_stateid(nfs4_stateid *dst,
void pnfs_put_layout_hdr(struct pnfs_layout_hdr *lo);
void pnfs_set_layout_stateid(struct pnfs_layout_hdr *lo,
const nfs4_stateid *new,
+ const struct cred *cred,
bool update_barrier);
int pnfs_mark_matching_lsegs_invalid(struct pnfs_layout_hdr *lo,
struct list_head *tmp_list,
@@ -326,6 +338,9 @@ int pnfs_write_done_resend_to_mds(struct nfs_pgio_header *);
struct nfs4_threshold *pnfs_mdsthreshold_alloc(void);
void pnfs_error_mark_layout_for_return(struct inode *inode,
struct pnfs_layout_segment *lseg);
+void pnfs_layout_return_unused_byclid(struct nfs_client *clp,
+ enum pnfs_iomode iomode);
+
/* nfs4_deviceid_flags */
enum {
NFS_DEVICEID_INVALID = 0, /* set when MDS clientid recalled */
@@ -360,6 +375,16 @@ bool nfs4_test_deviceid_unavailable(struct nfs4_deviceid_node *node);
void nfs4_deviceid_purge_client(const struct nfs_client *);
/* pnfs_nfs.c */
+struct pnfs_commit_array *pnfs_alloc_commit_array(size_t n, gfp_t gfp_flags);
+void pnfs_free_commit_array(struct pnfs_commit_array *p);
+struct pnfs_commit_array *pnfs_add_commit_array(struct pnfs_ds_commit_info *,
+ struct pnfs_commit_array *,
+ struct pnfs_layout_segment *);
+
+void pnfs_generic_ds_cinfo_release_lseg(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_layout_segment *lseg);
+void pnfs_generic_ds_cinfo_destroy(struct pnfs_ds_commit_info *fl_cinfo);
+
void pnfs_generic_clear_request_commit(struct nfs_page *req,
struct nfs_commit_info *cinfo);
void pnfs_generic_commit_release(void *calldata);
@@ -367,6 +392,8 @@ void pnfs_generic_prepare_to_resend_writes(struct nfs_commit_data *data);
void pnfs_generic_rw_release(void *data);
void pnfs_generic_recover_commit_reqs(struct list_head *dst,
struct nfs_commit_info *cinfo);
+struct nfs_page *pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo,
+ struct page *page);
int pnfs_generic_commit_pagelist(struct inode *inode,
struct list_head *mds_pages,
int how,
@@ -438,9 +465,11 @@ static inline int
pnfs_commit_list(struct inode *inode, struct list_head *mds_pages, int how,
struct nfs_commit_info *cinfo)
{
- if (cinfo->ds == NULL || cinfo->ds->ncommitting == 0)
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+
+ if (fl_cinfo == NULL || fl_cinfo->ncommitting == 0)
return PNFS_NOT_ATTEMPTED;
- return NFS_SERVER(inode)->pnfs_curr_ld->commit_pagelist(inode, mds_pages, how, cinfo);
+ return fl_cinfo->ops->commit_pagelist(inode, mds_pages, how, cinfo);
}
static inline struct pnfs_ds_commit_info *
@@ -454,6 +483,28 @@ pnfs_get_ds_info(struct inode *inode)
}
static inline void
+pnfs_init_ds_commit_info_ops(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode)
+{
+ struct pnfs_ds_commit_info *inode_cinfo = pnfs_get_ds_info(inode);
+ if (inode_cinfo != NULL)
+ fl_cinfo->ops = inode_cinfo->ops;
+}
+
+static inline void
+pnfs_init_ds_commit_info(struct pnfs_ds_commit_info *fl_cinfo)
+{
+ INIT_LIST_HEAD(&fl_cinfo->commits);
+ fl_cinfo->ops = NULL;
+}
+
+static inline void
+pnfs_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode)
+{
+ if (fl_cinfo->ops != NULL && fl_cinfo->ops->release_ds_info != NULL)
+ fl_cinfo->ops->release_ds_info(fl_cinfo, inode);
+}
+
+static inline void
pnfs_generic_mark_devid_invalid(struct nfs4_deviceid_node *node)
{
set_bit(NFS_DEVICEID_INVALID, &node->flags);
@@ -463,24 +514,22 @@ static inline bool
pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
struct nfs_commit_info *cinfo, u32 ds_commit_idx)
{
- struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
- struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
- if (lseg == NULL || ld->mark_request_commit == NULL)
+ if (!lseg || !fl_cinfo->ops->mark_request_commit)
return false;
- ld->mark_request_commit(req, lseg, cinfo, ds_commit_idx);
+ fl_cinfo->ops->mark_request_commit(req, lseg, cinfo, ds_commit_idx);
return true;
}
static inline bool
pnfs_clear_request_commit(struct nfs_page *req, struct nfs_commit_info *cinfo)
{
- struct inode *inode = d_inode(nfs_req_openctx(req)->dentry);
- struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
- if (ld == NULL || ld->clear_request_commit == NULL)
+ if (!fl_cinfo || !fl_cinfo->ops || !fl_cinfo->ops->clear_request_commit)
return false;
- ld->clear_request_commit(req, cinfo);
+ fl_cinfo->ops->clear_request_commit(req, cinfo);
return true;
}
@@ -488,21 +537,31 @@ static inline int
pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
int max)
{
- if (cinfo->ds == NULL || cinfo->ds->nwritten == 0)
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+
+ if (!fl_cinfo || fl_cinfo->nwritten == 0)
return 0;
- else
- return NFS_SERVER(inode)->pnfs_curr_ld->scan_commit_lists(cinfo, max);
+ return fl_cinfo->ops->scan_commit_lists(cinfo, max);
+}
+
+static inline void
+pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo)
+{
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+
+ if (fl_cinfo && fl_cinfo->nwritten != 0)
+ fl_cinfo->ops->recover_commit_reqs(head, cinfo);
}
static inline struct nfs_page *
pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
struct page *page)
{
- struct pnfs_layoutdriver_type *ld = NFS_SERVER(inode)->pnfs_curr_ld;
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
- if (ld == NULL || ld->search_commit_reqs == NULL)
+ if (!fl_cinfo->ops || !fl_cinfo->ops->search_commit_reqs)
return NULL;
- return ld->search_commit_reqs(cinfo, page);
+ return fl_cinfo->ops->search_commit_reqs(cinfo, page);
}
/* Should the pNFS client commit and return the layout upon a setattr */
@@ -750,6 +809,21 @@ pnfs_get_ds_info(struct inode *inode)
return NULL;
}
+static inline void
+pnfs_init_ds_commit_info_ops(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode)
+{
+}
+
+static inline void
+pnfs_init_ds_commit_info(struct pnfs_ds_commit_info *fl_cinfo)
+{
+}
+
+static inline void
+pnfs_release_ds_info(struct pnfs_ds_commit_info *fl_cinfo, struct inode *inode)
+{
+}
+
static inline bool
pnfs_mark_request_commit(struct nfs_page *req, struct pnfs_layout_segment *lseg,
struct nfs_commit_info *cinfo, u32 ds_commit_idx)
@@ -770,6 +844,11 @@ pnfs_scan_commit_lists(struct inode *inode, struct nfs_commit_info *cinfo,
return 0;
}
+static inline void
+pnfs_recover_commit_reqs(struct list_head *head, struct nfs_commit_info *cinfo)
+{
+}
+
static inline struct nfs_page *
pnfs_search_commit_reqs(struct inode *inode, struct nfs_commit_info *cinfo,
struct page *page)
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 8b37e7f8e789..25f135572fc8 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -59,6 +59,17 @@ void pnfs_generic_commit_release(void *calldata)
}
EXPORT_SYMBOL_GPL(pnfs_generic_commit_release);
+static struct pnfs_layout_segment *
+pnfs_free_bucket_lseg(struct pnfs_commit_bucket *bucket)
+{
+ if (list_empty(&bucket->committing) && list_empty(&bucket->written)) {
+ struct pnfs_layout_segment *freeme = bucket->lseg;
+ bucket->lseg = NULL;
+ return freeme;
+ }
+ return NULL;
+}
+
/* The generic layer is about to remove the req from the commit list.
* If this will make the bucket empty, it will need to put the lseg reference.
* Note this must be called holding nfsi->commit_mutex
@@ -78,8 +89,7 @@ pnfs_generic_clear_request_commit(struct nfs_page *req,
bucket = list_first_entry(&req->wb_list,
struct pnfs_commit_bucket,
written);
- freeme = bucket->wlseg;
- bucket->wlseg = NULL;
+ freeme = pnfs_free_bucket_lseg(bucket);
}
out:
nfs_request_remove_commit_list(req, cinfo);
@@ -87,10 +97,154 @@ out:
}
EXPORT_SYMBOL_GPL(pnfs_generic_clear_request_commit);
+struct pnfs_commit_array *
+pnfs_alloc_commit_array(size_t n, gfp_t gfp_flags)
+{
+ struct pnfs_commit_array *p;
+ struct pnfs_commit_bucket *b;
+
+ p = kmalloc(struct_size(p, buckets, n), gfp_flags);
+ if (!p)
+ return NULL;
+ p->nbuckets = n;
+ INIT_LIST_HEAD(&p->cinfo_list);
+ INIT_LIST_HEAD(&p->lseg_list);
+ p->lseg = NULL;
+ for (b = &p->buckets[0]; n != 0; b++, n--) {
+ INIT_LIST_HEAD(&b->written);
+ INIT_LIST_HEAD(&b->committing);
+ b->lseg = NULL;
+ b->direct_verf.committed = NFS_INVALID_STABLE_HOW;
+ }
+ return p;
+}
+EXPORT_SYMBOL_GPL(pnfs_alloc_commit_array);
+
+void
+pnfs_free_commit_array(struct pnfs_commit_array *p)
+{
+ kfree_rcu(p, rcu);
+}
+EXPORT_SYMBOL_GPL(pnfs_free_commit_array);
+
+static struct pnfs_commit_array *
+pnfs_find_commit_array_by_lseg(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_commit_array *array;
+
+ list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) {
+ if (array->lseg == lseg)
+ return array;
+ }
+ return NULL;
+}
+
+struct pnfs_commit_array *
+pnfs_add_commit_array(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_commit_array *new,
+ struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_commit_array *array;
+
+ array = pnfs_find_commit_array_by_lseg(fl_cinfo, lseg);
+ if (array)
+ return array;
+ new->lseg = lseg;
+ refcount_set(&new->refcount, 1);
+ list_add_rcu(&new->cinfo_list, &fl_cinfo->commits);
+ list_add(&new->lseg_list, &lseg->pls_commits);
+ return new;
+}
+EXPORT_SYMBOL_GPL(pnfs_add_commit_array);
+
+static struct pnfs_commit_array *
+pnfs_lookup_commit_array(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_commit_array *array;
+
+ rcu_read_lock();
+ array = pnfs_find_commit_array_by_lseg(fl_cinfo, lseg);
+ if (!array) {
+ rcu_read_unlock();
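+ /* No commit array for this lseg yet: ask the layout driver to
+ * set one up, then repeat the lookup. */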
+ fl_cinfo->ops->setup_ds_info(fl_cinfo, lseg);
+ rcu_read_lock();
+ array = pnfs_find_commit_array_by_lseg(fl_cinfo, lseg);
+ }
+ rcu_read_unlock();
+ return array;
+}
+
+static void
+pnfs_release_commit_array_locked(struct pnfs_commit_array *array)
+{
+ list_del_rcu(&array->cinfo_list);
+ list_del(&array->lseg_list);
+ pnfs_free_commit_array(array);
+}
+
+static void
+pnfs_put_commit_array_locked(struct pnfs_commit_array *array)
+{
+ if (refcount_dec_and_test(&array->refcount))
+ pnfs_release_commit_array_locked(array);
+}
+
+static void
+pnfs_put_commit_array(struct pnfs_commit_array *array, struct inode *inode)
+{
+ if (refcount_dec_and_lock(&array->refcount, &inode->i_lock)) {
+ pnfs_release_commit_array_locked(array);
+ spin_unlock(&inode->i_lock);
+ }
+}
+
+static struct pnfs_commit_array *
+pnfs_get_commit_array(struct pnfs_commit_array *array)
+{
+ if (refcount_inc_not_zero(&array->refcount))
+ return array;
+ return NULL;
+}
+
+static void
+pnfs_remove_and_free_commit_array(struct pnfs_commit_array *array)
+{
+ array->lseg = NULL;
+ list_del_init(&array->lseg_list);
+ pnfs_put_commit_array_locked(array);
+}
+
+void
+pnfs_generic_ds_cinfo_release_lseg(struct pnfs_ds_commit_info *fl_cinfo,
+ struct pnfs_layout_segment *lseg)
+{
+ struct pnfs_commit_array *array, *tmp;
+
+ list_for_each_entry_safe(array, tmp, &lseg->pls_commits, lseg_list)
+ pnfs_remove_and_free_commit_array(array);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_ds_cinfo_release_lseg);
+
+void
+pnfs_generic_ds_cinfo_destroy(struct pnfs_ds_commit_info *fl_cinfo)
+{
+ struct pnfs_commit_array *array, *tmp;
+
+ list_for_each_entry_safe(array, tmp, &fl_cinfo->commits, cinfo_list)
+ pnfs_remove_and_free_commit_array(array);
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_ds_cinfo_destroy);
+
+/*
+ * Locks the nfs_page requests for commit and moves them to
+ * @bucket->committing.
+ */
static int
-pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
- struct nfs_commit_info *cinfo,
- int max)
+pnfs_bucket_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
+ struct nfs_commit_info *cinfo,
+ int max)
{
struct list_head *src = &bucket->written;
struct list_head *dst = &bucket->committing;
@@ -101,158 +255,253 @@ pnfs_generic_scan_ds_commit_list(struct pnfs_commit_bucket *bucket,
if (ret) {
cinfo->ds->nwritten -= ret;
cinfo->ds->ncommitting += ret;
- if (bucket->clseg == NULL)
- bucket->clseg = pnfs_get_lseg(bucket->wlseg);
- if (list_empty(src)) {
- pnfs_put_lseg(bucket->wlseg);
- bucket->wlseg = NULL;
- }
}
return ret;
}
+static int pnfs_bucket_scan_array(struct nfs_commit_info *cinfo,
+ struct pnfs_commit_bucket *buckets,
+ unsigned int nbuckets,
+ int max)
+{
+ unsigned int i;
+ int rv = 0, cnt;
+
+ for (i = 0; i < nbuckets && max != 0; i++) {
+ cnt = pnfs_bucket_scan_ds_commit_list(&buckets[i], cinfo, max);
+ rv += cnt;
+ max -= cnt;
+ }
+ return rv;
+}
+
/* Move reqs from written to committing lists, returning count
* of number moved.
*/
-int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo,
- int max)
+int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max)
{
- int i, rv = 0, cnt;
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+ struct pnfs_commit_array *array;
+ int rv = 0, cnt;
- lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex);
- for (i = 0; i < cinfo->ds->nbuckets && max != 0; i++) {
- cnt = pnfs_generic_scan_ds_commit_list(&cinfo->ds->buckets[i],
- cinfo, max);
- max -= cnt;
+ rcu_read_lock();
+ list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) {
+ if (!array->lseg || !pnfs_get_commit_array(array))
+ continue;
+ rcu_read_unlock();
+ cnt = pnfs_bucket_scan_array(cinfo, array->buckets,
+ array->nbuckets, max);
+ rcu_read_lock();
+ pnfs_put_commit_array(array, cinfo->inode);
rv += cnt;
+ max -= cnt;
+ if (!max)
+ break;
}
+ rcu_read_unlock();
return rv;
}
EXPORT_SYMBOL_GPL(pnfs_generic_scan_commit_lists);
-/* Pull everything off the committing lists and dump into @dst. */
-void pnfs_generic_recover_commit_reqs(struct list_head *dst,
- struct nfs_commit_info *cinfo)
+static unsigned int
+pnfs_bucket_recover_commit_reqs(struct list_head *dst,
+ struct pnfs_commit_bucket *buckets,
+ unsigned int nbuckets,
+ struct nfs_commit_info *cinfo)
{
struct pnfs_commit_bucket *b;
struct pnfs_layout_segment *freeme;
- int nwritten;
- int i;
+ unsigned int nwritten, ret = 0;
+ unsigned int i;
- lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex);
restart:
- for (i = 0, b = cinfo->ds->buckets; i < cinfo->ds->nbuckets; i++, b++) {
+ for (i = 0, b = buckets; i < nbuckets; i++, b++) {
nwritten = nfs_scan_commit_list(&b->written, dst, cinfo, 0);
if (!nwritten)
continue;
- cinfo->ds->nwritten -= nwritten;
- if (list_empty(&b->written)) {
- freeme = b->wlseg;
- b->wlseg = NULL;
+ ret += nwritten;
+ freeme = pnfs_free_bucket_lseg(b);
+ if (freeme) {
pnfs_put_lseg(freeme);
goto restart;
}
}
+ return ret;
+}
+
+/* Pull everything off the committing lists and dump into @dst. */
+void pnfs_generic_recover_commit_reqs(struct list_head *dst,
+ struct nfs_commit_info *cinfo)
+{
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+ struct pnfs_commit_array *array;
+ unsigned int nwritten;
+
+ lockdep_assert_held(&NFS_I(cinfo->inode)->commit_mutex);
+ rcu_read_lock();
+ list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) {
+ if (!array->lseg || !pnfs_get_commit_array(array))
+ continue;
+ rcu_read_unlock();
+ nwritten = pnfs_bucket_recover_commit_reqs(dst,
+ array->buckets,
+ array->nbuckets,
+ cinfo);
+ rcu_read_lock();
+ pnfs_put_commit_array(array, cinfo->inode);
+ fl_cinfo->nwritten -= nwritten;
+ }
+ rcu_read_unlock();
}
EXPORT_SYMBOL_GPL(pnfs_generic_recover_commit_reqs);
-static void pnfs_generic_retry_commit(struct nfs_commit_info *cinfo, int idx)
+static struct nfs_page *
+pnfs_bucket_search_commit_reqs(struct pnfs_commit_bucket *buckets,
+ unsigned int nbuckets, struct page *page)
+{
+ struct nfs_page *req;
+ struct pnfs_commit_bucket *b;
+ unsigned int i;
+
+ /* Linearly search the commit lists for each bucket until a matching
+ * request is found */
+ for (i = 0, b = buckets; i < nbuckets; i++, b++) {
+ list_for_each_entry(req, &b->written, wb_list) {
+ if (req->wb_page == page)
+ return req->wb_head;
+ }
+ list_for_each_entry(req, &b->committing, wb_list) {
+ if (req->wb_page == page)
+ return req->wb_head;
+ }
+ }
+ return NULL;
+}
+
+/* pnfs_generic_search_commit_reqs - Search lists in @cinfo for the head request
+ * for @page
+ * @cinfo - commit info for current inode
+ * @page - page to search for matching head request
+ *
+ * Returns the head request if one is found, otherwise returns NULL.
+ */
+struct nfs_page *
+pnfs_generic_search_commit_reqs(struct nfs_commit_info *cinfo, struct page *page)
{
struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
+ struct pnfs_commit_array *array;
+ struct nfs_page *req;
+
+ list_for_each_entry(array, &fl_cinfo->commits, cinfo_list) {
+ req = pnfs_bucket_search_commit_reqs(array->buckets,
+ array->nbuckets, page);
+ if (req)
+ return req;
+ }
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(pnfs_generic_search_commit_reqs);
+
+static struct pnfs_layout_segment *
+pnfs_bucket_get_committing(struct list_head *head,
+ struct pnfs_commit_bucket *bucket,
+ struct nfs_commit_info *cinfo)
+{
+ struct list_head *pos;
+
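+ /* Move the bucket's committing requests onto @head, adjusting the
+ * outstanding commit count, and hand back the bucket's lseg if the
+ * bucket is now empty. */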
+ list_for_each(pos, &bucket->committing)
+ cinfo->ds->ncommitting--;
+ list_splice_init(&bucket->committing, head);
+ return pnfs_free_bucket_lseg(bucket);
+}
+
+static struct nfs_commit_data *
+pnfs_bucket_fetch_commitdata(struct pnfs_commit_bucket *bucket,
+ struct nfs_commit_info *cinfo)
+{
+ struct nfs_commit_data *data = nfs_commitdata_alloc(false);
+
+ if (!data)
+ return NULL;
+ data->lseg = pnfs_bucket_get_committing(&data->pages, bucket, cinfo);
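+ /* The bucket is not yet empty, so it keeps its lseg reference;
+ * take an extra one for this commit. */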
+ if (!data->lseg)
+ data->lseg = pnfs_get_lseg(bucket->lseg);
+ return data;
+}
+
+static void pnfs_generic_retry_commit(struct pnfs_commit_bucket *buckets,
+ unsigned int nbuckets,
+ struct nfs_commit_info *cinfo,
+ unsigned int idx)
+{
struct pnfs_commit_bucket *bucket;
struct pnfs_layout_segment *freeme;
- struct list_head *pos;
LIST_HEAD(pages);
- int i;
- mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
- for (i = idx; i < fl_cinfo->nbuckets; i++) {
- bucket = &fl_cinfo->buckets[i];
+ for (bucket = buckets; idx < nbuckets; bucket++, idx++) {
if (list_empty(&bucket->committing))
continue;
- freeme = bucket->clseg;
- bucket->clseg = NULL;
- list_for_each(pos, &bucket->committing)
- cinfo->ds->ncommitting--;
- list_splice_init(&bucket->committing, &pages);
+ mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
+ freeme = pnfs_bucket_get_committing(&pages, bucket, cinfo);
mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
- nfs_retry_commit(&pages, freeme, cinfo, i);
+ nfs_retry_commit(&pages, freeme, cinfo, idx);
pnfs_put_lseg(freeme);
- mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
}
- mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
}
static unsigned int
-pnfs_generic_alloc_ds_commits(struct nfs_commit_info *cinfo,
- struct list_head *list)
+pnfs_bucket_alloc_ds_commits(struct list_head *list,
+ struct pnfs_commit_bucket *buckets,
+ unsigned int nbuckets,
+ struct nfs_commit_info *cinfo)
{
- struct pnfs_ds_commit_info *fl_cinfo;
struct pnfs_commit_bucket *bucket;
struct nfs_commit_data *data;
- int i;
+ unsigned int i;
unsigned int nreq = 0;
- fl_cinfo = cinfo->ds;
- bucket = fl_cinfo->buckets;
- for (i = 0; i < fl_cinfo->nbuckets; i++, bucket++) {
+ for (i = 0, bucket = buckets; i < nbuckets; i++, bucket++) {
if (list_empty(&bucket->committing))
continue;
- data = nfs_commitdata_alloc(false);
- if (!data)
- break;
- data->ds_commit_index = i;
- list_add(&data->pages, list);
- nreq++;
+ mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
+ if (!list_empty(&bucket->committing)) {
+ data = pnfs_bucket_fetch_commitdata(bucket, cinfo);
+ if (!data)
+ goto out_error;
+ data->ds_commit_index = i;
+ list_add_tail(&data->list, list);
+ atomic_inc(&cinfo->mds->rpcs_out);
+ nreq++;
+ }
+ mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
}
-
+ return nreq;
+out_error:
+ mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
/* Clean up on error */
- pnfs_generic_retry_commit(cinfo, i);
+ pnfs_generic_retry_commit(buckets, nbuckets, cinfo, i);
return nreq;
}
-static inline
-void pnfs_fetch_commit_bucket_list(struct list_head *pages,
- struct nfs_commit_data *data,
- struct nfs_commit_info *cinfo)
+static unsigned int
+pnfs_alloc_ds_commits_list(struct list_head *list,
+ struct pnfs_ds_commit_info *fl_cinfo,
+ struct nfs_commit_info *cinfo)
{
- struct pnfs_commit_bucket *bucket;
- struct list_head *pos;
-
- bucket = &cinfo->ds->buckets[data->ds_commit_index];
- mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
- list_for_each(pos, &bucket->committing)
- cinfo->ds->ncommitting--;
- list_splice_init(&bucket->committing, pages);
- data->lseg = bucket->clseg;
- bucket->clseg = NULL;
- mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
-
-}
+ struct pnfs_commit_array *array;
+ unsigned int ret = 0;
-/* Helper function for pnfs_generic_commit_pagelist to catch an empty
- * page list. This can happen when two commits race.
- *
- * This must be called instead of nfs_init_commit - call one or the other, but
- * not both!
- */
-static bool
-pnfs_generic_commit_cancel_empty_pagelist(struct list_head *pages,
- struct nfs_commit_data *data,
- struct nfs_commit_info *cinfo)
-{
- if (list_empty(pages)) {
- if (atomic_dec_and_test(&cinfo->mds->rpcs_out))
- wake_up_var(&cinfo->mds->rpcs_out);
- /* don't call nfs_commitdata_release - it tries to put
- * the open_context which is not acquired until nfs_init_commit
- * which has not been called on @data */
- WARN_ON_ONCE(data->context);
- nfs_commit_free(data);
- return true;
+ rcu_read_lock();
+ list_for_each_entry_rcu(array, &fl_cinfo->commits, cinfo_list) {
+ if (!array->lseg || !pnfs_get_commit_array(array))
+ continue;
+ rcu_read_unlock();
+ ret += pnfs_bucket_alloc_ds_commits(list, array->buckets,
+ array->nbuckets, cinfo);
+ rcu_read_lock();
+ pnfs_put_commit_array(array, cinfo->inode);
}
-
- return false;
+ return ret;
}
/* This follows nfs_commit_list pretty closely */
@@ -262,6 +511,7 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
int (*initiate_commit)(struct nfs_commit_data *data,
int how))
{
+ struct pnfs_ds_commit_info *fl_cinfo = cinfo->ds;
struct nfs_commit_data *data, *tmp;
LIST_HEAD(list);
unsigned int nreq = 0;
@@ -269,40 +519,25 @@ pnfs_generic_commit_pagelist(struct inode *inode, struct list_head *mds_pages,
if (!list_empty(mds_pages)) {
data = nfs_commitdata_alloc(true);
data->ds_commit_index = -1;
- list_add(&data->pages, &list);
+ list_splice_init(mds_pages, &data->pages);
+ list_add_tail(&data->list, &list);
+ atomic_inc(&cinfo->mds->rpcs_out);
nreq++;
}
- nreq += pnfs_generic_alloc_ds_commits(cinfo, &list);
-
+ nreq += pnfs_alloc_ds_commits_list(&list, fl_cinfo, cinfo);
if (nreq == 0)
goto out;
- atomic_add(nreq, &cinfo->mds->rpcs_out);
-
- list_for_each_entry_safe(data, tmp, &list, pages) {
- list_del_init(&data->pages);
+ list_for_each_entry_safe(data, tmp, &list, list) {
+ list_del(&data->list);
if (data->ds_commit_index < 0) {
- /* another commit raced with us */
- if (pnfs_generic_commit_cancel_empty_pagelist(mds_pages,
- data, cinfo))
- continue;
-
- nfs_init_commit(data, mds_pages, NULL, cinfo);
+ nfs_init_commit(data, NULL, NULL, cinfo);
nfs_initiate_commit(NFS_CLIENT(inode), data,
NFS_PROTO(data->inode),
data->mds_ops, how, 0);
} else {
- LIST_HEAD(pages);
-
- pnfs_fetch_commit_bucket_list(&pages, data, cinfo);
-
- /* another commit raced with us */
- if (pnfs_generic_commit_cancel_empty_pagelist(&pages,
- data, cinfo))
- continue;
-
- nfs_init_commit(data, &pages, data->lseg, cinfo);
+ nfs_init_commit(data, NULL, data->lseg, cinfo);
initiate_commit(data, how);
}
}
@@ -930,32 +1165,33 @@ pnfs_layout_mark_request_commit(struct nfs_page *req,
u32 ds_commit_idx)
{
struct list_head *list;
- struct pnfs_commit_bucket *buckets;
+ struct pnfs_commit_array *array;
+ struct pnfs_commit_bucket *bucket;
mutex_lock(&NFS_I(cinfo->inode)->commit_mutex);
- buckets = cinfo->ds->buckets;
- list = &buckets[ds_commit_idx].written;
- if (list_empty(list)) {
- if (!pnfs_is_valid_lseg(lseg)) {
- mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
- cinfo->completion_ops->resched_write(cinfo, req);
- return;
- }
- /* Non-empty buckets hold a reference on the lseg. That ref
- * is normally transferred to the COMMIT call and released
- * there. It could also be released if the last req is pulled
- * off due to a rewrite, in which case it will be done in
- * pnfs_common_clear_request_commit
- */
- WARN_ON_ONCE(buckets[ds_commit_idx].wlseg != NULL);
- buckets[ds_commit_idx].wlseg = pnfs_get_lseg(lseg);
- }
+ array = pnfs_lookup_commit_array(cinfo->ds, lseg);
+ if (!array || !pnfs_is_valid_lseg(lseg))
+ goto out_resched;
+ bucket = &array->buckets[ds_commit_idx];
+ list = &bucket->written;
+ /* Non-empty buckets hold a reference on the lseg. That ref
+ * is normally transferred to the COMMIT call and released
+ * there. It could also be released if the last req is pulled
+ * off due to a rewrite, in which case it will be done in
+ * pnfs_generic_clear_request_commit()
+ */
+ if (!bucket->lseg)
+ bucket->lseg = pnfs_get_lseg(lseg);
set_bit(PG_COMMIT_TO_DS, &req->wb_flags);
cinfo->ds->nwritten++;
nfs_request_add_commit_list_locked(req, list, cinfo);
mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
nfs_mark_page_unstable(req->wb_page, cinfo);
+ return;
+out_resched:
+ mutex_unlock(&NFS_I(cinfo->inode)->commit_mutex);
+ cinfo->completion_ops->resched_write(cinfo, req);
}
EXPORT_SYMBOL_GPL(pnfs_layout_mark_request_commit);
diff --git a/fs/nfs/read.c b/fs/nfs/read.c
index 34bb9add2302..13b22e898116 100644
--- a/fs/nfs/read.c
+++ b/fs/nfs/read.c
@@ -250,7 +250,7 @@ static int nfs_readpage_done(struct rpc_task *task,
trace_nfs_readpage_done(task, hdr);
if (task->tk_status == -ESTALE) {
- set_bit(NFS_INO_STALE, &NFS_I(inode)->flags);
+ nfs_set_inode_stale(inode);
nfs_mark_for_revalidate(inode);
}
return 0;
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index bb14bede6da5..59ef3b13ccca 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -176,6 +176,41 @@ void nfs_sb_deactive(struct super_block *sb)
}
EXPORT_SYMBOL_GPL(nfs_sb_deactive);
+static int __nfs_list_for_each_server(struct list_head *head,
+ int (*fn)(struct nfs_server *, void *),
+ void *data)
+{
+ struct nfs_server *server, *last = NULL;
+ int ret = 0;
+
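+ /* Pin each superblock so the server stays valid while fn() runs
+ * outside the RCU read lock; drop the previous pin only once the
+ * next one is held. */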
+ rcu_read_lock();
+ list_for_each_entry_rcu(server, head, client_link) {
+ if (!nfs_sb_active(server->super))
+ continue;
+ rcu_read_unlock();
+ if (last)
+ nfs_sb_deactive(last->super);
+ last = server;
+ ret = fn(server, data);
+ if (ret)
+ goto out;
+ rcu_read_lock();
+ }
+ rcu_read_unlock();
+out:
+ if (last)
+ nfs_sb_deactive(last->super);
+ return ret;
+}
+
+int nfs_client_for_each_server(struct nfs_client *clp,
+ int (*fn)(struct nfs_server *, void *),
+ void *data)
+{
+ return __nfs_list_for_each_server(&clp->cl_superblocks, fn, data);
+}
+EXPORT_SYMBOL_GPL(nfs_client_for_each_server);
+
/*
* Deliver file system statistics to userspace
*/
diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c
index 0effeee28352..b27ebdccef70 100644
--- a/fs/nfs/unlink.c
+++ b/fs/nfs/unlink.c
@@ -98,7 +98,7 @@ static void nfs_do_call_unlink(struct inode *inode, struct nfs_unlinkdata *data)
.callback_ops = &nfs_unlink_ops,
.callback_data = data,
.workqueue = nfsiod_workqueue,
- .flags = RPC_TASK_ASYNC,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
};
struct rpc_task *task;
struct inode *dir = d_inode(data->dentry->d_parent);
@@ -341,7 +341,7 @@ nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
.callback_ops = &nfs_rename_ops,
.workqueue = nfsiod_workqueue,
.rpc_client = NFS_CLIENT(old_dir),
- .flags = RPC_TASK_ASYNC,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF,
};
data = kzalloc(sizeof(*data), GFP_KERNEL);
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index c478b772cc49..df4b87c30ac9 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -149,6 +149,31 @@ static void nfs_io_completion_put(struct nfs_io_completion *ioc)
kref_put(&ioc->refcount, nfs_io_completion_release);
}
+static void
+nfs_page_set_inode_ref(struct nfs_page *req, struct inode *inode)
+{
+ if (!test_and_set_bit(PG_INODE_REF, &req->wb_flags)) {
+ kref_get(&req->wb_kref);
+ atomic_long_inc(&NFS_I(inode)->nrequests);
+ }
+}
+
+static int
+nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode)
+{
+ int ret;
+
+ if (!test_bit(PG_REMOVE, &req->wb_flags))
+ return 0;
+ ret = nfs_page_group_lock(req);
+ if (ret)
+ return ret;
+ if (test_and_clear_bit(PG_REMOVE, &req->wb_flags))
+ nfs_page_set_inode_ref(req, inode);
+ nfs_page_group_unlock(req);
+ return 0;
+}
+
static struct nfs_page *
nfs_page_private_request(struct page *page)
{
@@ -218,6 +243,36 @@ static struct nfs_page *nfs_page_find_head_request(struct page *page)
return req;
}
+static struct nfs_page *nfs_find_and_lock_page_request(struct page *page)
+{
+ struct inode *inode = page_file_mapping(page)->host;
+ struct nfs_page *req, *head;
+ int ret;
+
+ for (;;) {
+ req = nfs_page_find_head_request(page);
+ if (!req)
+ return req;
+ head = nfs_page_group_lock_head(req);
+ if (head != req)
+ nfs_release_request(req);
+ if (IS_ERR(head))
+ return head;
+ ret = nfs_cancel_remove_inode(head, inode);
+ if (ret < 0) {
+ nfs_unlock_and_release_request(head);
+ return ERR_PTR(ret);
+ }
+ /* Ensure that nobody removed the request before we locked it */
+ if (head == nfs_page_private_request(page))
+ break;
+ if (PageSwapCache(page))
+ break;
+ nfs_unlock_and_release_request(head);
+ }
+ return head;
+}
+
/* Adjust the file length if we're writing beyond the end */
static void nfs_grow_file(struct page *page, unsigned int offset, unsigned int count)
{
@@ -380,34 +435,6 @@ static void nfs_end_page_writeback(struct nfs_page *req)
}
/*
- * nfs_unroll_locks_and_wait - unlock all newly locked reqs and wait on @req
- *
- * this is a helper function for nfs_lock_and_join_requests
- *
- * @inode - inode associated with request page group, must be holding inode lock
- * @head - head request of page group, must be holding head lock
- * @req - request that couldn't lock and needs to wait on the req bit lock
- *
- * NOTE: this must be called holding page_group bit lock
- * which will be released before returning.
- *
- * returns 0 on success, < 0 on error.
- */
-static void
-nfs_unroll_locks(struct inode *inode, struct nfs_page *head,
- struct nfs_page *req)
-{
- struct nfs_page *tmp;
-
- /* relinquish all the locks successfully grabbed this run */
- for (tmp = head->wb_this_page ; tmp != req; tmp = tmp->wb_this_page) {
- if (!kref_read(&tmp->wb_kref))
- continue;
- nfs_unlock_and_release_request(tmp);
- }
-}
-
-/*
* nfs_destroy_unlinked_subrequests - destroy recently unlinked subrequests
*
* @destroy_list - request list (using wb_this_page) terminated by @old_head
@@ -428,22 +455,29 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
destroy_list = (subreq->wb_this_page == old_head) ?
NULL : subreq->wb_this_page;
+ /* Note: lock subreq in order to change subreq->wb_head */
+ nfs_page_set_headlock(subreq);
WARN_ON_ONCE(old_head != subreq->wb_head);
/* make sure old group is not used */
subreq->wb_this_page = subreq;
+ subreq->wb_head = subreq;
clear_bit(PG_REMOVE, &subreq->wb_flags);
/* Note: races with nfs_page_group_destroy() */
if (!kref_read(&subreq->wb_kref)) {
/* Check if we raced with nfs_page_group_destroy() */
- if (test_and_clear_bit(PG_TEARDOWN, &subreq->wb_flags))
+ if (test_and_clear_bit(PG_TEARDOWN, &subreq->wb_flags)) {
+ nfs_page_clear_headlock(subreq);
nfs_free_request(subreq);
+ } else
+ nfs_page_clear_headlock(subreq);
continue;
}
+ nfs_page_clear_headlock(subreq);
- subreq->wb_head = subreq;
+ nfs_release_request(old_head);
if (test_and_clear_bit(PG_INODE_REF, &subreq->wb_flags)) {
nfs_release_request(subreq);
@@ -457,105 +491,43 @@ nfs_destroy_unlinked_subrequests(struct nfs_page *destroy_list,
}
/*
- * nfs_lock_and_join_requests - join all subreqs to the head req and return
- * a locked reference, cancelling any pending
- * operations for this page.
- *
- * @page - the page used to lookup the "page group" of nfs_page structures
+ * nfs_join_page_group - destroy subrequests of the head req
+ * @head: the head request of the "page group" of nfs_page structures
+ * @inode: Inode to which the request belongs.
*
* This function joins all sub requests to the head request by first
* locking all requests in the group, cancelling any pending operations
* and finally updating the head request to cover the whole range covered by
* the (former) group. All subrequests are removed from any write or commit
* lists, unlinked from the group and destroyed.
- *
- * Returns a locked, referenced pointer to the head request - which after
- * this call is guaranteed to be the only request associated with the page.
- * Returns NULL if no requests are found for @page, or a ERR_PTR if an
- * error was encountered.
*/
-static struct nfs_page *
-nfs_lock_and_join_requests(struct page *page)
+void
+nfs_join_page_group(struct nfs_page *head, struct inode *inode)
{
- struct inode *inode = page_file_mapping(page)->host;
- struct nfs_page *head, *subreq;
+ struct nfs_page *subreq;
struct nfs_page *destroy_list = NULL;
- unsigned int total_bytes;
- int ret;
+ unsigned int pgbase, off, bytes;
-try_again:
- /*
- * A reference is taken only on the head request which acts as a
- * reference to the whole page group - the group will not be destroyed
- * until the head reference is released.
- */
- head = nfs_page_find_head_request(page);
- if (!head)
- return NULL;
-
- /* lock the page head first in order to avoid an ABBA inefficiency */
- if (!nfs_lock_request(head)) {
- ret = nfs_wait_on_request(head);
- nfs_release_request(head);
- if (ret < 0)
- return ERR_PTR(ret);
- goto try_again;
- }
-
- /* Ensure that nobody removed the request before we locked it */
- if (head != nfs_page_private_request(page) && !PageSwapCache(page)) {
- nfs_unlock_and_release_request(head);
- goto try_again;
- }
-
- ret = nfs_page_group_lock(head);
- if (ret < 0)
- goto release_request;
-
- /* lock each request in the page group */
- total_bytes = head->wb_bytes;
+ pgbase = head->wb_pgbase;
+ bytes = head->wb_bytes;
+ off = head->wb_offset;
for (subreq = head->wb_this_page; subreq != head;
subreq = subreq->wb_this_page) {
-
- if (!kref_get_unless_zero(&subreq->wb_kref)) {
- if (subreq->wb_offset == head->wb_offset + total_bytes)
- total_bytes += subreq->wb_bytes;
- continue;
- }
-
- while (!nfs_lock_request(subreq)) {
- /*
- * Unlock page to allow nfs_page_group_sync_on_bit()
- * to succeed
- */
- nfs_page_group_unlock(head);
- ret = nfs_wait_on_request(subreq);
- if (!ret)
- ret = nfs_page_group_lock(head);
- if (ret < 0) {
- nfs_unroll_locks(inode, head, subreq);
- nfs_release_request(subreq);
- goto release_request;
- }
- }
- /*
- * Subrequests are always contiguous, non overlapping
- * and in order - but may be repeated (mirrored writes).
- */
- if (subreq->wb_offset == (head->wb_offset + total_bytes)) {
- /* keep track of how many bytes this group covers */
- total_bytes += subreq->wb_bytes;
- } else if (WARN_ON_ONCE(subreq->wb_offset < head->wb_offset ||
- ((subreq->wb_offset + subreq->wb_bytes) >
- (head->wb_offset + total_bytes)))) {
- nfs_page_group_unlock(head);
- nfs_unroll_locks(inode, head, subreq);
- nfs_unlock_and_release_request(subreq);
- ret = -EIO;
- goto release_request;
+ /* Subrequests should always form a contiguous range */
+ if (pgbase > subreq->wb_pgbase) {
+ off -= pgbase - subreq->wb_pgbase;
+ bytes += pgbase - subreq->wb_pgbase;
+ pgbase = subreq->wb_pgbase;
}
+ bytes = max(subreq->wb_pgbase + subreq->wb_bytes
+ - pgbase, bytes);
}
+ /* Set the head request's range to cover the former page group */
+ head->wb_pgbase = pgbase;
+ head->wb_bytes = bytes;
+ head->wb_offset = off;
+
/* Now that all requests are locked, make sure they aren't on any list.
* Commit list removal accounting is done after locks are dropped */
subreq = head;
@@ -569,36 +541,52 @@ try_again:
/* destroy list will be terminated by head */
destroy_list = head->wb_this_page;
head->wb_this_page = head;
-
- /* change head request to cover whole range that
- * the former page group covered */
- head->wb_bytes = total_bytes;
}
- /* Postpone destruction of this request */
- if (test_and_clear_bit(PG_REMOVE, &head->wb_flags)) {
- set_bit(PG_INODE_REF, &head->wb_flags);
- kref_get(&head->wb_kref);
- atomic_long_inc(&NFS_I(inode)->nrequests);
- }
+ nfs_destroy_unlinked_subrequests(destroy_list, head, inode);
+}
- nfs_page_group_unlock(head);
+/*
+ * nfs_lock_and_join_requests - join all subreqs to the head req
+ * @page: the page used to lookup the "page group" of nfs_page structures
+ *
+ * This function joins all sub requests to the head request by first
+ * locking all requests in the group, cancelling any pending operations
+ * and finally updating the head request to cover the whole range covered by
+ * the (former) group. All subrequests are removed from any write or commit
+ * lists, unlinked from the group and destroyed.
+ *
+ * Returns a locked, referenced pointer to the head request - which after
+ * this call is guaranteed to be the only request associated with the page.
+ * Returns NULL if no requests are found for @page, or a ERR_PTR if an
+ * error was encountered.
+ */
+static struct nfs_page *
+nfs_lock_and_join_requests(struct page *page)
+{
+ struct inode *inode = page_file_mapping(page)->host;
+ struct nfs_page *head;
+ int ret;
- nfs_destroy_unlinked_subrequests(destroy_list, head, inode);
+ /*
+ * A reference is taken only on the head request which acts as a
+ * reference to the whole page group - the group will not be destroyed
+ * until the head reference is released.
+ */
+ head = nfs_find_and_lock_page_request(page);
+ if (IS_ERR_OR_NULL(head))
+ return head;
- /* Did we lose a race with nfs_inode_remove_request()? */
- if (!(PagePrivate(page) || PageSwapCache(page))) {
+ /* lock each request in the page group */
+ ret = nfs_page_group_lock_subrequests(head);
+ if (ret < 0) {
nfs_unlock_and_release_request(head);
- return NULL;
+ return ERR_PTR(ret);
}
- /* still holds ref on head from nfs_page_find_head_request
- * and still has lock on head from lock loop */
- return head;
+ nfs_join_page_group(head, inode);
-release_request:
- nfs_unlock_and_release_request(head);
- return ERR_PTR(ret);
+ return head;
}
static void nfs_write_error(struct nfs_page *req, int error)
@@ -1707,7 +1695,7 @@ int nfs_initiate_commit(struct rpc_clnt *clnt, struct nfs_commit_data *data,
.callback_ops = call_ops,
.callback_data = data,
.workqueue = nfsiod_workqueue,
- .flags = RPC_TASK_ASYNC | flags,
+ .flags = RPC_TASK_ASYNC | RPC_TASK_CRED_NOREF | flags,
.priority = priority,
};
/* Set up the initial task struct. */
@@ -1746,14 +1734,19 @@ void nfs_init_commit(struct nfs_commit_data *data,
struct pnfs_layout_segment *lseg,
struct nfs_commit_info *cinfo)
{
- struct nfs_page *first = nfs_list_entry(head->next);
- struct nfs_open_context *ctx = nfs_req_openctx(first);
- struct inode *inode = d_inode(ctx->dentry);
+ struct nfs_page *first;
+ struct nfs_open_context *ctx;
+ struct inode *inode;
/* Set up the RPC argument and reply structs
* NB: take care not to mess about with data->commit et al. */
- list_splice_init(head, &data->pages);
+ if (head)
+ list_splice_init(head, &data->pages);
+
+ first = nfs_list_entry(data->pages.next);
+ ctx = nfs_req_openctx(first);
+ inode = d_inode(ctx->dentry);
data->inode = inode;
data->cred = ctx->cred;
@@ -1869,8 +1862,7 @@ static void nfs_commit_release_pages(struct nfs_commit_data *data)
/* Okay, COMMIT succeeded, apparently. Check the verifier
* returned by the server against all stored verfs. */
- if (verf->committed > NFS_UNSTABLE &&
- !nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier)) {
+ if (nfs_write_match_verf(verf, req)) {
/* We have a match */
if (req->wb_page)
nfs_inode_remove_request(req);
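
A stand-alone sketch of the range merge that nfs_join_page_group() performs above, with invented names and no kernel dependencies (illustration only):

/* Grow the head range downwards when a subrequest starts earlier in the
 * page, then extend it to the furthest end seen; this mirrors the
 * pgbase/off/bytes arithmetic in nfs_join_page_group().
 */
struct io_range {
        unsigned int pgbase;    /* start offset within the page */
        unsigned int bytes;     /* length in bytes */
        unsigned int offset;    /* offset within the file */
};

static void join_range(struct io_range *head, const struct io_range *sub)
{
        if (head->pgbase > sub->pgbase) {
                head->offset -= head->pgbase - sub->pgbase;
                head->bytes += head->pgbase - sub->pgbase;
                head->pgbase = sub->pgbase;
        }
        if (sub->pgbase + sub->bytes - head->pgbase > head->bytes)
                head->bytes = sub->pgbase + sub->bytes - head->pgbase;
}

/* Example: head = {512, 512, 512} joined with sub = {0, 512, 0} becomes
 * {0, 1024, 0}, i.e. the head now covers the whole former group.
 */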
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index 8ceb51478800..7e4bfaf2871f 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -225,7 +225,7 @@ int ubifs_is_mapped(const struct ubifs_info *c, int lnum)
int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
int offs, int quiet, int must_chk_crc)
{
- int err = -EINVAL, type, node_len;
+ int err = -EINVAL, type, node_len, dump_node = 1;
uint32_t crc, node_crc, magic;
const struct ubifs_ch *ch = buf;
@@ -278,10 +278,22 @@ int ubifs_check_node(const struct ubifs_info *c, const void *buf, int lnum,
out_len:
if (!quiet)
ubifs_err(c, "bad node length %d", node_len);
+ if (type == UBIFS_DATA_NODE && node_len > UBIFS_DATA_NODE_SZ)
+ dump_node = 0;
out:
if (!quiet) {
ubifs_err(c, "bad node at LEB %d:%d", lnum, offs);
- ubifs_dump_node(c, buf);
+ if (dump_node) {
+ ubifs_dump_node(c, buf);
+ } else {
+ int safe_len = min3(node_len, c->leb_size - offs,
+ (int)UBIFS_MAX_DATA_NODE_SZ);
+ pr_err("\tprevent out-of-bounds memory access\n");
+ pr_err("\ttruncated data node length %d\n", safe_len);
+ pr_err("\tcorrupted data node:\n");
+ print_hex_dump(KERN_ERR, "\t", DUMP_PREFIX_OFFSET, 32, 1,
+ buf, safe_len, 0);
+ }
dump_stack();
}
return err;
diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c
index 3bf8b1fda9d7..e5ec1afe1c66 100644
--- a/fs/ubifs/journal.c
+++ b/fs/ubifs/journal.c
@@ -905,6 +905,7 @@ int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode)
ubifs_err(c, "dead directory entry '%s', error %d",
xent->name, err);
ubifs_ro_mode(c, err);
+ kfree(xent);
goto out_release;
}
ubifs_assert(c, ubifs_inode(xino)->xattr);
diff --git a/fs/ubifs/orphan.c b/fs/ubifs/orphan.c
index edf43ddd7dce..283f9eb48410 100644
--- a/fs/ubifs/orphan.c
+++ b/fs/ubifs/orphan.c
@@ -157,7 +157,7 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum)
int err = 0;
ino_t xattr_inum;
union ubifs_key key;
- struct ubifs_dent_node *xent;
+ struct ubifs_dent_node *xent, *pxent = NULL;
struct fscrypt_name nm = {0};
struct ubifs_orphan *xattr_orphan;
struct ubifs_orphan *orphan;
@@ -181,11 +181,16 @@ int ubifs_add_orphan(struct ubifs_info *c, ino_t inum)
xattr_inum = le64_to_cpu(xent->inum);
xattr_orphan = orphan_add(c, xattr_inum, orphan);
- if (IS_ERR(xattr_orphan))
+ if (IS_ERR(xattr_orphan)) {
+ kfree(xent);
return PTR_ERR(xattr_orphan);
+ }
+ kfree(pxent);
+ pxent = xent;
key_read(c, &xent->key, &key);
}
+ kfree(pxent);
return 0;
}
@@ -688,14 +693,14 @@ static int do_kill_orphans(struct ubifs_info *c, struct ubifs_scan_leb *sleb,
ino_key_init(c, &key1, inum);
err = ubifs_tnc_lookup(c, &key1, ino);
- if (err)
+ if (err && err != -ENOENT)
goto out_free;
/*
* Check whether an inode can really get deleted.
* linkat() with O_TMPFILE allows rebirth of an inode.
*/
- if (ino->nlink == 0) {
+ if (err == 0 && ino->nlink == 0) {
dbg_rcvry("deleting orphaned inode %lu",
(unsigned long)inum);
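
The kfree(pxent); pxent = xent; sequence added to ubifs_add_orphan() above is the usual way to walk a chain of freshly allocated entries without leaking the one returned by the previous iteration. A generic sketch of the pattern (made-up types, not ubifs code):

/* 'prev' always holds the entry from the previous iteration, so each entry
 * is freed exactly once, including the final one and the one in hand when
 * an error forces an early exit.
 */
#include <linux/errno.h>
#include <linux/slab.h>

struct walk_entry {
        int value;
};

static int walk_and_free(struct walk_entry *(*next)(struct walk_entry *prev,
                                                    void *arg), void *arg)
{
        struct walk_entry *ent, *prev = NULL;
        int err = 0;

        while ((ent = next(prev, arg)) != NULL) {
                if (ent->value < 0) {
                        kfree(ent);     /* error path frees the current entry */
                        err = -EINVAL;
                        break;
                }
                kfree(prev);            /* release the previous entry */
                prev = ent;
        }
        kfree(prev);                    /* release the last entry (or NULL) */
        return err;
}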
diff --git a/include/linux/f2fs_fs.h b/include/linux/f2fs_fs.h
index ac3f4888b3df..3c383ddd92dd 100644
--- a/include/linux/f2fs_fs.h
+++ b/include/linux/f2fs_fs.h
@@ -125,6 +125,7 @@ struct f2fs_super_block {
/*
* For checkpoint
*/
+#define CP_RESIZEFS_FLAG 0x00004000
#define CP_DISABLED_QUICK_FLAG 0x00002000
#define CP_DISABLED_FLAG 0x00001000
#define CP_QUOTA_NEED_FSCK_FLAG 0x00000800
diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h
index 5d5b91e54f73..73eda45f1cfd 100644
--- a/include/linux/nfs_fs.h
+++ b/include/linux/nfs_fs.h
@@ -354,6 +354,7 @@ static inline unsigned long nfs_save_change_attribute(struct inode *dir)
extern int nfs_sync_mapping(struct address_space *mapping);
extern void nfs_zap_mapping(struct inode *inode, struct address_space *mapping);
extern void nfs_zap_caches(struct inode *);
+extern void nfs_set_inode_stale(struct inode *inode);
extern void nfs_invalidate_atime(struct inode *);
extern struct inode *nfs_fhget(struct super_block *, struct nfs_fh *,
struct nfs_fattr *, struct nfs4_label *);
diff --git a/include/linux/nfs_page.h b/include/linux/nfs_page.h
index 0bbd587fac6a..c32c15216da3 100644
--- a/include/linux/nfs_page.h
+++ b/include/linux/nfs_page.h
@@ -139,9 +139,14 @@ extern size_t nfs_generic_pg_test(struct nfs_pageio_descriptor *desc,
extern int nfs_wait_on_request(struct nfs_page *);
extern void nfs_unlock_request(struct nfs_page *req);
extern void nfs_unlock_and_release_request(struct nfs_page *);
+extern struct nfs_page *nfs_page_group_lock_head(struct nfs_page *req);
+extern int nfs_page_group_lock_subrequests(struct nfs_page *head);
+extern void nfs_join_page_group(struct nfs_page *head, struct inode *inode);
extern int nfs_page_group_lock(struct nfs_page *);
extern void nfs_page_group_unlock(struct nfs_page *);
extern bool nfs_page_group_sync_on_bit(struct nfs_page *, unsigned int);
+extern int nfs_page_set_headlock(struct nfs_page *req);
+extern void nfs_page_clear_headlock(struct nfs_page *req);
extern bool nfs_async_iocounter_wait(struct rpc_task *, struct nfs_lock_context *);
/*
diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h
index 6838c149f335..440230488025 100644
--- a/include/linux/nfs_xdr.h
+++ b/include/linux/nfs_xdr.h
@@ -1266,16 +1266,25 @@ struct nfstime4 {
struct pnfs_commit_bucket {
struct list_head written;
struct list_head committing;
- struct pnfs_layout_segment *wlseg;
- struct pnfs_layout_segment *clseg;
+ struct pnfs_layout_segment *lseg;
struct nfs_writeverf direct_verf;
};
+struct pnfs_commit_array {
+ struct list_head cinfo_list;
+ struct list_head lseg_list;
+ struct pnfs_layout_segment *lseg;
+ struct rcu_head rcu;
+ refcount_t refcount;
+ unsigned int nbuckets;
+ struct pnfs_commit_bucket buckets[];
+};
+
struct pnfs_ds_commit_info {
- int nwritten;
- int ncommitting;
- int nbuckets;
- struct pnfs_commit_bucket *buckets;
+ struct list_head commits;
+ unsigned int nwritten;
+ unsigned int ncommitting;
+ const struct pnfs_commit_ops *ops;
};
struct nfs41_state_protection {
@@ -1386,22 +1395,11 @@ struct nfs41_free_stateid_res {
unsigned int status;
};
-static inline void
-nfs_free_pnfs_ds_cinfo(struct pnfs_ds_commit_info *cinfo)
-{
- kfree(cinfo->buckets);
-}
-
#else
struct pnfs_ds_commit_info {
};
-static inline void
-nfs_free_pnfs_ds_cinfo(struct pnfs_ds_commit_info *cinfo)
-{
-}
-
#endif /* CONFIG_NFS_V4_1 */
#ifdef CONFIG_NFS_V4_2
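
The reworked pnfs_commit_array above ends in a flexible buckets[] array. How such an object would typically be allocated is sketched below; the helper name, GFP handling and initialisation are assumptions, not code from this series:

/* Allocate an array with 'n' trailing buckets in a single allocation;
 * struct_size() folds the flexible-array length into the size calculation
 * with overflow checking.
 */
#include <linux/list.h>
#include <linux/overflow.h>
#include <linux/refcount.h>
#include <linux/slab.h>

static struct pnfs_commit_array *alloc_commit_array(unsigned int n, gfp_t gfp)
{
        struct pnfs_commit_array *array;
        unsigned int i;

        array = kzalloc(struct_size(array, buckets, n), gfp);
        if (!array)
                return NULL;
        refcount_set(&array->refcount, 1);
        array->nbuckets = n;
        for (i = 0; i < n; i++) {
                INIT_LIST_HEAD(&array->buckets[i].written);
                INIT_LIST_HEAD(&array->buckets[i].committing);
        }
        return array;
}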
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 28b1a2b4459e..3a2ac7072dbb 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -47,8 +47,8 @@
* A. IP checksum related features
*
* Drivers advertise checksum offload capabilities in the features of a device.
- * From the stack's point of view these are capabilities offered by the driver,
- * a driver typically only advertises features that it is capable of offloading
+ * From the stack's point of view these are capabilities offered by the driver.
+ * A driver typically only advertises features that it is capable of offloading
* to its device.
*
* The checksum related features are:
@@ -63,7 +63,7 @@
* TCP or UDP packets over IPv4. These are specifically
* unencapsulated packets of the form IPv4|TCP or
* IPv4|UDP where the Protocol field in the IPv4 header
- * is TCP or UDP. The IPv4 header may contain IP options
+ * is TCP or UDP. The IPv4 header may contain IP options.
* This feature cannot be set in features for a device
* with NETIF_F_HW_CSUM also set. This feature is being
* DEPRECATED (see below).
@@ -79,13 +79,13 @@
* DEPRECATED (see below).
*
* NETIF_F_RXCSUM - Driver (device) performs receive checksum offload.
- * This flag is used only used to disable the RX checksum
+ * This flag is only used to disable the RX checksum
* feature for a device. The stack will accept receive
* checksum indication in packets received on a device
* regardless of whether NETIF_F_RXCSUM is set.
*
* B. Checksumming of received packets by device. Indication of checksum
- * verification is in set skb->ip_summed. Possible values are:
+ * verification is set in skb->ip_summed. Possible values are:
*
* CHECKSUM_NONE:
*
@@ -115,16 +115,16 @@
* the packet minus one that have been verified as CHECKSUM_UNNECESSARY.
* For instance if a device receives an IPv6->UDP->GRE->IPv4->TCP packet
* and a device is able to verify the checksums for UDP (possibly zero),
- * GRE (checksum flag is set), and TCP-- skb->csum_level would be set to
+ * GRE (checksum flag is set) and TCP, skb->csum_level would be set to
* two. If the device were only able to verify the UDP checksum and not
- * GRE, either because it doesn't support GRE checksum of because GRE
+ * GRE, either because it doesn't support GRE checksum or because GRE
* checksum is bad, skb->csum_level would be set to zero (TCP checksum is
* not considered in this case).
*
* CHECKSUM_COMPLETE:
*
* This is the most generic way. The device supplied checksum of the _whole_
- * packet as seen by netif_rx() and fills out in skb->csum. Meaning, the
+ * packet as seen by netif_rx() and fills in skb->csum. This means the
* hardware doesn't need to parse L3/L4 headers to implement this.
*
* Notes:
@@ -153,8 +153,8 @@
* from skb->csum_start up to the end, and to record/write the checksum at
* offset skb->csum_start + skb->csum_offset. A driver may verify that the
* csum_start and csum_offset values are valid values given the length and
- * offset of the packet, however they should not attempt to validate that the
- * checksum refers to a legitimate transport layer checksum-- it is the
+ * offset of the packet, but it should not attempt to validate that the
+ * checksum refers to a legitimate transport layer checksum -- it is the
* purview of the stack to validate that csum_start and csum_offset are set
* correctly.
*
@@ -178,18 +178,18 @@
*
* CHECKSUM_UNNECESSARY:
*
- * This has the same meaning on as CHECKSUM_NONE for checksum offload on
+ * This has the same meaning as CHECKSUM_NONE for checksum offload on
* output.
*
* CHECKSUM_COMPLETE:
* Not used in checksum output. If a driver observes a packet with this value
- * set in skbuff, if should treat as CHECKSUM_NONE being set.
+ * set in skbuff, it should treat the packet as if CHECKSUM_NONE were set.
*
* D. Non-IP checksum (CRC) offloads
*
* NETIF_F_SCTP_CRC - This feature indicates that a device is capable of
* offloading the SCTP CRC in a packet. To perform this offload the stack
- * will set set csum_start and csum_offset accordingly, set ip_summed to
+ * will set csum_start and csum_offset accordingly, set ip_summed to
* CHECKSUM_PARTIAL and set csum_not_inet to 1, to provide an indication in
* the skbuff that the CHECKSUM_PARTIAL refers to CRC32c.
* A driver that supports both IP checksum offload and SCTP CRC32c offload
@@ -200,10 +200,10 @@
* NETIF_F_FCOE_CRC - This feature indicates that a device is capable of
* offloading the FCOE CRC in a packet. To perform this offload the stack
* will set ip_summed to CHECKSUM_PARTIAL and set csum_start and csum_offset
- * accordingly. Note the there is no indication in the skbuff that the
- * CHECKSUM_PARTIAL refers to an FCOE checksum, a driver that supports
+ * accordingly. Note that there is no indication in the skbuff that the
+ * CHECKSUM_PARTIAL refers to an FCOE checksum, so a driver that supports
* both IP checksum offload and FCOE CRC offload must verify which offload
- * is configured for a packet presumably by inspecting packet headers.
+ * is configured for a packet, presumably by inspecting packet headers.
*
* E. Checksumming on output with GSO.
*
@@ -211,9 +211,9 @@
* is implied by the SKB_GSO_* flags in gso_type. Most obviously, if the
* gso_type is SKB_GSO_TCPV4 or SKB_GSO_TCPV6, TCP checksum offload as
* part of the GSO operation is implied. If a checksum is being offloaded
- * with GSO then ip_summed is CHECKSUM_PARTIAL, csum_start and csum_offset
- * are set to refer to the outermost checksum being offload (two offloaded
- * checksums are possible with UDP encapsulation).
+ * with GSO then ip_summed is CHECKSUM_PARTIAL, and both csum_start and
+ * csum_offset are set to refer to the outermost checksum being offloaded
+ * (two offloaded checksums are possible with UDP encapsulation).
*/
/* Don't change this without changing skb_csum_unnecessary! */
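
As a concrete driver-side illustration of the CHECKSUM_PARTIAL contract described above (a sketch, not taken from this patch): the stack has already set csum_start and csum_offset, and a driver that cannot offload a particular packet can resolve the checksum in software before handing the frame to its hardware.

/* Resolve CHECKSUM_PARTIAL in software when offload is not possible for
 * this packet; skb_checksum_help() computes the checksum from csum_start
 * and stores it at csum_start + csum_offset.
 */
#include <linux/errno.h>
#include <linux/netdevice.h>

static int tx_prepare_csum(struct sk_buff *skb)
{
        if (skb->ip_summed != CHECKSUM_PARTIAL)
                return 0;               /* nothing for the driver to do */
        if (skb_checksum_help(skb))
                return -EINVAL;
        return 0;
}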
diff --git a/include/linux/sunrpc/sched.h b/include/linux/sunrpc/sched.h
index a6ef35184ef1..df696efdd675 100644
--- a/include/linux/sunrpc/sched.h
+++ b/include/linux/sunrpc/sched.h
@@ -132,6 +132,7 @@ struct rpc_task_setup {
#define RPC_TASK_TIMEOUT 0x1000 /* fail with ETIMEDOUT on timeout */
#define RPC_TASK_NOCONNECT 0x2000 /* return ENOTCONN if not connected */
#define RPC_TASK_NO_RETRANS_TIMEOUT 0x4000 /* wait forever for a reply */
+#define RPC_TASK_CRED_NOREF 0x8000 /* No refcount on the credential */
#define RPC_IS_ASYNC(t) ((t)->tk_flags & RPC_TASK_ASYNC)
#define RPC_IS_SWAPPER(t) ((t)->tk_flags & RPC_TASK_SWAPPER)
diff --git a/include/linux/sunrpc/xdr.h b/include/linux/sunrpc/xdr.h
index 8529d6e33137..01bb41908c93 100644
--- a/include/linux/sunrpc/xdr.h
+++ b/include/linux/sunrpc/xdr.h
@@ -184,7 +184,6 @@ xdr_adjust_iovec(struct kvec *iov, __be32 *p)
extern void xdr_shift_buf(struct xdr_buf *, size_t);
extern void xdr_buf_from_iov(struct kvec *, struct xdr_buf *);
extern int xdr_buf_subsegment(struct xdr_buf *, struct xdr_buf *, unsigned int, unsigned int);
-extern int xdr_buf_read_mic(struct xdr_buf *, struct xdr_netobj *, unsigned int);
extern int read_bytes_from_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int);
extern int write_bytes_to_xdr_buf(struct xdr_buf *, unsigned int, void *, unsigned int);
diff --git a/include/trace/events/f2fs.h b/include/trace/events/f2fs.h
index 67a97838c2a0..d97adfc327f0 100644
--- a/include/trace/events/f2fs.h
+++ b/include/trace/events/f2fs.h
@@ -153,7 +153,8 @@ TRACE_DEFINE_ENUM(CP_PAUSE);
#define show_compress_algorithm(type) \
__print_symbolic(type, \
{ COMPRESS_LZO, "LZO" }, \
- { COMPRESS_LZ4, "LZ4" })
+ { COMPRESS_LZ4, "LZ4" }, \
+ { COMPRESS_ZSTD, "ZSTD" })
struct f2fs_sb_info;
struct f2fs_io_info;
diff --git a/include/trace/events/rpcrdma.h b/include/trace/events/rpcrdma.h
index 9238d233f8cf..051f26fedc4d 100644
--- a/include/trace/events/rpcrdma.h
+++ b/include/trace/events/rpcrdma.h
@@ -104,12 +104,12 @@ DECLARE_EVENT_CLASS(xprtrdma_connect_class,
TP_fast_assign(
__entry->r_xprt = r_xprt;
__entry->rc = rc;
- __entry->connect_status = r_xprt->rx_ep.rep_connected;
+ __entry->connect_status = r_xprt->rx_ep->re_connect_status;
__assign_str(addr, rpcrdma_addrstr(r_xprt));
__assign_str(port, rpcrdma_portstr(r_xprt));
),
- TP_printk("peer=[%s]:%s r_xprt=%p: rc=%d connect status=%d",
+ TP_printk("peer=[%s]:%s r_xprt=%p: rc=%d connection status=%d",
__get_str(addr), __get_str(port), __entry->r_xprt,
__entry->rc, __entry->connect_status
)
@@ -228,20 +228,20 @@ DECLARE_EVENT_CLASS(xprtrdma_frwr_done,
TP_ARGS(wc, frwr),
TP_STRUCT__entry(
- __field(const void *, mr)
+ __field(u32, mr_id)
__field(unsigned int, status)
__field(unsigned int, vendor_err)
),
TP_fast_assign(
- __entry->mr = container_of(frwr, struct rpcrdma_mr, frwr);
+ __entry->mr_id = frwr->fr_mr->res.id;
__entry->status = wc->status;
__entry->vendor_err = __entry->status ? wc->vendor_err : 0;
),
TP_printk(
- "mr=%p: %s (%u/0x%x)",
- __entry->mr, rdma_show_wc_status(__entry->status),
+ "mr.id=%u: %s (%u/0x%x)",
+ __entry->mr_id, rdma_show_wc_status(__entry->status),
__entry->status, __entry->vendor_err
)
);
@@ -274,7 +274,8 @@ DECLARE_EVENT_CLASS(xprtrdma_mr,
TP_ARGS(mr),
TP_STRUCT__entry(
- __field(const void *, mr)
+ __field(u32, mr_id)
+ __field(int, nents)
__field(u32, handle)
__field(u32, length)
__field(u64, offset)
@@ -282,15 +283,16 @@ DECLARE_EVENT_CLASS(xprtrdma_mr,
),
TP_fast_assign(
- __entry->mr = mr;
+ __entry->mr_id = mr->frwr.fr_mr->res.id;
+ __entry->nents = mr->mr_nents;
__entry->handle = mr->mr_handle;
__entry->length = mr->mr_length;
__entry->offset = mr->mr_offset;
__entry->dir = mr->mr_dir;
),
- TP_printk("mr=%p %u@0x%016llx:0x%08x (%s)",
- __entry->mr, __entry->length,
+ TP_printk("mr.id=%u nents=%d %u@0x%016llx:0x%08x (%s)",
+ __entry->mr_id, __entry->nents, __entry->length,
(unsigned long long)__entry->offset, __entry->handle,
xprtrdma_show_direction(__entry->dir)
)
@@ -340,68 +342,37 @@ DECLARE_EVENT_CLASS(xprtrdma_cb_event,
** Connection events
**/
-TRACE_EVENT(xprtrdma_cm_event,
- TP_PROTO(
- const struct rpcrdma_xprt *r_xprt,
- struct rdma_cm_event *event
- ),
-
- TP_ARGS(r_xprt, event),
-
- TP_STRUCT__entry(
- __field(const void *, r_xprt)
- __field(unsigned int, event)
- __field(int, status)
- __string(addr, rpcrdma_addrstr(r_xprt))
- __string(port, rpcrdma_portstr(r_xprt))
- ),
-
- TP_fast_assign(
- __entry->r_xprt = r_xprt;
- __entry->event = event->event;
- __entry->status = event->status;
- __assign_str(addr, rpcrdma_addrstr(r_xprt));
- __assign_str(port, rpcrdma_portstr(r_xprt));
- ),
-
- TP_printk("peer=[%s]:%s r_xprt=%p: %s (%u/%d)",
- __get_str(addr), __get_str(port),
- __entry->r_xprt, rdma_show_cm_event(__entry->event),
- __entry->event, __entry->status
- )
-);
-
TRACE_EVENT(xprtrdma_inline_thresh,
TP_PROTO(
- const struct rpcrdma_xprt *r_xprt
+ const struct rpcrdma_ep *ep
),
- TP_ARGS(r_xprt),
+ TP_ARGS(ep),
TP_STRUCT__entry(
- __field(const void *, r_xprt)
__field(unsigned int, inline_send)
__field(unsigned int, inline_recv)
__field(unsigned int, max_send)
__field(unsigned int, max_recv)
- __string(addr, rpcrdma_addrstr(r_xprt))
- __string(port, rpcrdma_portstr(r_xprt))
+ __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6))
+ __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6))
),
TP_fast_assign(
- const struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ const struct rdma_cm_id *id = ep->re_id;
- __entry->r_xprt = r_xprt;
- __entry->inline_send = ep->rep_inline_send;
- __entry->inline_recv = ep->rep_inline_recv;
- __entry->max_send = ep->rep_max_inline_send;
- __entry->max_recv = ep->rep_max_inline_recv;
- __assign_str(addr, rpcrdma_addrstr(r_xprt));
- __assign_str(port, rpcrdma_portstr(r_xprt));
+ __entry->inline_send = ep->re_inline_send;
+ __entry->inline_recv = ep->re_inline_recv;
+ __entry->max_send = ep->re_max_inline_send;
+ __entry->max_recv = ep->re_max_inline_recv;
+ memcpy(__entry->srcaddr, &id->route.addr.src_addr,
+ sizeof(struct sockaddr_in6));
+ memcpy(__entry->dstaddr, &id->route.addr.dst_addr,
+ sizeof(struct sockaddr_in6));
),
- TP_printk("peer=[%s]:%s r_xprt=%p neg send/recv=%u/%u, calc send/recv=%u/%u",
- __get_str(addr), __get_str(port), __entry->r_xprt,
+ TP_printk("%pISpc -> %pISpc neg send/recv=%u/%u, calc send/recv=%u/%u",
+ __entry->srcaddr, __entry->dstaddr,
__entry->inline_send, __entry->inline_recv,
__entry->max_send, __entry->max_recv
)
@@ -409,11 +380,10 @@ TRACE_EVENT(xprtrdma_inline_thresh,
DEFINE_CONN_EVENT(connect);
DEFINE_CONN_EVENT(disconnect);
+DEFINE_CONN_EVENT(flush_dct);
DEFINE_RXPRT_EVENT(xprtrdma_create);
DEFINE_RXPRT_EVENT(xprtrdma_op_destroy);
-DEFINE_RXPRT_EVENT(xprtrdma_remove);
-DEFINE_RXPRT_EVENT(xprtrdma_reinsert);
DEFINE_RXPRT_EVENT(xprtrdma_op_inject_dsc);
DEFINE_RXPRT_EVENT(xprtrdma_op_close);
DEFINE_RXPRT_EVENT(xprtrdma_op_setport);
@@ -480,32 +450,33 @@ TRACE_EVENT(xprtrdma_op_set_cto,
TRACE_EVENT(xprtrdma_qp_event,
TP_PROTO(
- const struct rpcrdma_xprt *r_xprt,
+ const struct rpcrdma_ep *ep,
const struct ib_event *event
),
- TP_ARGS(r_xprt, event),
+ TP_ARGS(ep, event),
TP_STRUCT__entry(
- __field(const void *, r_xprt)
- __field(unsigned int, event)
+ __field(unsigned long, event)
__string(name, event->device->name)
- __string(addr, rpcrdma_addrstr(r_xprt))
- __string(port, rpcrdma_portstr(r_xprt))
+ __array(unsigned char, srcaddr, sizeof(struct sockaddr_in6))
+ __array(unsigned char, dstaddr, sizeof(struct sockaddr_in6))
),
TP_fast_assign(
- __entry->r_xprt = r_xprt;
+ const struct rdma_cm_id *id = ep->re_id;
+
__entry->event = event->event;
__assign_str(name, event->device->name);
- __assign_str(addr, rpcrdma_addrstr(r_xprt));
- __assign_str(port, rpcrdma_portstr(r_xprt));
+ memcpy(__entry->srcaddr, &id->route.addr.src_addr,
+ sizeof(struct sockaddr_in6));
+ memcpy(__entry->dstaddr, &id->route.addr.dst_addr,
+ sizeof(struct sockaddr_in6));
),
- TP_printk("peer=[%s]:%s r_xprt=%p: dev %s: %s (%u)",
- __get_str(addr), __get_str(port), __entry->r_xprt,
- __get_str(name), rdma_show_ib_event(__entry->event),
- __entry->event
+ TP_printk("%pISpc -> %pISpc device=%s %s (%lu)",
+ __entry->srcaddr, __entry->dstaddr, __get_str(name),
+ rdma_show_ib_event(__entry->event), __entry->event
)
);
@@ -801,7 +772,7 @@ TRACE_EVENT(xprtrdma_post_recvs,
__entry->r_xprt = r_xprt;
__entry->count = count;
__entry->status = status;
- __entry->posted = r_xprt->rx_ep.rep_receive_count;
+ __entry->posted = r_xprt->rx_ep->re_receive_count;
__assign_str(addr, rpcrdma_addrstr(r_xprt));
__assign_str(port, rpcrdma_portstr(r_xprt));
),
@@ -920,17 +891,17 @@ TRACE_EVENT(xprtrdma_frwr_alloc,
TP_ARGS(mr, rc),
TP_STRUCT__entry(
- __field(const void *, mr)
+ __field(u32, mr_id)
__field(int, rc)
),
TP_fast_assign(
- __entry->mr = mr;
- __entry->rc = rc;
+ __entry->mr_id = mr->frwr.fr_mr->res.id;
+ __entry->rc = rc;
),
- TP_printk("mr=%p: rc=%d",
- __entry->mr, __entry->rc
+ TP_printk("mr.id=%u: rc=%d",
+ __entry->mr_id, __entry->rc
)
);
@@ -943,7 +914,8 @@ TRACE_EVENT(xprtrdma_frwr_dereg,
TP_ARGS(mr, rc),
TP_STRUCT__entry(
- __field(const void *, mr)
+ __field(u32, mr_id)
+ __field(int, nents)
__field(u32, handle)
__field(u32, length)
__field(u64, offset)
@@ -952,7 +924,8 @@ TRACE_EVENT(xprtrdma_frwr_dereg,
),
TP_fast_assign(
- __entry->mr = mr;
+ __entry->mr_id = mr->frwr.fr_mr->res.id;
+ __entry->nents = mr->mr_nents;
__entry->handle = mr->mr_handle;
__entry->length = mr->mr_length;
__entry->offset = mr->mr_offset;
@@ -960,8 +933,8 @@ TRACE_EVENT(xprtrdma_frwr_dereg,
__entry->rc = rc;
),
- TP_printk("mr=%p %u@0x%016llx:0x%08x (%s): rc=%d",
- __entry->mr, __entry->length,
+ TP_printk("mr.id=%u nents=%d %u@0x%016llx:0x%08x (%s): rc=%d",
+ __entry->mr_id, __entry->nents, __entry->length,
(unsigned long long)__entry->offset, __entry->handle,
xprtrdma_show_direction(__entry->dir),
__entry->rc
@@ -977,21 +950,21 @@ TRACE_EVENT(xprtrdma_frwr_sgerr,
TP_ARGS(mr, sg_nents),
TP_STRUCT__entry(
- __field(const void *, mr)
+ __field(u32, mr_id)
__field(u64, addr)
__field(u32, dir)
__field(int, nents)
),
TP_fast_assign(
- __entry->mr = mr;
+ __entry->mr_id = mr->frwr.fr_mr->res.id;
__entry->addr = mr->mr_sg->dma_address;
__entry->dir = mr->mr_dir;
__entry->nents = sg_nents;
),
- TP_printk("mr=%p dma addr=0x%llx (%s) sg_nents=%d",
- __entry->mr, __entry->addr,
+ TP_printk("mr.id=%u DMA addr=0x%llx (%s) sg_nents=%d",
+ __entry->mr_id, __entry->addr,
xprtrdma_show_direction(__entry->dir),
__entry->nents
)
@@ -1006,7 +979,7 @@ TRACE_EVENT(xprtrdma_frwr_maperr,
TP_ARGS(mr, num_mapped),
TP_STRUCT__entry(
- __field(const void *, mr)
+ __field(u32, mr_id)
__field(u64, addr)
__field(u32, dir)
__field(int, num_mapped)
@@ -1014,15 +987,15 @@ TRACE_EVENT(xprtrdma_frwr_maperr,
),
TP_fast_assign(
- __entry->mr = mr;
+ __entry->mr_id = mr->frwr.fr_mr->res.id;
__entry->addr = mr->mr_sg->dma_address;
__entry->dir = mr->mr_dir;
__entry->num_mapped = num_mapped;
__entry->nents = mr->mr_nents;
),
- TP_printk("mr=%p dma addr=0x%llx (%s) nents=%d of %d",
- __entry->mr, __entry->addr,
+ TP_printk("mr.id=%u DMA addr=0x%llx (%s) nents=%d of %d",
+ __entry->mr_id, __entry->addr,
xprtrdma_show_direction(__entry->dir),
__entry->num_mapped, __entry->nents
)
@@ -1031,7 +1004,7 @@ TRACE_EVENT(xprtrdma_frwr_maperr,
DEFINE_MR_EVENT(localinv);
DEFINE_MR_EVENT(map);
DEFINE_MR_EVENT(unmap);
-DEFINE_MR_EVENT(remoteinv);
+DEFINE_MR_EVENT(reminv);
DEFINE_MR_EVENT(recycle);
TRACE_EVENT(xprtrdma_dma_maperr,
diff --git a/include/uapi/linux/um_timetravel.h b/include/uapi/linux/um_timetravel.h
new file mode 100644
index 000000000000..ca3238222b6d
--- /dev/null
+++ b/include/uapi/linux/um_timetravel.h
@@ -0,0 +1,128 @@
+/*
+ * Permission to use, copy, modify, and/or distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Copyright (C) 2019 Intel Corporation
+ */
+#ifndef _UAPI_LINUX_UM_TIMETRAVEL_H
+#define _UAPI_LINUX_UM_TIMETRAVEL_H
+#include <linux/types.h>
+
+/**
+ * struct um_timetravel_msg - UM time travel message
+ *
+ * This is the basic message type, going in both directions.
+ *
+ * This is the message passed between the host (user-mode Linux instance)
+ * and the calendar (the application on the other side of the socket) in
+ * order to implement common scheduling.
+ *
+ * Whenever UML has an event it will request runtime for it from the
+ * calendar, and then wait for its turn until it can run, etc. Note
+ * that it will only ever request the single next runtime, i.e. multiple
+ * REQUEST messages override each other.
+ */
+struct um_timetravel_msg {
+ /**
+ * @op: operation value from &enum um_timetravel_ops
+ */
+ __u32 op;
+
+ /**
+ * @seq: sequence number for the message - shall be reflected in
+ * the ACK response, and should be checked while processing
+ * the response to see if it matches
+ */
+ __u32 seq;
+
+ /**
+ * @time: time in nanoseconds
+ */
+ __u64 time;
+};
+
+/**
+ * enum um_timetravel_ops - Operation codes
+ */
+enum um_timetravel_ops {
+ /**
+ * @UM_TIMETRAVEL_ACK: response (ACK) to any previous message,
+ * this usually doesn't carry any data in the 'time' field
+ * unless otherwise specified below
+ */
+ UM_TIMETRAVEL_ACK = 0,
+
+ /**
+ * @UM_TIMETRAVEL_START: initialize the connection, the time
+ * field contains an (arbitrary) ID to possibly be able
+ * to distinguish the connections.
+ */
+ UM_TIMETRAVEL_START = 1,
+
+ /**
+ * @UM_TIMETRAVEL_REQUEST: request to run at the given time
+ * (host -> calendar)
+ */
+ UM_TIMETRAVEL_REQUEST = 2,
+
+ /**
+ * @UM_TIMETRAVEL_WAIT: Indicate waiting for the previously requested
+ * runtime, new requests may be made while waiting (e.g. due to
+ * interrupts); the time field is ignored. The calendar must process
+ * this message and later send a %UM_TIMETRAVEL_RUN message when
+ * the host can run again.
+ * (host -> calendar)
+ */
+ UM_TIMETRAVEL_WAIT = 3,
+
+ /**
+ * @UM_TIMETRAVEL_GET: return the current time from the calendar in the
+ * ACK message, the time in the request message is ignored
+ * (host -> calendar)
+ */
+ UM_TIMETRAVEL_GET = 4,
+
+ /**
+ * @UM_TIMETRAVEL_UPDATE: time update to the calendar, must be sent e.g.
+ * before kicking an interrupt to another calendar
+ * (host -> calendar)
+ */
+ UM_TIMETRAVEL_UPDATE = 5,
+
+ /**
+ * @UM_TIMETRAVEL_RUN: run time request granted, current time is in
+ * the time field
+ * (calendar -> host)
+ */
+ UM_TIMETRAVEL_RUN = 6,
+
+ /**
+ * @UM_TIMETRAVEL_FREE_UNTIL: Enable free-running until the given time,
+ * this is a message from the calendar telling the host that it can
+ * freely do its own scheduling for anything before the indicated
+ * time.
+ * Note that if a calendar sends this message once, the host may
+ * assume that it will also do so in the future, if it implements
+ * wraparound semantics for the time field.
+ * (calendar -> host)
+ */
+ UM_TIMETRAVEL_FREE_UNTIL = 7,
+
+ /**
+ * @UM_TIMETRAVEL_GET_TOD: Return time of day, typically used once at
+ * boot by the virtual machines to get a synchronized time from
+ * the simulation.
+ */
+ UM_TIMETRAVEL_GET_TOD = 8,
+};
+
+#endif /* _UAPI_LINUX_UM_TIMETRAVEL_H */
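
A small userspace-side sketch of the protocol defined above; the calendar is an ordinary application on the other end of the socket, and the helper name here is invented:

/* Build a REQUEST message asking to run at an absolute simulation time.
 * A real calendar/host would also check that the ACK it gets back carries
 * the same sequence number.
 */
#include <linux/um_timetravel.h>
#include <string.h>

static struct um_timetravel_msg make_request(__u32 seq, __u64 when_ns)
{
        struct um_timetravel_msg msg;

        memset(&msg, 0, sizeof(msg));
        msg.op = UM_TIMETRAVEL_REQUEST; /* a newer REQUEST overrides this one */
        msg.seq = seq;                  /* reflected in the ACK */
        msg.time = when_ns;             /* nanoseconds */
        return msg;
}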
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 5bf8d22a47ec..39d37d0ef575 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -1065,11 +1065,12 @@ static void neigh_timer_handler(struct timer_list *t)
neigh->updated = jiffies;
atomic_set(&neigh->probes, 0);
notify = 1;
- next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
+ next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
+ HZ/100);
}
} else {
/* NUD_PROBE|NUD_INCOMPLETE */
- next = now + NEIGH_VAR(neigh->parms, RETRANS_TIME);
+ next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME), HZ/100);
}
if ((neigh->nud_state & (NUD_INCOMPLETE | NUD_PROBE)) &&
@@ -1125,7 +1126,7 @@ int __neigh_event_send(struct neighbour *neigh, struct sk_buff *skb)
neigh->nud_state = NUD_INCOMPLETE;
neigh->updated = now;
next = now + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
- HZ/2);
+ HZ/100);
neigh_add_timer(neigh, next);
immediate_probe = true;
} else {
@@ -1427,7 +1428,8 @@ void __neigh_set_probe_once(struct neighbour *neigh)
neigh->nud_state = NUD_INCOMPLETE;
atomic_set(&neigh->probes, neigh_max_probes(neigh));
neigh_add_timer(neigh,
- jiffies + NEIGH_VAR(neigh->parms, RETRANS_TIME));
+ jiffies + max(NEIGH_VAR(neigh->parms, RETRANS_TIME),
+ HZ/100));
}
EXPORT_SYMBOL(__neigh_set_probe_once);
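
The repeated max(..., HZ/100) above simply puts a floor of one hundredth of a second under the configured retransmit interval; a sketch of the effect (helper name invented):

/* With HZ == 1000, HZ/100 is 10 jiffies, so even a RETRANS_TIME of 0
 * configured from userspace cannot schedule neighbour probes closer
 * together than roughly 10ms.
 */
#include <linux/jiffies.h>
#include <linux/kernel.h>

static unsigned long neigh_next_probe(unsigned long now,
                                      unsigned long retrans_time)
{
        return now + max(retrans_time, (unsigned long)HZ / 100);
}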
diff --git a/net/core/sock.c b/net/core/sock.c
index da32d9b6d09f..ce1d8dce9b7a 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -574,7 +574,7 @@ static int sock_setbindtodevice_locked(struct sock *sk, int ifindex)
/* Sorry... */
ret = -EPERM;
- if (!ns_capable(net->user_ns, CAP_NET_RAW))
+ if (sk->sk_bound_dev_if && !ns_capable(net->user_ns, CAP_NET_RAW))
goto out;
ret = -EINVAL;
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 5390ff541658..e94eb1aac602 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -1338,7 +1338,7 @@ static void dsa_hw_port_list_free(struct list_head *hw_port_list)
}
/* Make the hardware datapath to/from @dev limited to a common MTU */
-void dsa_bridge_mtu_normalization(struct dsa_port *dp)
+static void dsa_bridge_mtu_normalization(struct dsa_port *dp)
{
struct list_head hw_port_list;
struct dsa_switch_tree *dst;
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index a11fd4d67832..24e319dfb510 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -1357,7 +1357,7 @@ retry:
regen_advance = idev->cnf.regen_max_retry *
idev->cnf.dad_transmits *
- NEIGH_VAR(idev->nd_parms, RETRANS_TIME) / HZ;
+ max(NEIGH_VAR(idev->nd_parms, RETRANS_TIME), HZ/100) / HZ;
/* recalculate max_desync_factor each time and update
* idev->desync_factor if it's larger
@@ -3298,6 +3298,10 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
if (netif_is_l3_master(idev->dev))
return;
+ /* no link local addresses on devices flagged as slaves */
+ if (idev->dev->flags & IFF_SLAVE)
+ return;
+
ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0);
switch (idev->cnf.addr_gen_mode) {
@@ -4117,7 +4121,8 @@ static void addrconf_dad_work(struct work_struct *w)
ifp->dad_probes--;
addrconf_mod_dad_work(ifp,
- NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME));
+ max(NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME),
+ HZ/100));
spin_unlock(&ifp->lock);
write_unlock_bh(&idev->lock);
@@ -4523,7 +4528,7 @@ restart:
!(ifp->flags&IFA_F_TENTATIVE)) {
unsigned long regen_advance = ifp->idev->cnf.regen_max_retry *
ifp->idev->cnf.dad_transmits *
- NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME) / HZ;
+ max(NEIGH_VAR(ifp->idev->nd_parms, RETRANS_TIME), HZ/100) / HZ;
if (age >= ifp->prefered_lft - regen_advance) {
struct inet6_ifaddr *ifpub = ifp->ifpub;
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 6ffa153e5166..1ecd4e9b0bdf 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -1359,8 +1359,8 @@ skip_defrtr:
if (rtime && rtime/1000 < MAX_SCHEDULE_TIMEOUT/HZ) {
rtime = (rtime*HZ)/1000;
- if (rtime < HZ/10)
- rtime = HZ/10;
+ if (rtime < HZ/100)
+ rtime = HZ/100;
NEIGH_VAR_SET(in6_dev->nd_parms, RETRANS_TIME, rtime);
in6_dev->tstamp = jiffies;
send_ifinfo_notify = true;
diff --git a/net/ipv6/rpl.c b/net/ipv6/rpl.c
index dc4f20e23bf7..d38b476fc7f2 100644
--- a/net/ipv6/rpl.c
+++ b/net/ipv6/rpl.c
@@ -48,7 +48,7 @@ void ipv6_rpl_srh_decompress(struct ipv6_rpl_sr_hdr *outhdr,
outhdr->cmpri = 0;
outhdr->cmpre = 0;
- for (i = 0; i <= n; i++)
+ for (i = 0; i < n; i++)
ipv6_rpl_addr_decompress(&outhdr->rpl_segaddr[i], daddr,
ipv6_rpl_segdata_pos(inhdr, i),
inhdr->cmpri);
@@ -66,7 +66,7 @@ static unsigned char ipv6_rpl_srh_calc_cmpri(const struct ipv6_rpl_sr_hdr *inhdr
int i;
for (plen = 0; plen < sizeof(*daddr); plen++) {
- for (i = 0; i <= n; i++) {
+ for (i = 0; i < n; i++) {
if (daddr->s6_addr[plen] !=
inhdr->rpl_segaddr[i].s6_addr[plen])
return plen;
@@ -114,7 +114,7 @@ void ipv6_rpl_srh_compress(struct ipv6_rpl_sr_hdr *outhdr,
outhdr->cmpri = cmpri;
outhdr->cmpre = cmpre;
- for (i = 0; i <= n; i++)
+ for (i = 0; i < n; i++)
ipv6_rpl_addr_compress(ipv6_rpl_segdata_pos(outhdr, i),
&inhdr->rpl_segaddr[i], cmpri);
diff --git a/net/ipv6/rpl_iptunnel.c b/net/ipv6/rpl_iptunnel.c
index a49ddc6cd020..c3ececd7cfc1 100644
--- a/net/ipv6/rpl_iptunnel.c
+++ b/net/ipv6/rpl_iptunnel.c
@@ -210,7 +210,7 @@ static int rpl_output(struct net *net, struct sock *sk, struct sk_buff *skb)
struct dst_entry *orig_dst = skb_dst(skb);
struct dst_entry *dst = NULL;
struct rpl_lwt *rlwt;
- int err = -EINVAL;
+ int err;
rlwt = rpl_lwt_lwtunnel(orig_dst->lwtstate);
diff --git a/net/mptcp/options.c b/net/mptcp/options.c
index bd220ee4aac9..faf57585b892 100644
--- a/net/mptcp/options.c
+++ b/net/mptcp/options.c
@@ -4,6 +4,8 @@
* Copyright (c) 2017 - 2019, Intel Corporation.
*/
+#define pr_fmt(fmt) "MPTCP: " fmt
+
#include <linux/kernel.h>
#include <net/tcp.h>
#include <net/mptcp.h>
diff --git a/net/mptcp/pm.c b/net/mptcp/pm.c
index 064639f72487..977d9c8b1453 100644
--- a/net/mptcp/pm.c
+++ b/net/mptcp/pm.c
@@ -3,6 +3,8 @@
*
* Copyright (c) 2019, Intel Corporation.
*/
+#define pr_fmt(fmt) "MPTCP: " fmt
+
#include <linux/kernel.h>
#include <net/tcp.h>
#include <net/mptcp.h>
diff --git a/net/mptcp/pm_netlink.c b/net/mptcp/pm_netlink.c
index a0ce7f324499..86d61ab34c7c 100644
--- a/net/mptcp/pm_netlink.c
+++ b/net/mptcp/pm_netlink.c
@@ -4,6 +4,8 @@
* Copyright (c) 2020, Red Hat, Inc.
*/
+#define pr_fmt(fmt) "MPTCP: " fmt
+
#include <linux/inet.h>
#include <linux/kernel.h>
#include <net/tcp.h>
diff --git a/net/mptcp/protocol.c b/net/mptcp/protocol.c
index 1833bc1f4a43..939a5045181a 100644
--- a/net/mptcp/protocol.c
+++ b/net/mptcp/protocol.c
@@ -57,10 +57,43 @@ static bool __mptcp_needs_tcp_fallback(const struct mptcp_sock *msk)
return msk->first && !sk_is_mptcp(msk->first);
}
+static struct socket *mptcp_is_tcpsk(struct sock *sk)
+{
+ struct socket *sock = sk->sk_socket;
+
+ if (sock->sk != sk)
+ return NULL;
+
+ if (unlikely(sk->sk_prot == &tcp_prot)) {
+ /* we are being invoked after mptcp_accept() has
+ * accepted a non-mp-capable flow: sk is a tcp_sk,
+ * not an mptcp one.
+ *
+ * Hand the socket over to tcp so all further socket ops
+ * bypass mptcp.
+ */
+ sock->ops = &inet_stream_ops;
+ return sock;
+#if IS_ENABLED(CONFIG_MPTCP_IPV6)
+ } else if (unlikely(sk->sk_prot == &tcpv6_prot)) {
+ sock->ops = &inet6_stream_ops;
+ return sock;
+#endif
+ }
+
+ return NULL;
+}
+
static struct socket *__mptcp_tcp_fallback(struct mptcp_sock *msk)
{
+ struct socket *sock;
+
sock_owned_by_me((const struct sock *)msk);
+ sock = mptcp_is_tcpsk((struct sock *)msk);
+ if (unlikely(sock))
+ return sock;
+
if (likely(!__mptcp_needs_tcp_fallback(msk)))
return NULL;
@@ -84,6 +117,10 @@ static struct socket *__mptcp_socket_create(struct mptcp_sock *msk, int state)
struct socket *ssock;
int err;
+ ssock = __mptcp_tcp_fallback(msk);
+ if (unlikely(ssock))
+ return ssock;
+
ssock = __mptcp_nmpc_socket(msk);
if (ssock)
goto set_state;
@@ -121,6 +158,27 @@ static void __mptcp_move_skb(struct mptcp_sock *msk, struct sock *ssk,
MPTCP_SKB_CB(skb)->offset = offset;
}
+/* both sockets must be locked */
+static bool mptcp_subflow_dsn_valid(const struct mptcp_sock *msk,
+ struct sock *ssk)
+{
+ struct mptcp_subflow_context *subflow = mptcp_subflow_ctx(ssk);
+ u64 dsn = mptcp_subflow_get_mapped_dsn(subflow);
+
+ /* revalidate data sequence number.
+ *
+ * mptcp_subflow_data_available() is usually called
+ * without the msk lock. It's unlikely (but possible)
+ * that msk->ack_seq has been advanced since the last
+ * call found in-sequence data.
+ */
+ if (likely(dsn == msk->ack_seq))
+ return true;
+
+ subflow->data_avail = 0;
+ return mptcp_subflow_data_available(ssk);
+}
+
static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
struct sock *ssk,
unsigned int *bytes)
@@ -132,6 +190,11 @@ static bool __mptcp_move_skbs_from_subflow(struct mptcp_sock *msk,
struct tcp_sock *tp;
bool done = false;
+ if (!mptcp_subflow_dsn_valid(msk, ssk)) {
+ *bytes = 0;
+ return false;
+ }
+
if (!(sk->sk_userlocks & SOCK_RCVBUF_LOCK)) {
int rcvbuf = max(ssk->sk_rcvbuf, sk->sk_rcvbuf);
@@ -290,6 +353,15 @@ void mptcp_data_acked(struct sock *sk)
sock_hold(sk);
}
+void mptcp_subflow_eof(struct sock *sk)
+{
+ struct mptcp_sock *msk = mptcp_sk(sk);
+
+ if (!test_and_set_bit(MPTCP_WORK_EOF, &msk->flags) &&
+ schedule_work(&msk->work))
+ sock_hold(sk);
+}
+
static void mptcp_stop_timer(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
@@ -994,6 +1066,27 @@ static unsigned int mptcp_sync_mss(struct sock *sk, u32 pmtu)
return 0;
}
+static void mptcp_check_for_eof(struct mptcp_sock *msk)
+{
+ struct mptcp_subflow_context *subflow;
+ struct sock *sk = (struct sock *)msk;
+ int receivers = 0;
+
+ mptcp_for_each_subflow(msk, subflow)
+ receivers += !subflow->rx_eof;
+
+ if (!receivers && !(sk->sk_shutdown & RCV_SHUTDOWN)) {
+ /* hopefully temporary hack: propagate shutdown status
+ * to msk, when all subflows agree on it
+ */
+ sk->sk_shutdown |= RCV_SHUTDOWN;
+
+ smp_mb__before_atomic(); /* SHUTDOWN must be visible first */
+ set_bit(MPTCP_DATA_READY, &msk->flags);
+ sk->sk_data_ready(sk);
+ }
+}
+
static void mptcp_worker(struct work_struct *work)
{
struct mptcp_sock *msk = container_of(work, struct mptcp_sock, work);
@@ -1010,6 +1103,9 @@ static void mptcp_worker(struct work_struct *work)
__mptcp_flush_join_list(msk);
__mptcp_move_skbs(msk);
+ if (test_and_clear_bit(MPTCP_WORK_EOF, &msk->flags))
+ mptcp_check_for_eof(msk);
+
if (!test_and_clear_bit(MPTCP_WORK_RTX, &msk->flags))
goto unlock;
@@ -1752,7 +1848,9 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
msk = mptcp_sk(sk);
lock_sock(sk);
- ssock = __mptcp_nmpc_socket(msk);
+ ssock = __mptcp_tcp_fallback(msk);
+ if (!ssock)
+ ssock = __mptcp_nmpc_socket(msk);
if (ssock) {
mask = ssock->ops->poll(file, ssock, wait);
release_sock(sk);
@@ -1762,9 +1860,6 @@ static __poll_t mptcp_poll(struct file *file, struct socket *sock,
release_sock(sk);
sock_poll_wait(file, sock, wait);
lock_sock(sk);
- ssock = __mptcp_tcp_fallback(msk);
- if (unlikely(ssock))
- return ssock->ops->poll(file, ssock, NULL);
if (test_bit(MPTCP_DATA_READY, &msk->flags))
mask = EPOLLIN | EPOLLRDNORM;
@@ -1783,11 +1878,17 @@ static int mptcp_shutdown(struct socket *sock, int how)
{
struct mptcp_sock *msk = mptcp_sk(sock->sk);
struct mptcp_subflow_context *subflow;
+ struct socket *ssock;
int ret = 0;
pr_debug("sk=%p, how=%d", msk, how);
lock_sock(sock->sk);
+ ssock = __mptcp_tcp_fallback(msk);
+ if (ssock) {
+ release_sock(sock->sk);
+ return inet_shutdown(ssock, how);
+ }
if (how == SHUT_WR || how == SHUT_RDWR)
inet_sk_state_store(sock->sk, TCP_FIN_WAIT1);
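
mptcp_subflow_eof() above uses the same arm-the-worker pattern as mptcp_data_acked(); a generic sketch of that pattern with invented names:

/* Queue the worker at most once per pending event: the flag bit keeps
 * duplicate events from re-queueing the work, and the extra socket
 * reference keeps the socket alive until the worker runs and drops it.
 */
#include <linux/bitops.h>
#include <linux/workqueue.h>
#include <net/sock.h>

static void arm_worker(struct sock *sk, unsigned long *flags, int event_bit,
                       struct work_struct *work)
{
        if (!test_and_set_bit(event_bit, flags) && schedule_work(work))
                sock_hold(sk);  /* released by the worker when it finishes */
}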
diff --git a/net/mptcp/protocol.h b/net/mptcp/protocol.h
index f733c5425552..67448002a2d7 100644
--- a/net/mptcp/protocol.h
+++ b/net/mptcp/protocol.h
@@ -89,6 +89,7 @@
#define MPTCP_DATA_READY 0
#define MPTCP_SEND_SPACE 1
#define MPTCP_WORK_RTX 2
+#define MPTCP_WORK_EOF 3
static inline __be32 mptcp_option(u8 subopt, u8 len, u8 nib, u8 field)
{
@@ -339,6 +340,7 @@ void mptcp_finish_connect(struct sock *sk);
void mptcp_data_ready(struct sock *sk, struct sock *ssk);
bool mptcp_finish_join(struct sock *sk);
void mptcp_data_acked(struct sock *sk);
+void mptcp_subflow_eof(struct sock *sk);
int mptcp_token_new_request(struct request_sock *req);
void mptcp_token_destroy_request(u32 token);
diff --git a/net/mptcp/subflow.c b/net/mptcp/subflow.c
index b5180c81588e..50a8bea987c6 100644
--- a/net/mptcp/subflow.c
+++ b/net/mptcp/subflow.c
@@ -994,8 +994,7 @@ static void subflow_state_change(struct sock *sk)
if (!(parent->sk_shutdown & RCV_SHUTDOWN) &&
!subflow->rx_eof && subflow_is_done(sk)) {
subflow->rx_eof = 1;
- parent->sk_shutdown |= RCV_SHUTDOWN;
- __subflow_state_change(parent);
+ mptcp_subflow_eof(parent);
}
}
diff --git a/net/mptcp/token.c b/net/mptcp/token.c
index 129a5ad1bc35..33352dd99d4d 100644
--- a/net/mptcp/token.c
+++ b/net/mptcp/token.c
@@ -40,7 +40,7 @@ static int token_used __read_mostly;
/**
* mptcp_token_new_request - create new key/idsn/token for subflow_request
- * @req - the request socket
+ * @req: the request socket
*
* This function is called when a new mptcp connection is coming in.
*
@@ -80,7 +80,7 @@ int mptcp_token_new_request(struct request_sock *req)
/**
* mptcp_token_new_connect - create new key/idsn/token for subflow
- * @sk - the socket that will initiate a connection
+ * @sk: the socket that will initiate a connection
*
* This function is called when a new outgoing mptcp connection is
* initiated.
@@ -125,6 +125,7 @@ int mptcp_token_new_connect(struct sock *sk)
/**
* mptcp_token_new_accept - insert token for later processing
* @token: the token to insert to the tree
+ * @conn: the just cloned socket linked to the new connection
*
* Called when a SYN packet creates a new logical connection, i.e.
* is not a join request.
@@ -169,7 +170,7 @@ struct mptcp_sock *mptcp_token_get_sock(u32 token)
/**
* mptcp_token_destroy_request - remove mptcp connection/token
- * @token - token of mptcp connection to remove
+ * @token: token of mptcp connection to remove
*
* Remove not-yet-fully-established incoming connection identified
* by @token.
@@ -183,7 +184,7 @@ void mptcp_token_destroy_request(u32 token)
/**
* mptcp_token_destroy - remove mptcp connection/token
- * @token - token of mptcp connection to remove
+ * @token: token of mptcp connection to remove
*
* Remove the connection identified by @token.
*/
diff --git a/net/openvswitch/flow_table.c b/net/openvswitch/flow_table.c
index fd8a01ca7a2d..2398d7238300 100644
--- a/net/openvswitch/flow_table.c
+++ b/net/openvswitch/flow_table.c
@@ -462,12 +462,14 @@ static void flow_table_copy_flows(struct table_instance *old,
struct hlist_head *head = &old->buckets[i];
if (ufid)
- hlist_for_each_entry(flow, head,
- ufid_table.node[old_ver])
+ hlist_for_each_entry_rcu(flow, head,
+ ufid_table.node[old_ver],
+ lockdep_ovsl_is_held())
ufid_table_instance_insert(new, flow);
else
- hlist_for_each_entry(flow, head,
- flow_table.node[old_ver])
+ hlist_for_each_entry_rcu(flow, head,
+ flow_table.node[old_ver],
+ lockdep_ovsl_is_held())
table_instance_insert(new, flow);
}
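
The extra argument added to hlist_for_each_entry_rcu() above is a lockdep condition stating that the walk is also legal while ovs_mutex is held; a self-contained sketch of the same annotation with invented names:

/* Walk an RCU-protected hash chain that may be traversed either under
 * rcu_read_lock() or with the updater-side mutex held; the lockdep
 * condition documents (and, with PROVE_RCU, checks) exactly that.
 */
#include <linux/mutex.h>
#include <linux/rculist.h>

static DEFINE_MUTEX(table_mutex);

struct table_item {
        struct hlist_node node;
        int key;
};

static struct table_item *table_find(struct hlist_head *head, int key)
{
        struct table_item *it;

        hlist_for_each_entry_rcu(it, head, node,
                                 lockdep_is_held(&table_mutex))
                if (it->key == key)
                        return it;
        return NULL;
}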
diff --git a/net/sched/cls_tcindex.c b/net/sched/cls_tcindex.c
index 9904299424a1..61e95029c18f 100644
--- a/net/sched/cls_tcindex.c
+++ b/net/sched/cls_tcindex.c
@@ -11,6 +11,7 @@
#include <linux/skbuff.h>
#include <linux/errno.h>
#include <linux/slab.h>
+#include <linux/refcount.h>
#include <net/act_api.h>
#include <net/netlink.h>
#include <net/pkt_cls.h>
@@ -26,9 +27,12 @@
#define DEFAULT_HASH_SIZE 64 /* optimized for diffserv */
+struct tcindex_data;
+
struct tcindex_filter_result {
struct tcf_exts exts;
struct tcf_result res;
+ struct tcindex_data *p;
struct rcu_work rwork;
};
@@ -49,6 +53,7 @@ struct tcindex_data {
u32 hash; /* hash table size; 0 if undefined */
u32 alloc_hash; /* allocated size */
u32 fall_through; /* 0: only classify if explicit match */
+ refcount_t refcnt; /* a temporary refcnt for perfect hash */
struct rcu_work rwork;
};
@@ -57,6 +62,20 @@ static inline int tcindex_filter_is_set(struct tcindex_filter_result *r)
return tcf_exts_has_actions(&r->exts) || r->res.classid;
}
+static void tcindex_data_get(struct tcindex_data *p)
+{
+ refcount_inc(&p->refcnt);
+}
+
+static void tcindex_data_put(struct tcindex_data *p)
+{
+ if (refcount_dec_and_test(&p->refcnt)) {
+ kfree(p->perfect);
+ kfree(p->h);
+ kfree(p);
+ }
+}
+
static struct tcindex_filter_result *tcindex_lookup(struct tcindex_data *p,
u16 key)
{
@@ -132,6 +151,7 @@ static int tcindex_init(struct tcf_proto *tp)
p->mask = 0xffff;
p->hash = DEFAULT_HASH_SIZE;
p->fall_through = 1;
+ refcount_set(&p->refcnt, 1); /* Paired with tcindex_destroy_work() */
rcu_assign_pointer(tp->root, p);
return 0;
@@ -141,6 +161,7 @@ static void __tcindex_destroy_rexts(struct tcindex_filter_result *r)
{
tcf_exts_destroy(&r->exts);
tcf_exts_put_net(&r->exts);
+ tcindex_data_put(r->p);
}
static void tcindex_destroy_rexts_work(struct work_struct *work)
@@ -212,6 +233,8 @@ found:
else
__tcindex_destroy_fexts(f);
} else {
+ tcindex_data_get(p);
+
if (tcf_exts_get_net(&r->exts))
tcf_queue_work(&r->rwork, tcindex_destroy_rexts_work);
else
@@ -228,9 +251,7 @@ static void tcindex_destroy_work(struct work_struct *work)
struct tcindex_data,
rwork);
- kfree(p->perfect);
- kfree(p->h);
- kfree(p);
+ tcindex_data_put(p);
}
static inline int
@@ -248,9 +269,11 @@ static const struct nla_policy tcindex_policy[TCA_TCINDEX_MAX + 1] = {
};
static int tcindex_filter_result_init(struct tcindex_filter_result *r,
+ struct tcindex_data *p,
struct net *net)
{
memset(r, 0, sizeof(*r));
+ r->p = p;
return tcf_exts_init(&r->exts, net, TCA_TCINDEX_ACT,
TCA_TCINDEX_POLICE);
}
@@ -290,6 +313,7 @@ static int tcindex_alloc_perfect_hash(struct net *net, struct tcindex_data *cp)
TCA_TCINDEX_ACT, TCA_TCINDEX_POLICE);
if (err < 0)
goto errout;
+ cp->perfect[i].p = cp;
}
return 0;
@@ -334,6 +358,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
cp->alloc_hash = p->alloc_hash;
cp->fall_through = p->fall_through;
cp->tp = tp;
+ refcount_set(&cp->refcnt, 1); /* Paired with tcindex_destroy_work() */
if (tb[TCA_TCINDEX_HASH])
cp->hash = nla_get_u32(tb[TCA_TCINDEX_HASH]);
@@ -366,7 +391,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
}
cp->h = p->h;
- err = tcindex_filter_result_init(&new_filter_result, net);
+ err = tcindex_filter_result_init(&new_filter_result, cp, net);
if (err < 0)
goto errout_alloc;
if (old_r)
@@ -434,7 +459,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
goto errout_alloc;
f->key = handle;
f->next = NULL;
- err = tcindex_filter_result_init(&f->result, net);
+ err = tcindex_filter_result_init(&f->result, cp, net);
if (err < 0) {
kfree(f);
goto errout_alloc;
@@ -447,7 +472,7 @@ tcindex_set_parms(struct net *net, struct tcf_proto *tp, unsigned long base,
}
if (old_r && old_r != r) {
- err = tcindex_filter_result_init(old_r, net);
+ err = tcindex_filter_result_init(old_r, cp, net);
if (err < 0) {
kfree(f);
goto errout_alloc;
@@ -571,6 +596,14 @@ static void tcindex_destroy(struct tcf_proto *tp, bool rtnl_held,
for (i = 0; i < p->hash; i++) {
struct tcindex_filter_result *r = p->perfect + i;
+ /* tcf_queue_work() does not guarantee the ordering we
+ * want, so we have to take this refcnt temporarily to
+ * ensure 'p' is freed after all tcindex_filter_result
+ * here. Imperfect hash does not need this, because it
+ * uses linked lists rather than an array.
+ */
+ tcindex_data_get(p);
+
tcf_unbind_filter(tp, &r->res);
if (tcf_exts_get_net(&r->exts))
tcf_queue_work(&r->rwork,
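
The get/put pair introduced above is what makes the deferred teardown safe: the table starts at refcount 1 for its owner, every queued destroy work takes one more reference before it is scheduled, and whichever put runs last frees the arrays. A stripped-down sketch of the same refcount_t pattern (names are illustrative, not the cls_tcindex code):

#include <linux/refcount.h>
#include <linux/slab.h>

struct demo_table {
	refcount_t refcnt;
	void *slots;
};

static struct demo_table *demo_table_alloc(void)
{
	struct demo_table *t = kzalloc(sizeof(*t), GFP_KERNEL);

	if (t)
		refcount_set(&t->refcnt, 1);	/* owner's reference */
	return t;
}

static void demo_table_get(struct demo_table *t)
{
	refcount_inc(&t->refcnt);		/* one per queued work */
}

static void demo_table_put(struct demo_table *t)
{
	/* Last reference dropped: only now is it safe to free. */
	if (refcount_dec_and_test(&t->refcnt)) {
		kfree(t->slots);
		kfree(t);
	}
}
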
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index ee060d57d216..25fbd8d9de74 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -20,6 +20,7 @@
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/auth.h>
#include <linux/sunrpc/auth_gss.h>
+#include <linux/sunrpc/gss_krb5.h>
#include <linux/sunrpc/svcauth_gss.h>
#include <linux/sunrpc/gss_err.h>
#include <linux/workqueue.h>
@@ -1050,7 +1051,7 @@ gss_create_new(const struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
goto err_put_mech;
auth = &gss_auth->rpc_auth;
auth->au_cslack = GSS_CRED_SLACK >> 2;
- auth->au_rslack = GSS_VERF_SLACK >> 2;
+ auth->au_rslack = GSS_KRB5_MAX_SLACK_NEEDED >> 2;
auth->au_verfsize = GSS_VERF_SLACK >> 2;
auth->au_ralign = GSS_VERF_SLACK >> 2;
auth->au_flags = 0;
@@ -1724,8 +1725,9 @@ bad_mic:
goto out;
}
-static int gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
- struct rpc_task *task, struct xdr_stream *xdr)
+static noinline_for_stack int
+gss_wrap_req_integ(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
+ struct rpc_task *task, struct xdr_stream *xdr)
{
struct rpc_rqst *rqstp = task->tk_rqstp;
struct xdr_buf integ_buf, *snd_buf = &rqstp->rq_snd_buf;
@@ -1816,8 +1818,9 @@ out:
return -EAGAIN;
}
-static int gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
- struct rpc_task *task, struct xdr_stream *xdr)
+static noinline_for_stack int
+gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
+ struct rpc_task *task, struct xdr_stream *xdr)
{
struct rpc_rqst *rqstp = task->tk_rqstp;
struct xdr_buf *snd_buf = &rqstp->rq_snd_buf;
@@ -1934,35 +1937,69 @@ gss_unwrap_resp_auth(struct rpc_cred *cred)
return 0;
}
-static int
+/*
+ * RFC 2203, Section 5.3.2.2
+ *
+ * struct rpc_gss_integ_data {
+ * opaque databody_integ<>;
+ * opaque checksum<>;
+ * };
+ *
+ * struct rpc_gss_data_t {
+ * unsigned int seq_num;
+ * proc_req_arg_t arg;
+ * };
+ */
+static noinline_for_stack int
gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred,
struct gss_cl_ctx *ctx, struct rpc_rqst *rqstp,
struct xdr_stream *xdr)
{
- struct xdr_buf integ_buf, *rcv_buf = &rqstp->rq_rcv_buf;
- u32 data_offset, mic_offset, integ_len, maj_stat;
+ struct xdr_buf gss_data, *rcv_buf = &rqstp->rq_rcv_buf;
struct rpc_auth *auth = cred->cr_auth;
+ u32 len, offset, seqno, maj_stat;
struct xdr_netobj mic;
- __be32 *p;
+ int ret;
- p = xdr_inline_decode(xdr, 2 * sizeof(*p));
- if (unlikely(!p))
+ ret = -EIO;
+ mic.data = NULL;
+
+ /* opaque databody_integ<>; */
+ if (xdr_stream_decode_u32(xdr, &len))
goto unwrap_failed;
- integ_len = be32_to_cpup(p++);
- if (integ_len & 3)
+ if (len & 3)
goto unwrap_failed;
- data_offset = (u8 *)(p) - (u8 *)rcv_buf->head[0].iov_base;
- mic_offset = integ_len + data_offset;
- if (mic_offset > rcv_buf->len)
+ offset = rcv_buf->len - xdr_stream_remaining(xdr);
+ if (xdr_stream_decode_u32(xdr, &seqno))
goto unwrap_failed;
- if (be32_to_cpup(p) != rqstp->rq_seqno)
+ if (seqno != rqstp->rq_seqno)
goto bad_seqno;
+ if (xdr_buf_subsegment(rcv_buf, &gss_data, offset, len))
+ goto unwrap_failed;
- if (xdr_buf_subsegment(rcv_buf, &integ_buf, data_offset, integ_len))
+ /*
+ * The xdr_stream now points to the beginning of the
+ * upper layer payload, to be passed below to
+ * rpcauth_unwrap_resp_decode(). The checksum, which
+ * follows the upper layer payload in @rcv_buf, is
+ * located and parsed without updating the xdr_stream.
+ */
+
+ /* opaque checksum<>; */
+ offset += len;
+ if (xdr_decode_word(rcv_buf, offset, &len))
+ goto unwrap_failed;
+ offset += sizeof(__be32);
+ if (offset + len > rcv_buf->len)
goto unwrap_failed;
- if (xdr_buf_read_mic(rcv_buf, &mic, mic_offset))
+ mic.len = len;
+ mic.data = kmalloc(len, GFP_NOFS);
+ if (!mic.data)
+ goto unwrap_failed;
+ if (read_bytes_from_xdr_buf(rcv_buf, offset, mic.data, mic.len))
goto unwrap_failed;
- maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &integ_buf, &mic);
+
+ maj_stat = gss_verify_mic(ctx->gc_gss_ctx, &gss_data, &mic);
if (maj_stat == GSS_S_CONTEXT_EXPIRED)
clear_bit(RPCAUTH_CRED_UPTODATE, &cred->cr_flags);
if (maj_stat != GSS_S_COMPLETE)
@@ -1970,19 +2007,24 @@ gss_unwrap_resp_integ(struct rpc_task *task, struct rpc_cred *cred,
auth->au_rslack = auth->au_verfsize + 2 + 1 + XDR_QUADLEN(mic.len);
auth->au_ralign = auth->au_verfsize + 2;
- return 0;
+ ret = 0;
+
+out:
+ kfree(mic.data);
+ return ret;
+
unwrap_failed:
trace_rpcgss_unwrap_failed(task);
- return -EIO;
+ goto out;
bad_seqno:
- trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, be32_to_cpup(p));
- return -EIO;
+ trace_rpcgss_bad_seqno(task, rqstp->rq_seqno, seqno);
+ goto out;
bad_mic:
trace_rpcgss_verify_mic(task, maj_stat);
- return -EIO;
+ goto out;
}
-static int
+static noinline_for_stack int
gss_unwrap_resp_priv(struct rpc_task *task, struct rpc_cred *cred,
struct gss_cl_ctx *ctx, struct rpc_rqst *rqstp,
struct xdr_stream *xdr)
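
The rewritten gss_unwrap_resp_integ() walks the RFC 2203 rpc_gss_integ_data layout directly: a length-prefixed body (sequence number plus the RPC result), followed by a length-prefixed checksum that is copied into a private buffer for gss_verify_mic(). A small user-space sketch of that layout with made-up bytes, just to show where the offsets land:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	unsigned char buf[] = {
		0x00, 0x00, 0x00, 0x08,	/* databody_integ length = 8 */
		0x00, 0x00, 0x00, 0x2a,	/* seq_num = 42              */
		0xde, 0xad, 0xbe, 0xef,	/* upper-layer RPC result    */
		0x00, 0x00, 0x00, 0x04,	/* checksum length = 4       */
		0x01, 0x02, 0x03, 0x04,	/* checksum (MIC) bytes      */
	};
	uint32_t len, seq, miclen;
	size_t off = 0;

	memcpy(&len, buf + off, 4); len = ntohl(len); off += 4;
	memcpy(&seq, buf + off, 4); seq = ntohl(seq);
	/* The MIC covers [off, off + len): seq_num plus the result. */
	off += len;
	memcpy(&miclen, buf + off, 4); miclen = ntohl(miclen); off += 4;
	printf("body len %u, seq %u, checksum len %u at offset %zu\n",
	       len, seq, miclen, off);
	return 0;
}
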
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 07992d3e8703..325a0858700f 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -1099,8 +1099,9 @@ rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg)
task->tk_msg.rpc_proc = msg->rpc_proc;
task->tk_msg.rpc_argp = msg->rpc_argp;
task->tk_msg.rpc_resp = msg->rpc_resp;
- if (msg->rpc_cred != NULL)
- task->tk_msg.rpc_cred = get_cred(msg->rpc_cred);
+ task->tk_msg.rpc_cred = msg->rpc_cred;
+ if (!(task->tk_flags & RPC_TASK_CRED_NOREF))
+ get_cred(task->tk_msg.rpc_cred);
}
}
@@ -1126,6 +1127,9 @@ struct rpc_task *rpc_run_task(const struct rpc_task_setup *task_setup_data)
task = rpc_new_task(task_setup_data);
+ if (!RPC_IS_ASYNC(task))
+ task->tk_flags |= RPC_TASK_CRED_NOREF;
+
rpc_task_set_client(task, task_setup_data->rpc_client);
rpc_task_set_rpc_message(task, task_setup_data->rpc_message);
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index 55e900255b0c..7eba20a88438 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -204,10 +204,6 @@ static void __rpc_add_wait_queue(struct rpc_wait_queue *queue,
struct rpc_task *task,
unsigned char queue_priority)
{
- WARN_ON_ONCE(RPC_IS_QUEUED(task));
- if (RPC_IS_QUEUED(task))
- return;
-
INIT_LIST_HEAD(&task->u.tk_wait.timer_list);
if (RPC_IS_PRIORITY(queue))
__rpc_add_wait_queue_priority(queue, task, queue_priority);
@@ -382,7 +378,7 @@ static void rpc_make_runnable(struct workqueue_struct *wq,
* NB: An RPC task will only receive interrupt-driven events as long
* as it's on a wait queue.
*/
-static void __rpc_sleep_on_priority(struct rpc_wait_queue *q,
+static void __rpc_do_sleep_on_priority(struct rpc_wait_queue *q,
struct rpc_task *task,
unsigned char queue_priority)
{
@@ -395,12 +391,23 @@ static void __rpc_sleep_on_priority(struct rpc_wait_queue *q,
}
+static void __rpc_sleep_on_priority(struct rpc_wait_queue *q,
+ struct rpc_task *task,
+ unsigned char queue_priority)
+{
+ if (WARN_ON_ONCE(RPC_IS_QUEUED(task)))
+ return;
+ __rpc_do_sleep_on_priority(q, task, queue_priority);
+}
+
static void __rpc_sleep_on_priority_timeout(struct rpc_wait_queue *q,
struct rpc_task *task, unsigned long timeout,
unsigned char queue_priority)
{
+ if (WARN_ON_ONCE(RPC_IS_QUEUED(task)))
+ return;
if (time_is_after_jiffies(timeout)) {
- __rpc_sleep_on_priority(q, task, queue_priority);
+ __rpc_do_sleep_on_priority(q, task, queue_priority);
__rpc_add_timer(q, task, timeout);
} else
task->tk_status = -ETIMEDOUT;
@@ -1162,7 +1169,8 @@ static void rpc_release_resources_task(struct rpc_task *task)
{
xprt_release(task);
if (task->tk_msg.rpc_cred) {
- put_cred(task->tk_msg.rpc_cred);
+ if (!(task->tk_flags & RPC_TASK_CRED_NOREF))
+ put_cred(task->tk_msg.rpc_cred);
task->tk_msg.rpc_cred = NULL;
}
rpc_task_release_client(task);
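
The clnt.c and sched.c hunks above are one change: rpc_run_task() marks synchronous tasks RPC_TASK_CRED_NOREF so they simply borrow the caller's credential, and rpc_release_resources_task() only drops a reference it actually took. A minimal sketch of the borrow-versus-own pattern (the flag and struct names here are illustrative):

#include <linux/cred.h>

#define DEMO_CRED_NOREF		0x0001

struct demo_task {
	unsigned short flags;
	const struct cred *cred;
};

static void demo_set_cred(struct demo_task *t, const struct cred *cred)
{
	t->cred = cred;
	if (!(t->flags & DEMO_CRED_NOREF))
		get_cred(t->cred);	/* async: may outlive the caller */
}

static void demo_release_cred(struct demo_task *t)
{
	if (t->cred) {
		if (!(t->flags & DEMO_CRED_NOREF))
			put_cred(t->cred);	/* drop only what we took */
		t->cred = NULL;
	}
}
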
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index e5497dc2475b..15b58c5144f9 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -1235,61 +1235,6 @@ xdr_encode_word(struct xdr_buf *buf, unsigned int base, u32 obj)
}
EXPORT_SYMBOL_GPL(xdr_encode_word);
-/**
- * xdr_buf_read_mic() - obtain the address of the GSS mic from xdr buf
- * @buf: pointer to buffer containing a mic
- * @mic: on success, returns the address of the mic
- * @offset: the offset in buf where mic may be found
- *
- * This function may modify the xdr buf if the mic is found to be straddling
- * a boundary between head, pages, and tail. On success the mic can be read
- * from the address returned. There is no need to free the mic.
- *
- * Return: Success returns 0, otherwise an integer error.
- */
-int xdr_buf_read_mic(struct xdr_buf *buf, struct xdr_netobj *mic, unsigned int offset)
-{
- struct xdr_buf subbuf;
- unsigned int boundary;
-
- if (xdr_decode_word(buf, offset, &mic->len))
- return -EFAULT;
- offset += 4;
-
- /* Is the mic partially in the head? */
- boundary = buf->head[0].iov_len;
- if (offset < boundary && (offset + mic->len) > boundary)
- xdr_shift_buf(buf, boundary - offset);
-
- /* Is the mic partially in the pages? */
- boundary += buf->page_len;
- if (offset < boundary && (offset + mic->len) > boundary)
- xdr_shrink_pagelen(buf, boundary - offset);
-
- if (xdr_buf_subsegment(buf, &subbuf, offset, mic->len))
- return -EFAULT;
-
- /* Is the mic contained entirely in the head? */
- mic->data = subbuf.head[0].iov_base;
- if (subbuf.head[0].iov_len == mic->len)
- return 0;
- /* ..or is the mic contained entirely in the tail? */
- mic->data = subbuf.tail[0].iov_base;
- if (subbuf.tail[0].iov_len == mic->len)
- return 0;
-
- /* Find a contiguous area in @buf to hold all of @mic */
- if (mic->len > buf->buflen - buf->len)
- return -ENOMEM;
- if (buf->tail[0].iov_len != 0)
- mic->data = buf->tail[0].iov_base + buf->tail[0].iov_len;
- else
- mic->data = buf->head[0].iov_base + buf->head[0].iov_len;
- __read_bytes_from_xdr_buf(&subbuf, mic->data, mic->len);
- return 0;
-}
-EXPORT_SYMBOL_GPL(xdr_buf_read_mic);
-
/* Returns 0 on success, or else a negative error code. */
static int
xdr_xcode_array2(struct xdr_buf *buf, unsigned int base,
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
index 1a0ae0c61353..c92c1aac270a 100644
--- a/net/sunrpc/xprtrdma/backchannel.c
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -44,10 +44,10 @@ int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs)
size_t xprt_rdma_bc_maxpayload(struct rpc_xprt *xprt)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
size_t maxmsg;
- maxmsg = min_t(unsigned int, ep->rep_inline_send, ep->rep_inline_recv);
+ maxmsg = min_t(unsigned int, ep->re_inline_send, ep->re_inline_recv);
maxmsg = min_t(unsigned int, maxmsg, PAGE_SIZE);
return maxmsg - RPCRDMA_HDRLEN_MIN;
}
@@ -115,7 +115,7 @@ int xprt_rdma_bc_send_reply(struct rpc_rqst *rqst)
if (rc < 0)
goto failed_marshal;
- if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
+ if (rpcrdma_post_sends(r_xprt, req))
goto drop_connection;
return 0;
@@ -190,7 +190,7 @@ create_req:
if (xprt->bc_alloc_count >= RPCRDMA_BACKWARD_WRS)
return NULL;
- size = min_t(size_t, r_xprt->rx_ep.rep_inline_recv, PAGE_SIZE);
+ size = min_t(size_t, r_xprt->rx_ep->re_inline_recv, PAGE_SIZE);
req = rpcrdma_req_create(r_xprt, size, GFP_KERNEL);
if (!req)
return NULL;
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 125297c9aa3e..ef997880e17a 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -52,7 +52,7 @@
/**
* frwr_release_mr - Destroy one MR
- * @mr: MR allocated by frwr_init_mr
+ * @mr: MR allocated by frwr_mr_init
*
*/
void frwr_release_mr(struct rpcrdma_mr *mr)
@@ -74,7 +74,7 @@ static void frwr_mr_recycle(struct rpcrdma_mr *mr)
if (mr->mr_dir != DMA_NONE) {
trace_xprtrdma_mr_unmap(mr);
- ib_dma_unmap_sg(r_xprt->rx_ia.ri_id->device,
+ ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device,
mr->mr_sg, mr->mr_nents, mr->mr_dir);
mr->mr_dir = DMA_NONE;
}
@@ -106,21 +106,22 @@ void frwr_reset(struct rpcrdma_req *req)
}
/**
- * frwr_init_mr - Initialize one MR
- * @ia: interface adapter
+ * frwr_mr_init - Initialize one MR
+ * @r_xprt: controlling transport instance
* @mr: generic MR to prepare for FRWR
*
* Returns zero if successful. Otherwise a negative errno
* is returned.
*/
-int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
+int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr)
{
- unsigned int depth = ia->ri_max_frwr_depth;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
+ unsigned int depth = ep->re_max_fr_depth;
struct scatterlist *sg;
struct ib_mr *frmr;
int rc;
- frmr = ib_alloc_mr(ia->ri_pd, ia->ri_mrtype, depth);
+ frmr = ib_alloc_mr(ep->re_pd, ep->re_mrtype, depth);
if (IS_ERR(frmr))
goto out_mr_err;
@@ -128,6 +129,7 @@ int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr)
if (!sg)
goto out_list_err;
+ mr->mr_xprt = r_xprt;
mr->frwr.fr_mr = frmr;
mr->mr_dir = DMA_NONE;
INIT_LIST_HEAD(&mr->mr_list);
@@ -149,29 +151,24 @@ out_list_err:
/**
* frwr_query_device - Prepare a transport for use with FRWR
- * @r_xprt: controlling transport instance
+ * @ep: endpoint to fill in
* @device: RDMA device to query
*
* On success, sets:
- * ep->rep_attr
- * ep->rep_max_requests
- * ia->ri_max_rdma_segs
- *
- * And these FRWR-related fields:
- * ia->ri_max_frwr_depth
- * ia->ri_mrtype
+ * ep->re_attr
+ * ep->re_max_requests
+ * ep->re_max_rdma_segs
+ * ep->re_max_fr_depth
+ * ep->re_mrtype
*
* Return values:
* On success, returns zero.
* %-EINVAL - the device does not support FRWR memory registration
* %-ENOMEM - the device is not sufficiently capable for NFS/RDMA
*/
-int frwr_query_device(struct rpcrdma_xprt *r_xprt,
- const struct ib_device *device)
+int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device)
{
const struct ib_device_attr *attrs = &device->attrs;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
int max_qp_wr, depth, delta;
unsigned int max_sge;
@@ -188,23 +185,23 @@ int frwr_query_device(struct rpcrdma_xprt *r_xprt,
pr_err("rpcrdma: HCA provides only %u send SGEs\n", max_sge);
return -ENOMEM;
}
- ep->rep_attr.cap.max_send_sge = max_sge;
- ep->rep_attr.cap.max_recv_sge = 1;
+ ep->re_attr.cap.max_send_sge = max_sge;
+ ep->re_attr.cap.max_recv_sge = 1;
- ia->ri_mrtype = IB_MR_TYPE_MEM_REG;
+ ep->re_mrtype = IB_MR_TYPE_MEM_REG;
if (attrs->device_cap_flags & IB_DEVICE_SG_GAPS_REG)
- ia->ri_mrtype = IB_MR_TYPE_SG_GAPS;
+ ep->re_mrtype = IB_MR_TYPE_SG_GAPS;
/* Quirk: Some devices advertise a large max_fast_reg_page_list_len
* capability, but perform optimally when the MRs are not larger
* than a page.
*/
if (attrs->max_sge_rd > RPCRDMA_MAX_HDR_SEGS)
- ia->ri_max_frwr_depth = attrs->max_sge_rd;
+ ep->re_max_fr_depth = attrs->max_sge_rd;
else
- ia->ri_max_frwr_depth = attrs->max_fast_reg_page_list_len;
- if (ia->ri_max_frwr_depth > RPCRDMA_MAX_DATA_SEGS)
- ia->ri_max_frwr_depth = RPCRDMA_MAX_DATA_SEGS;
+ ep->re_max_fr_depth = attrs->max_fast_reg_page_list_len;
+ if (ep->re_max_fr_depth > RPCRDMA_MAX_DATA_SEGS)
+ ep->re_max_fr_depth = RPCRDMA_MAX_DATA_SEGS;
/* Add room for frwr register and invalidate WRs.
* 1. FRWR reg WR for head
@@ -220,11 +217,11 @@ int frwr_query_device(struct rpcrdma_xprt *r_xprt,
/* Calculate N if the device max FRWR depth is smaller than
* RPCRDMA_MAX_DATA_SEGS.
*/
- if (ia->ri_max_frwr_depth < RPCRDMA_MAX_DATA_SEGS) {
- delta = RPCRDMA_MAX_DATA_SEGS - ia->ri_max_frwr_depth;
+ if (ep->re_max_fr_depth < RPCRDMA_MAX_DATA_SEGS) {
+ delta = RPCRDMA_MAX_DATA_SEGS - ep->re_max_fr_depth;
do {
depth += 2; /* FRWR reg + invalidate */
- delta -= ia->ri_max_frwr_depth;
+ delta -= ep->re_max_fr_depth;
} while (delta > 0);
}
@@ -233,34 +230,34 @@ int frwr_query_device(struct rpcrdma_xprt *r_xprt,
max_qp_wr -= 1;
if (max_qp_wr < RPCRDMA_MIN_SLOT_TABLE)
return -ENOMEM;
- if (ep->rep_max_requests > max_qp_wr)
- ep->rep_max_requests = max_qp_wr;
- ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth;
- if (ep->rep_attr.cap.max_send_wr > max_qp_wr) {
- ep->rep_max_requests = max_qp_wr / depth;
- if (!ep->rep_max_requests)
+ if (ep->re_max_requests > max_qp_wr)
+ ep->re_max_requests = max_qp_wr;
+ ep->re_attr.cap.max_send_wr = ep->re_max_requests * depth;
+ if (ep->re_attr.cap.max_send_wr > max_qp_wr) {
+ ep->re_max_requests = max_qp_wr / depth;
+ if (!ep->re_max_requests)
return -ENOMEM;
- ep->rep_attr.cap.max_send_wr = ep->rep_max_requests * depth;
+ ep->re_attr.cap.max_send_wr = ep->re_max_requests * depth;
}
- ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
- ep->rep_attr.cap.max_send_wr += 1; /* for ib_drain_sq */
- ep->rep_attr.cap.max_recv_wr = ep->rep_max_requests;
- ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
- ep->rep_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */
-
- ia->ri_max_rdma_segs =
- DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ia->ri_max_frwr_depth);
+ ep->re_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
+ ep->re_attr.cap.max_send_wr += 1; /* for ib_drain_sq */
+ ep->re_attr.cap.max_recv_wr = ep->re_max_requests;
+ ep->re_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
+ ep->re_attr.cap.max_recv_wr += 1; /* for ib_drain_rq */
+
+ ep->re_max_rdma_segs =
+ DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, ep->re_max_fr_depth);
/* Reply chunks require segments for head and tail buffers */
- ia->ri_max_rdma_segs += 2;
- if (ia->ri_max_rdma_segs > RPCRDMA_MAX_HDR_SEGS)
- ia->ri_max_rdma_segs = RPCRDMA_MAX_HDR_SEGS;
+ ep->re_max_rdma_segs += 2;
+ if (ep->re_max_rdma_segs > RPCRDMA_MAX_HDR_SEGS)
+ ep->re_max_rdma_segs = RPCRDMA_MAX_HDR_SEGS;
/* Ensure the underlying device is capable of conveying the
* largest r/wsize NFS will ask for. This guarantees that
* failing over from one RDMA device to another will not
* break NFS I/O.
*/
- if ((ia->ri_max_rdma_segs * ia->ri_max_frwr_depth) < RPCRDMA_MAX_SEGS)
+ if ((ep->re_max_rdma_segs * ep->re_max_fr_depth) < RPCRDMA_MAX_SEGS)
return -ENOMEM;
return 0;
@@ -286,14 +283,14 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
int nsegs, bool writing, __be32 xid,
struct rpcrdma_mr *mr)
{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
struct ib_reg_wr *reg_wr;
int i, n, dma_nents;
struct ib_mr *ibmr;
u8 key;
- if (nsegs > ia->ri_max_frwr_depth)
- nsegs = ia->ri_max_frwr_depth;
+ if (nsegs > ep->re_max_fr_depth)
+ nsegs = ep->re_max_fr_depth;
for (i = 0; i < nsegs;) {
if (seg->mr_page)
sg_set_page(&mr->mr_sg[i],
@@ -306,7 +303,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
++seg;
++i;
- if (ia->ri_mrtype == IB_MR_TYPE_SG_GAPS)
+ if (ep->re_mrtype == IB_MR_TYPE_SG_GAPS)
continue;
if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
@@ -315,7 +312,7 @@ struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
mr->mr_dir = rpcrdma_data_dir(writing);
mr->mr_nents = i;
- dma_nents = ib_dma_map_sg(ia->ri_id->device, mr->mr_sg, mr->mr_nents,
+ dma_nents = ib_dma_map_sg(ep->re_id->device, mr->mr_sg, mr->mr_nents,
mr->mr_dir);
if (!dma_nents)
goto out_dmamap_err;
@@ -356,8 +353,8 @@ out_mapmr_err:
/**
* frwr_wc_fastreg - Invoked by RDMA provider for a flushed FastReg WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
+ * @cq: completion queue
+ * @wc: WCE for a completed FastReg WR
*
*/
static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
@@ -369,20 +366,25 @@ static void frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
/* WARNING: Only wr_cqe and status are reliable at this point */
trace_xprtrdma_wc_fastreg(wc, frwr);
/* The MR will get recycled when the associated req is retransmitted */
+
+ rpcrdma_flush_disconnect(cq, wc);
}
/**
- * frwr_send - post Send WR containing the RPC Call message
- * @ia: interface adapter
- * @req: Prepared RPC Call
+ * frwr_send - post Send WRs containing the RPC Call message
+ * @r_xprt: controlling transport instance
+ * @req: prepared RPC Call
*
* For FRWR, chain any FastReg WRs to the Send WR. Only a
* single ib_post_send call is needed to register memory
* and then post the Send WR.
*
- * Returns the result of ib_post_send.
+ * Returns the return code from ib_post_send.
+ *
+ * Caller must hold the transport send lock to ensure that the
+ * pointers to the transport's rdma_cm_id and QP are stable.
*/
-int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
+int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
struct ib_send_wr *post_wr;
struct rpcrdma_mr *mr;
@@ -403,7 +405,7 @@ int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
post_wr = &frwr->fr_regwr.wr;
}
- return ib_post_send(ia->ri_id->qp, post_wr, NULL);
+ return ib_post_send(r_xprt->rx_ep->re_id->qp, post_wr, NULL);
}
/**
@@ -419,7 +421,7 @@ void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs)
list_for_each_entry(mr, mrs, mr_list)
if (mr->mr_handle == rep->rr_inv_rkey) {
list_del_init(&mr->mr_list);
- trace_xprtrdma_mr_remoteinv(mr);
+ trace_xprtrdma_mr_reminv(mr);
rpcrdma_mr_put(mr);
break; /* only one invalidated MR per RPC */
}
@@ -435,8 +437,8 @@ static void __frwr_release_mr(struct ib_wc *wc, struct rpcrdma_mr *mr)
/**
* frwr_wc_localinv - Invoked by RDMA provider for a LOCAL_INV WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
+ * @cq: completion queue
+ * @wc: WCE for a completed LocalInv WR
*
*/
static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
@@ -449,12 +451,14 @@ static void frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
/* WARNING: Only wr_cqe and status are reliable at this point */
trace_xprtrdma_wc_li(wc, frwr);
__frwr_release_mr(wc, mr);
+
+ rpcrdma_flush_disconnect(cq, wc);
}
/**
* frwr_wc_localinv_wake - Invoked by RDMA provider for a LOCAL_INV WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
+ * @cq: completion queue
+ * @wc: WCE for a completed LocalInv WR
*
* Awaken anyone waiting for an MR to finish being fenced.
*/
@@ -469,6 +473,8 @@ static void frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
trace_xprtrdma_wc_li_wake(wc, frwr);
__frwr_release_mr(wc, mr);
complete(&frwr->fr_linv_done);
+
+ rpcrdma_flush_disconnect(cq, wc);
}
/**
@@ -526,10 +532,10 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
/* Transport disconnect drains the receive CQ before it
* replaces the QP. The RPC reply handler won't call us
- * unless ri_id->qp is a valid pointer.
+ * unless re_id->qp is a valid pointer.
*/
bad_wr = NULL;
- rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr);
+ rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr);
/* The final LOCAL_INV WR in the chain is supposed to
* do the wake. If it was never posted, the wake will
@@ -556,8 +562,8 @@ void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
/**
* frwr_wc_localinv_done - Invoked by RDMA provider for a signaled LOCAL_INV WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
+ * @cq: completion queue
+ * @wc: WCE for a completed LocalInv WR
*
*/
static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
@@ -575,6 +581,8 @@ static void frwr_wc_localinv_done(struct ib_cq *cq, struct ib_wc *wc)
/* Ensure @rep is generated before __frwr_release_mr */
smp_rmb();
rpcrdma_complete_rqst(rep);
+
+ rpcrdma_flush_disconnect(cq, wc);
}
/**
@@ -629,10 +637,10 @@ void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
/* Transport disconnect drains the receive CQ before it
* replaces the QP. The RPC reply handler won't call us
- * unless ri_id->qp is a valid pointer.
+ * unless re_id->qp is a valid pointer.
*/
bad_wr = NULL;
- rc = ib_post_send(r_xprt->rx_ia.ri_id->qp, first, &bad_wr);
+ rc = ib_post_send(r_xprt->rx_ep->re_id->qp, first, &bad_wr);
if (!rc)
return;
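
frwr_query_device() above derives the transport's segment budget from the device's FRWR depth: DIV_ROUND_UP(RPCRDMA_MAX_DATA_SEGS, re_max_fr_depth), plus two segments for the Reply chunk's head and tail buffers, capped at RPCRDMA_MAX_HDR_SEGS. A tiny stand-alone calculation with made-up constants (the real values live in xprtrdma's headers):

#include <stdio.h>

/* Illustrative values only; see net/sunrpc/xprtrdma/xprt_rdma.h. */
#define MAX_DATA_SEGS	256
#define MAX_HDR_SEGS	16

int main(void)
{
	unsigned int max_fr_depth = 30;	/* pretend HCA FRWR depth */
	unsigned int segs;

	segs = (MAX_DATA_SEGS + max_fr_depth - 1) / max_fr_depth; /* 9 */
	segs += 2;			/* Reply chunk head + tail */
	if (segs > MAX_HDR_SEGS)
		segs = MAX_HDR_SEGS;
	printf("re_max_rdma_segs = %u\n", segs);
	return 0;
}
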
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index 577513b7642e..4a81e6995d3e 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -103,21 +103,20 @@ static unsigned int rpcrdma_max_reply_header_size(unsigned int maxsegs)
/**
* rpcrdma_set_max_header_sizes - Initialize inline payload sizes
- * @r_xprt: transport instance to initialize
+ * @ep: endpoint to initialize
*
* The max_inline fields contain the maximum size of an RPC message
* so the marshaling code doesn't have to repeat this calculation
* for every RPC.
*/
-void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *r_xprt)
+void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep)
{
- unsigned int maxsegs = r_xprt->rx_ia.ri_max_rdma_segs;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ unsigned int maxsegs = ep->re_max_rdma_segs;
- ep->rep_max_inline_send =
- ep->rep_inline_send - rpcrdma_max_call_header_size(maxsegs);
- ep->rep_max_inline_recv =
- ep->rep_inline_recv - rpcrdma_max_reply_header_size(maxsegs);
+ ep->re_max_inline_send =
+ ep->re_inline_send - rpcrdma_max_call_header_size(maxsegs);
+ ep->re_max_inline_recv =
+ ep->re_inline_recv - rpcrdma_max_reply_header_size(maxsegs);
}
/* The client can send a request inline as long as the RPCRDMA header
@@ -132,9 +131,10 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
struct rpc_rqst *rqst)
{
struct xdr_buf *xdr = &rqst->rq_snd_buf;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
unsigned int count, remaining, offset;
- if (xdr->len > r_xprt->rx_ep.rep_max_inline_send)
+ if (xdr->len > ep->re_max_inline_send)
return false;
if (xdr->page_len) {
@@ -145,7 +145,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
remaining -= min_t(unsigned int,
PAGE_SIZE - offset, remaining);
offset = 0;
- if (++count > r_xprt->rx_ep.rep_attr.cap.max_send_sge)
+ if (++count > ep->re_attr.cap.max_send_sge)
return false;
}
}
@@ -162,7 +162,7 @@ static bool rpcrdma_args_inline(struct rpcrdma_xprt *r_xprt,
static bool rpcrdma_results_inline(struct rpcrdma_xprt *r_xprt,
struct rpc_rqst *rqst)
{
- return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep.rep_max_inline_recv;
+ return rqst->rq_rcv_buf.buflen <= r_xprt->rx_ep->re_max_inline_recv;
}
/* The client is required to provide a Reply chunk if the maximum
@@ -176,7 +176,7 @@ rpcrdma_nonpayload_inline(const struct rpcrdma_xprt *r_xprt,
const struct xdr_buf *buf = &rqst->rq_rcv_buf;
return (buf->head[0].iov_len + buf->tail[0].iov_len) <
- r_xprt->rx_ep.rep_max_inline_recv;
+ r_xprt->rx_ep->re_max_inline_recv;
}
/* Split @vec on page boundaries into SGEs. FMR registers pages, not
@@ -255,7 +255,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
/* When encoding a Read chunk, the tail iovec contains an
* XDR pad and may be omitted.
*/
- if (type == rpcrdma_readch && r_xprt->rx_ia.ri_implicit_roundup)
+ if (type == rpcrdma_readch && r_xprt->rx_ep->re_implicit_roundup)
goto out;
/* When encoding a Write chunk, some servers need to see an
@@ -263,7 +263,7 @@ rpcrdma_convert_iovs(struct rpcrdma_xprt *r_xprt, struct xdr_buf *xdrbuf,
* layer provides space in the tail iovec that may be used
* for this purpose.
*/
- if (type == rpcrdma_writech && r_xprt->rx_ia.ri_implicit_roundup)
+ if (type == rpcrdma_writech && r_xprt->rx_ep->re_implicit_roundup)
goto out;
if (xdrbuf->tail[0].iov_len)
@@ -1450,8 +1450,8 @@ void rpcrdma_reply_handler(struct rpcrdma_rep *rep)
if (credits == 0)
credits = 1; /* don't deadlock */
- else if (credits > r_xprt->rx_ep.rep_max_requests)
- credits = r_xprt->rx_ep.rep_max_requests;
+ else if (credits > r_xprt->rx_ep->re_max_requests)
+ credits = r_xprt->rx_ep->re_max_requests;
if (buf->rb_credits != credits)
rpcrdma_update_cwnd(r_xprt, credits);
rpcrdma_post_recvs(r_xprt, false);
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 3cfeba68ee9a..659da37020a4 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -240,9 +240,10 @@ xprt_rdma_connect_worker(struct work_struct *work)
struct rpc_xprt *xprt = &r_xprt->rx_xprt;
int rc;
- rc = rpcrdma_ep_connect(&r_xprt->rx_ep, &r_xprt->rx_ia);
+ rc = rpcrdma_xprt_connect(r_xprt);
xprt_clear_connecting(xprt);
- if (r_xprt->rx_ep.rep_connected > 0) {
+ if (r_xprt->rx_ep && r_xprt->rx_ep->re_connect_status > 0) {
+ xprt->connect_cookie++;
xprt->stat.connect_count++;
xprt->stat.connect_time += (long)jiffies -
xprt->stat.connect_start;
@@ -265,7 +266,7 @@ xprt_rdma_inject_disconnect(struct rpc_xprt *xprt)
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
trace_xprtrdma_op_inject_dsc(r_xprt);
- rdma_disconnect(r_xprt->rx_ia.ri_id);
+ rdma_disconnect(r_xprt->rx_ep->re_id);
}
/**
@@ -284,9 +285,8 @@ xprt_rdma_destroy(struct rpc_xprt *xprt)
cancel_delayed_work_sync(&r_xprt->rx_connect_worker);
- rpcrdma_ep_destroy(r_xprt);
+ rpcrdma_xprt_disconnect(r_xprt);
rpcrdma_buffer_destroy(&r_xprt->rx_buf);
- rpcrdma_ia_close(&r_xprt->rx_ia);
xprt_rdma_free_addresses(xprt);
xprt_free(xprt);
@@ -316,10 +316,15 @@ xprt_setup_rdma(struct xprt_create *args)
if (args->addrlen > sizeof(xprt->addr))
return ERR_PTR(-EBADF);
+ if (!try_module_get(THIS_MODULE))
+ return ERR_PTR(-EIO);
+
xprt = xprt_alloc(args->net, sizeof(struct rpcrdma_xprt), 0,
xprt_rdma_slot_table_entries);
- if (!xprt)
+ if (!xprt) {
+ module_put(THIS_MODULE);
return ERR_PTR(-ENOMEM);
+ }
xprt->timeout = &xprt_rdma_default_timeout;
xprt->connect_timeout = xprt->timeout->to_initval;
@@ -347,23 +352,17 @@ xprt_setup_rdma(struct xprt_create *args)
xprt_rdma_format_addresses(xprt, sap);
new_xprt = rpcx_to_rdmax(xprt);
- rc = rpcrdma_ia_open(new_xprt);
- if (rc)
- goto out1;
-
- rc = rpcrdma_ep_create(new_xprt);
- if (rc)
- goto out2;
-
rc = rpcrdma_buffer_create(new_xprt);
- if (rc)
- goto out3;
-
- if (!try_module_get(THIS_MODULE))
- goto out4;
+ if (rc) {
+ xprt_rdma_free_addresses(xprt);
+ xprt_free(xprt);
+ module_put(THIS_MODULE);
+ return ERR_PTR(rc);
+ }
INIT_DELAYED_WORK(&new_xprt->rx_connect_worker,
xprt_rdma_connect_worker);
+
xprt->max_payload = RPCRDMA_MAX_DATA_SEGS << PAGE_SHIFT;
dprintk("RPC: %s: %s:%s\n", __func__,
@@ -371,19 +370,6 @@ xprt_setup_rdma(struct xprt_create *args)
xprt->address_strings[RPC_DISPLAY_PORT]);
trace_xprtrdma_create(new_xprt);
return xprt;
-
-out4:
- rpcrdma_buffer_destroy(&new_xprt->rx_buf);
- rc = -ENODEV;
-out3:
- rpcrdma_ep_destroy(new_xprt);
-out2:
- rpcrdma_ia_close(&new_xprt->rx_ia);
-out1:
- trace_xprtrdma_op_destroy(new_xprt);
- xprt_rdma_free_addresses(xprt);
- xprt_free(xprt);
- return ERR_PTR(rc);
}
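
The reshuffled setup path above takes the module reference first and makes each failure branch undo exactly what has been done so far, which is what lets the old out1..out4 unwind ladder go away. A minimal sketch of that shape (names are illustrative):

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/err.h>

struct demo_xprt {
	int dummy;
};

static struct demo_xprt *demo_setup(void)
{
	struct demo_xprt *x;

	if (!try_module_get(THIS_MODULE))
		return ERR_PTR(-EIO);

	x = kzalloc(sizeof(*x), GFP_KERNEL);
	if (!x) {
		module_put(THIS_MODULE);	/* undo the only step taken */
		return ERR_PTR(-ENOMEM);
	}
	return x;		/* module ref released at teardown */
}
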
/**
@@ -398,26 +384,11 @@ out1:
void xprt_rdma_close(struct rpc_xprt *xprt)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-
- might_sleep();
trace_xprtrdma_op_close(r_xprt);
- /* Prevent marshaling and sending of new requests */
- xprt_clear_connected(xprt);
-
- if (test_and_clear_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags)) {
- rpcrdma_ia_remove(ia);
- goto out;
- }
-
- if (ep->rep_connected == -ENODEV)
- return;
- rpcrdma_ep_disconnect(ep, ia);
+ rpcrdma_xprt_disconnect(r_xprt);
-out:
xprt->reestablish_timeout = 0;
++xprt->connect_cookie;
xprt_disconnect_done(xprt);
@@ -517,10 +488,11 @@ static void
xprt_rdma_connect(struct rpc_xprt *xprt, struct rpc_task *task)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
unsigned long delay;
delay = 0;
- if (r_xprt->rx_ep.rep_connected != 0) {
+ if (ep && ep->re_connect_status != 0) {
delay = xprt_reconnect_delay(xprt);
xprt_reconnect_backoff(xprt, RPCRDMA_INIT_REEST_TO);
}
@@ -694,7 +666,7 @@ xprt_rdma_send_request(struct rpc_rqst *rqst)
goto drop_connection;
rqst->rq_xtime = ktime_get();
- if (rpcrdma_ep_post(&r_xprt->rx_ia, &r_xprt->rx_ep, req))
+ if (rpcrdma_post_sends(r_xprt, req))
goto drop_connection;
rqst->rq_xmit_bytes_sent += rqst->rq_snd_buf.len;
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 353f61ac8d51..cdd84c09df10 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -84,6 +84,7 @@ static void rpcrdma_rep_destroy(struct rpcrdma_rep *rep);
static void rpcrdma_reps_unmap(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt);
static void rpcrdma_mrs_destroy(struct rpcrdma_xprt *r_xprt);
+static int rpcrdma_ep_destroy(struct rpcrdma_ep *ep);
static struct rpcrdma_regbuf *
rpcrdma_regbuf_alloc(size_t size, enum dma_data_direction direction,
gfp_t flags);
@@ -96,17 +97,17 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb);
*/
static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt)
{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rdma_cm_id *id = r_xprt->rx_ep->re_id;
/* Flush Receives, then wait for deferred Reply work
* to complete.
*/
- ib_drain_rq(ia->ri_id->qp);
+ ib_drain_rq(id->qp);
/* Deferred Reply processing might have scheduled
* local invalidations.
*/
- ib_drain_sq(ia->ri_id->qp);
+ ib_drain_sq(id->qp);
}
/**
@@ -115,26 +116,43 @@ static void rpcrdma_xprt_drain(struct rpcrdma_xprt *r_xprt)
* @context: ep that owns QP where event occurred
*
* Called from the RDMA provider (device driver) possibly in an interrupt
- * context.
+ * context. The QP is always destroyed before the ID, so the ID will be
+ * reliably available when this handler is invoked.
*/
-static void
-rpcrdma_qp_event_handler(struct ib_event *event, void *context)
+static void rpcrdma_qp_event_handler(struct ib_event *event, void *context)
{
struct rpcrdma_ep *ep = context;
- struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt,
- rx_ep);
- trace_xprtrdma_qp_event(r_xprt, event);
+ trace_xprtrdma_qp_event(ep, event);
+}
+
+/**
+ * rpcrdma_flush_disconnect - Disconnect on flushed completion
+ * @cq: completion queue
+ * @wc: work completion entry
+ *
+ * Must be called in process context.
+ */
+void rpcrdma_flush_disconnect(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct rpcrdma_xprt *r_xprt = cq->cq_context;
+ struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+
+ if (wc->status != IB_WC_SUCCESS &&
+ r_xprt->rx_ep->re_connect_status == 1) {
+ r_xprt->rx_ep->re_connect_status = -ECONNABORTED;
+ trace_xprtrdma_flush_dct(r_xprt, wc->status);
+ xprt_force_disconnect(xprt);
+ }
}
/**
* rpcrdma_wc_send - Invoked by RDMA provider for each polled Send WC
* @cq: completion queue
- * @wc: completed WR
+ * @wc: WCE for a completed Send WR
*
*/
-static void
-rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
+static void rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
{
struct ib_cqe *cqe = wc->wr_cqe;
struct rpcrdma_sendctx *sc =
@@ -143,25 +161,25 @@ rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
/* WARNING: Only wr_cqe and status are reliable at this point */
trace_xprtrdma_wc_send(sc, wc);
rpcrdma_sendctx_put_locked((struct rpcrdma_xprt *)cq->cq_context, sc);
+ rpcrdma_flush_disconnect(cq, wc);
}
/**
* rpcrdma_wc_receive - Invoked by RDMA provider for each polled Receive WC
- * @cq: completion queue (ignored)
- * @wc: completed WR
+ * @cq: completion queue
+ * @wc: WCE for a completed Receive WR
*
*/
-static void
-rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
+static void rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
{
struct ib_cqe *cqe = wc->wr_cqe;
struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep,
rr_cqe);
- struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
+ struct rpcrdma_xprt *r_xprt = cq->cq_context;
/* WARNING: Only wr_cqe and status are reliable at this point */
trace_xprtrdma_wc_receive(wc);
- --r_xprt->rx_ep.rep_receive_count;
+ --r_xprt->rx_ep->re_receive_count;
if (wc->status != IB_WC_SUCCESS)
goto out_flushed;
@@ -178,35 +196,35 @@ rpcrdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
return;
out_flushed:
+ rpcrdma_flush_disconnect(cq, wc);
rpcrdma_rep_destroy(rep);
}
-static void rpcrdma_update_cm_private(struct rpcrdma_xprt *r_xprt,
+static void rpcrdma_update_cm_private(struct rpcrdma_ep *ep,
struct rdma_conn_param *param)
{
const struct rpcrdma_connect_private *pmsg = param->private_data;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
unsigned int rsize, wsize;
/* Default settings for RPC-over-RDMA Version One */
- r_xprt->rx_ia.ri_implicit_roundup = xprt_rdma_pad_optimize;
+ ep->re_implicit_roundup = xprt_rdma_pad_optimize;
rsize = RPCRDMA_V1_DEF_INLINE_SIZE;
wsize = RPCRDMA_V1_DEF_INLINE_SIZE;
if (pmsg &&
pmsg->cp_magic == rpcrdma_cmp_magic &&
pmsg->cp_version == RPCRDMA_CMP_VERSION) {
- r_xprt->rx_ia.ri_implicit_roundup = true;
+ ep->re_implicit_roundup = true;
rsize = rpcrdma_decode_buffer_size(pmsg->cp_send_size);
wsize = rpcrdma_decode_buffer_size(pmsg->cp_recv_size);
}
- if (rsize < ep->rep_inline_recv)
- ep->rep_inline_recv = rsize;
- if (wsize < ep->rep_inline_send)
- ep->rep_inline_send = wsize;
+ if (rsize < ep->re_inline_recv)
+ ep->re_inline_recv = rsize;
+ if (wsize < ep->re_inline_send)
+ ep->re_inline_send = wsize;
- rpcrdma_set_max_header_sizes(r_xprt);
+ rpcrdma_set_max_header_sizes(ep);
}
/**
@@ -220,116 +238,103 @@ static void rpcrdma_update_cm_private(struct rpcrdma_xprt *r_xprt,
static int
rpcrdma_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
{
- struct rpcrdma_xprt *r_xprt = id->context;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
- struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+ struct sockaddr *sap = (struct sockaddr *)&id->route.addr.dst_addr;
+ struct rpcrdma_ep *ep = id->context;
+ struct rpc_xprt *xprt = ep->re_xprt;
might_sleep();
- trace_xprtrdma_cm_event(r_xprt, event);
switch (event->event) {
case RDMA_CM_EVENT_ADDR_RESOLVED:
case RDMA_CM_EVENT_ROUTE_RESOLVED:
- ia->ri_async_rc = 0;
- complete(&ia->ri_done);
+ ep->re_async_rc = 0;
+ complete(&ep->re_done);
return 0;
case RDMA_CM_EVENT_ADDR_ERROR:
- ia->ri_async_rc = -EPROTO;
- complete(&ia->ri_done);
+ ep->re_async_rc = -EPROTO;
+ complete(&ep->re_done);
return 0;
case RDMA_CM_EVENT_ROUTE_ERROR:
- ia->ri_async_rc = -ENETUNREACH;
- complete(&ia->ri_done);
+ ep->re_async_rc = -ENETUNREACH;
+ complete(&ep->re_done);
return 0;
case RDMA_CM_EVENT_DEVICE_REMOVAL:
-#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
- pr_info("rpcrdma: removing device %s for %s:%s\n",
- ia->ri_id->device->name,
- rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt));
-#endif
- init_completion(&ia->ri_remove_done);
- set_bit(RPCRDMA_IAF_REMOVING, &ia->ri_flags);
- ep->rep_connected = -ENODEV;
+ pr_info("rpcrdma: removing device %s for %pISpc\n",
+ ep->re_id->device->name, sap);
+ /* fall through */
+ case RDMA_CM_EVENT_ADDR_CHANGE:
+ ep->re_connect_status = -ENODEV;
xprt_force_disconnect(xprt);
- wait_for_completion(&ia->ri_remove_done);
-
- ia->ri_id = NULL;
- /* Return 1 to ensure the core destroys the id. */
- return 1;
+ goto disconnected;
case RDMA_CM_EVENT_ESTABLISHED:
- ++xprt->connect_cookie;
- ep->rep_connected = 1;
- rpcrdma_update_cm_private(r_xprt, &event->param.conn);
- trace_xprtrdma_inline_thresh(r_xprt);
- wake_up_all(&ep->rep_connect_wait);
+ kref_get(&ep->re_kref);
+ ep->re_connect_status = 1;
+ rpcrdma_update_cm_private(ep, &event->param.conn);
+ trace_xprtrdma_inline_thresh(ep);
+ wake_up_all(&ep->re_connect_wait);
break;
case RDMA_CM_EVENT_CONNECT_ERROR:
- ep->rep_connected = -ENOTCONN;
+ ep->re_connect_status = -ENOTCONN;
goto disconnected;
case RDMA_CM_EVENT_UNREACHABLE:
- ep->rep_connected = -ENETUNREACH;
+ ep->re_connect_status = -ENETUNREACH;
goto disconnected;
case RDMA_CM_EVENT_REJECTED:
- dprintk("rpcrdma: connection to %s:%s rejected: %s\n",
- rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt),
- rdma_reject_msg(id, event->status));
- ep->rep_connected = -ECONNREFUSED;
+ dprintk("rpcrdma: connection to %pISpc rejected: %s\n",
+ sap, rdma_reject_msg(id, event->status));
+ ep->re_connect_status = -ECONNREFUSED;
if (event->status == IB_CM_REJ_STALE_CONN)
- ep->rep_connected = -EAGAIN;
+ ep->re_connect_status = -EAGAIN;
goto disconnected;
case RDMA_CM_EVENT_DISCONNECTED:
- ep->rep_connected = -ECONNABORTED;
+ ep->re_connect_status = -ECONNABORTED;
disconnected:
- xprt_force_disconnect(xprt);
- wake_up_all(&ep->rep_connect_wait);
- break;
+ return rpcrdma_ep_destroy(ep);
default:
break;
}
- dprintk("RPC: %s: %s:%s on %s/frwr: %s\n", __func__,
- rpcrdma_addrstr(r_xprt), rpcrdma_portstr(r_xprt),
- ia->ri_id->device->name, rdma_event_msg(event->event));
+ dprintk("RPC: %s: %pISpc on %s/frwr: %s\n", __func__, sap,
+ ep->re_id->device->name, rdma_event_msg(event->event));
return 0;
}
-static struct rdma_cm_id *
-rpcrdma_create_id(struct rpcrdma_xprt *xprt, struct rpcrdma_ia *ia)
+static struct rdma_cm_id *rpcrdma_create_id(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_ep *ep)
{
unsigned long wtimeout = msecs_to_jiffies(RDMA_RESOLVE_TIMEOUT) + 1;
+ struct rpc_xprt *xprt = &r_xprt->rx_xprt;
struct rdma_cm_id *id;
int rc;
- init_completion(&ia->ri_done);
+ init_completion(&ep->re_done);
- id = rdma_create_id(xprt->rx_xprt.xprt_net, rpcrdma_cm_event_handler,
- xprt, RDMA_PS_TCP, IB_QPT_RC);
+ id = rdma_create_id(xprt->xprt_net, rpcrdma_cm_event_handler, ep,
+ RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(id))
return id;
- ia->ri_async_rc = -ETIMEDOUT;
- rc = rdma_resolve_addr(id, NULL,
- (struct sockaddr *)&xprt->rx_xprt.addr,
+ ep->re_async_rc = -ETIMEDOUT;
+ rc = rdma_resolve_addr(id, NULL, (struct sockaddr *)&xprt->addr,
RDMA_RESOLVE_TIMEOUT);
if (rc)
goto out;
- rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
+ rc = wait_for_completion_interruptible_timeout(&ep->re_done, wtimeout);
if (rc < 0)
goto out;
- rc = ia->ri_async_rc;
+ rc = ep->re_async_rc;
if (rc)
goto out;
- ia->ri_async_rc = -ETIMEDOUT;
+ ep->re_async_rc = -ETIMEDOUT;
rc = rdma_resolve_route(id, RDMA_RESOLVE_TIMEOUT);
if (rc)
goto out;
- rc = wait_for_completion_interruptible_timeout(&ia->ri_done, wtimeout);
+ rc = wait_for_completion_interruptible_timeout(&ep->re_done, wtimeout);
if (rc < 0)
goto out;
- rc = ia->ri_async_rc;
+ rc = ep->re_async_rc;
if (rc)
goto out;
@@ -340,356 +345,181 @@ out:
return ERR_PTR(rc);
}
-/*
- * Exported functions.
- */
-
-/**
- * rpcrdma_ia_open - Open and initialize an Interface Adapter.
- * @xprt: transport with IA to (re)initialize
- *
- * Returns 0 on success, negative errno if an appropriate
- * Interface Adapter could not be found and opened.
- */
-int
-rpcrdma_ia_open(struct rpcrdma_xprt *xprt)
+static void rpcrdma_ep_put(struct kref *kref)
{
- struct rpcrdma_ia *ia = &xprt->rx_ia;
- int rc;
+ struct rpcrdma_ep *ep = container_of(kref, struct rpcrdma_ep, re_kref);
- ia->ri_id = rpcrdma_create_id(xprt, ia);
- if (IS_ERR(ia->ri_id)) {
- rc = PTR_ERR(ia->ri_id);
- goto out_err;
+ if (ep->re_id->qp) {
+ rdma_destroy_qp(ep->re_id);
+ ep->re_id->qp = NULL;
}
- ia->ri_pd = ib_alloc_pd(ia->ri_id->device, 0);
- if (IS_ERR(ia->ri_pd)) {
- rc = PTR_ERR(ia->ri_pd);
- pr_err("rpcrdma: ib_alloc_pd() returned %d\n", rc);
- goto out_err;
- }
+ if (ep->re_attr.recv_cq)
+ ib_free_cq(ep->re_attr.recv_cq);
+ ep->re_attr.recv_cq = NULL;
+ if (ep->re_attr.send_cq)
+ ib_free_cq(ep->re_attr.send_cq);
+ ep->re_attr.send_cq = NULL;
- return 0;
+ if (ep->re_pd)
+ ib_dealloc_pd(ep->re_pd);
+ ep->re_pd = NULL;
-out_err:
- rpcrdma_ia_close(ia);
- return rc;
+ kfree(ep);
+ module_put(THIS_MODULE);
}
-/**
- * rpcrdma_ia_remove - Handle device driver unload
- * @ia: interface adapter being removed
- *
- * Divest transport H/W resources associated with this adapter,
- * but allow it to be restored later.
- *
- * Caller must hold the transport send lock.
+/* Returns:
+ * %0 if @ep still has a positive kref count, or
+ * %1 if @ep was destroyed successfully.
*/
-void
-rpcrdma_ia_remove(struct rpcrdma_ia *ia)
+static int rpcrdma_ep_destroy(struct rpcrdma_ep *ep)
{
- struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
- rx_ia);
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
-
- /* This is similar to rpcrdma_ep_destroy, but:
- * - Don't cancel the connect worker.
- * - Don't call rpcrdma_ep_disconnect, which waits
- * for another conn upcall, which will deadlock.
- * - rdma_disconnect is unneeded, the underlying
- * connection is already gone.
- */
- if (ia->ri_id->qp) {
- rpcrdma_xprt_drain(r_xprt);
- rdma_destroy_qp(ia->ri_id);
- ia->ri_id->qp = NULL;
- }
- ib_free_cq(ep->rep_attr.recv_cq);
- ep->rep_attr.recv_cq = NULL;
- ib_free_cq(ep->rep_attr.send_cq);
- ep->rep_attr.send_cq = NULL;
-
- /* The ULP is responsible for ensuring all DMA
- * mappings and MRs are gone.
- */
- rpcrdma_reps_unmap(r_xprt);
- rpcrdma_reqs_reset(r_xprt);
- rpcrdma_mrs_destroy(r_xprt);
- rpcrdma_sendctxs_destroy(r_xprt);
- ib_dealloc_pd(ia->ri_pd);
- ia->ri_pd = NULL;
-
- /* Allow waiters to continue */
- complete(&ia->ri_remove_done);
-
- trace_xprtrdma_remove(r_xprt);
-}
-
-/**
- * rpcrdma_ia_close - Clean up/close an IA.
- * @ia: interface adapter to close
- *
- */
-void
-rpcrdma_ia_close(struct rpcrdma_ia *ia)
-{
- if (ia->ri_id != NULL && !IS_ERR(ia->ri_id)) {
- if (ia->ri_id->qp)
- rdma_destroy_qp(ia->ri_id);
- rdma_destroy_id(ia->ri_id);
- }
- ia->ri_id = NULL;
-
- /* If the pd is still busy, xprtrdma missed freeing a resource */
- if (ia->ri_pd && !IS_ERR(ia->ri_pd))
- ib_dealloc_pd(ia->ri_pd);
- ia->ri_pd = NULL;
+ return kref_put(&ep->re_kref, rpcrdma_ep_put);
}
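
rpcrdma_ep_destroy() above is now just a kref_put(): the release function tears down the QP, CQs and PD, frees the endpoint, and drops the module reference, so whichever of the connect path, the CM event handler, or the disconnect path drops the last reference does the cleanup. A compact sketch of that kref idiom (the demo names are not the xprtrdma ones):

#include <linux/kref.h>
#include <linux/slab.h>

struct demo_ep {
	struct kref ref;
	/* hardware resources would live here */
};

static void demo_ep_release(struct kref *kref)
{
	struct demo_ep *ep = container_of(kref, struct demo_ep, ref);

	/* Runs only once every holder has let go. */
	kfree(ep);
}

static struct demo_ep *demo_ep_create(void)
{
	struct demo_ep *ep = kzalloc(sizeof(*ep), GFP_KERNEL);

	if (ep)
		kref_init(&ep->ref);	/* creator's reference */
	return ep;
}

static int demo_ep_put(struct demo_ep *ep)
{
	/* Returns 1 when the release function ran, 0 otherwise. */
	return kref_put(&ep->ref, demo_ep_release);
}
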
-/**
- * rpcrdma_ep_create - Create unconnected endpoint
- * @r_xprt: transport to instantiate
- *
- * Returns zero on success, or a negative errno.
- */
-int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
+static int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt)
{
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_connect_private *pmsg = &ep->rep_cm_private;
- struct ib_cq *sendcq, *recvcq;
+ struct rpcrdma_connect_private *pmsg;
+ struct ib_device *device;
+ struct rdma_cm_id *id;
+ struct rpcrdma_ep *ep;
int rc;
- ep->rep_max_requests = r_xprt->rx_xprt.max_reqs;
- ep->rep_inline_send = xprt_rdma_max_inline_write;
- ep->rep_inline_recv = xprt_rdma_max_inline_read;
+ ep = kzalloc(sizeof(*ep), GFP_NOFS);
+ if (!ep)
+ return -EAGAIN;
+ ep->re_xprt = &r_xprt->rx_xprt;
+ kref_init(&ep->re_kref);
- rc = frwr_query_device(r_xprt, ia->ri_id->device);
+ id = rpcrdma_create_id(r_xprt, ep);
+ if (IS_ERR(id)) {
+ rc = PTR_ERR(id);
+ goto out_free;
+ }
+ __module_get(THIS_MODULE);
+ device = id->device;
+ ep->re_id = id;
+
+ ep->re_max_requests = r_xprt->rx_xprt.max_reqs;
+ ep->re_inline_send = xprt_rdma_max_inline_write;
+ ep->re_inline_recv = xprt_rdma_max_inline_read;
+ rc = frwr_query_device(ep, device);
if (rc)
- return rc;
- r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->rep_max_requests);
+ goto out_destroy;
+
+ r_xprt->rx_buf.rb_max_requests = cpu_to_be32(ep->re_max_requests);
- ep->rep_attr.event_handler = rpcrdma_qp_event_handler;
- ep->rep_attr.qp_context = ep;
- ep->rep_attr.srq = NULL;
- ep->rep_attr.cap.max_inline_data = 0;
- ep->rep_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
- ep->rep_attr.qp_type = IB_QPT_RC;
- ep->rep_attr.port_num = ~0;
+ ep->re_attr.event_handler = rpcrdma_qp_event_handler;
+ ep->re_attr.qp_context = ep;
+ ep->re_attr.srq = NULL;
+ ep->re_attr.cap.max_inline_data = 0;
+ ep->re_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
+ ep->re_attr.qp_type = IB_QPT_RC;
+ ep->re_attr.port_num = ~0;
dprintk("RPC: %s: requested max: dtos: send %d recv %d; "
"iovs: send %d recv %d\n",
__func__,
- ep->rep_attr.cap.max_send_wr,
- ep->rep_attr.cap.max_recv_wr,
- ep->rep_attr.cap.max_send_sge,
- ep->rep_attr.cap.max_recv_sge);
-
- ep->rep_send_batch = ep->rep_max_requests >> 3;
- ep->rep_send_count = ep->rep_send_batch;
- init_waitqueue_head(&ep->rep_connect_wait);
- ep->rep_receive_count = 0;
-
- sendcq = ib_alloc_cq_any(ia->ri_id->device, r_xprt,
- ep->rep_attr.cap.max_send_wr + 1,
- IB_POLL_WORKQUEUE);
- if (IS_ERR(sendcq)) {
- rc = PTR_ERR(sendcq);
- goto out1;
+ ep->re_attr.cap.max_send_wr,
+ ep->re_attr.cap.max_recv_wr,
+ ep->re_attr.cap.max_send_sge,
+ ep->re_attr.cap.max_recv_sge);
+
+ ep->re_send_batch = ep->re_max_requests >> 3;
+ ep->re_send_count = ep->re_send_batch;
+ init_waitqueue_head(&ep->re_connect_wait);
+
+ ep->re_attr.send_cq = ib_alloc_cq_any(device, r_xprt,
+ ep->re_attr.cap.max_send_wr,
+ IB_POLL_WORKQUEUE);
+ if (IS_ERR(ep->re_attr.send_cq)) {
+ rc = PTR_ERR(ep->re_attr.send_cq);
+ goto out_destroy;
}
- recvcq = ib_alloc_cq_any(ia->ri_id->device, NULL,
- ep->rep_attr.cap.max_recv_wr + 1,
- IB_POLL_WORKQUEUE);
- if (IS_ERR(recvcq)) {
- rc = PTR_ERR(recvcq);
- goto out2;
+ ep->re_attr.recv_cq = ib_alloc_cq_any(device, r_xprt,
+ ep->re_attr.cap.max_recv_wr,
+ IB_POLL_WORKQUEUE);
+ if (IS_ERR(ep->re_attr.recv_cq)) {
+ rc = PTR_ERR(ep->re_attr.recv_cq);
+ goto out_destroy;
}
-
- ep->rep_attr.send_cq = sendcq;
- ep->rep_attr.recv_cq = recvcq;
+ ep->re_receive_count = 0;
/* Initialize cma parameters */
- memset(&ep->rep_remote_cma, 0, sizeof(ep->rep_remote_cma));
+ memset(&ep->re_remote_cma, 0, sizeof(ep->re_remote_cma));
/* Prepare RDMA-CM private message */
+ pmsg = &ep->re_cm_private;
pmsg->cp_magic = rpcrdma_cmp_magic;
pmsg->cp_version = RPCRDMA_CMP_VERSION;
pmsg->cp_flags |= RPCRDMA_CMP_F_SND_W_INV_OK;
- pmsg->cp_send_size = rpcrdma_encode_buffer_size(ep->rep_inline_send);
- pmsg->cp_recv_size = rpcrdma_encode_buffer_size(ep->rep_inline_recv);
- ep->rep_remote_cma.private_data = pmsg;
- ep->rep_remote_cma.private_data_len = sizeof(*pmsg);
+ pmsg->cp_send_size = rpcrdma_encode_buffer_size(ep->re_inline_send);
+ pmsg->cp_recv_size = rpcrdma_encode_buffer_size(ep->re_inline_recv);
+ ep->re_remote_cma.private_data = pmsg;
+ ep->re_remote_cma.private_data_len = sizeof(*pmsg);
/* Client offers RDMA Read but does not initiate */
- ep->rep_remote_cma.initiator_depth = 0;
- ep->rep_remote_cma.responder_resources =
- min_t(int, U8_MAX, ia->ri_id->device->attrs.max_qp_rd_atom);
+ ep->re_remote_cma.initiator_depth = 0;
+ ep->re_remote_cma.responder_resources =
+ min_t(int, U8_MAX, device->attrs.max_qp_rd_atom);
/* Limit transport retries so client can detect server
* GID changes quickly. RPC layer handles re-establishing
* transport connection and retransmission.
*/
- ep->rep_remote_cma.retry_count = 6;
+ ep->re_remote_cma.retry_count = 6;
/* RPC-over-RDMA handles its own flow control. In addition,
* make all RNR NAKs visible so we know that RPC-over-RDMA
* flow control is working correctly (no NAKs should be seen).
*/
- ep->rep_remote_cma.flow_control = 0;
- ep->rep_remote_cma.rnr_retry_count = 0;
+ ep->re_remote_cma.flow_control = 0;
+ ep->re_remote_cma.rnr_retry_count = 0;
- return 0;
-
-out2:
- ib_free_cq(sendcq);
-out1:
- return rc;
-}
-
-/**
- * rpcrdma_ep_destroy - Disconnect and destroy endpoint.
- * @r_xprt: transport instance to shut down
- *
- */
-void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt)
-{
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
-
- if (ia->ri_id && ia->ri_id->qp) {
- rpcrdma_ep_disconnect(ep, ia);
- rdma_destroy_qp(ia->ri_id);
- ia->ri_id->qp = NULL;
- }
-
- if (ep->rep_attr.recv_cq)
- ib_free_cq(ep->rep_attr.recv_cq);
- if (ep->rep_attr.send_cq)
- ib_free_cq(ep->rep_attr.send_cq);
-}
-
-/* Re-establish a connection after a device removal event.
- * Unlike a normal reconnection, a fresh PD and a new set
- * of MRs and buffers is needed.
- */
-static int rpcrdma_ep_recreate_xprt(struct rpcrdma_xprt *r_xprt,
- struct ib_qp_init_attr *qp_init_attr)
-{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
- int rc, err;
-
- trace_xprtrdma_reinsert(r_xprt);
-
- rc = -EHOSTUNREACH;
- if (rpcrdma_ia_open(r_xprt))
- goto out1;
-
- rc = -ENOMEM;
- err = rpcrdma_ep_create(r_xprt);
- if (err) {
- pr_err("rpcrdma: rpcrdma_ep_create returned %d\n", err);
- goto out2;
- }
- memcpy(qp_init_attr, &ep->rep_attr, sizeof(*qp_init_attr));
-
- rc = -ENETUNREACH;
- err = rdma_create_qp(ia->ri_id, ia->ri_pd, qp_init_attr);
- if (err) {
- pr_err("rpcrdma: rdma_create_qp returned %d\n", err);
- goto out3;
- }
- return 0;
-
-out3:
- rpcrdma_ep_destroy(r_xprt);
-out2:
- rpcrdma_ia_close(ia);
-out1:
- return rc;
-}
-
-static int rpcrdma_ep_reconnect(struct rpcrdma_xprt *r_xprt,
- struct ib_qp_init_attr *qp_init_attr)
-{
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rdma_cm_id *id, *old;
- int err, rc;
-
- rpcrdma_ep_disconnect(&r_xprt->rx_ep, ia);
-
- rc = -EHOSTUNREACH;
- id = rpcrdma_create_id(r_xprt, ia);
- if (IS_ERR(id))
- goto out;
-
- /* As long as the new ID points to the same device as the
- * old ID, we can reuse the transport's existing PD and all
- * previously allocated MRs. Also, the same device means
- * the transport's previous DMA mappings are still valid.
- *
- * This is a sanity check only. There should be no way these
- * point to two different devices here.
- */
- old = id;
- rc = -ENETUNREACH;
- if (ia->ri_id->device != id->device) {
- pr_err("rpcrdma: can't reconnect on different device!\n");
+ ep->re_pd = ib_alloc_pd(device, 0);
+ if (IS_ERR(ep->re_pd)) {
+ rc = PTR_ERR(ep->re_pd);
goto out_destroy;
}
- err = rdma_create_qp(id, ia->ri_pd, qp_init_attr);
- if (err)
+ rc = rdma_create_qp(id, ep->re_pd, &ep->re_attr);
+ if (rc)
goto out_destroy;
- /* Atomically replace the transport's ID and QP. */
- rc = 0;
- old = ia->ri_id;
- ia->ri_id = id;
- rdma_destroy_qp(old);
+ r_xprt->rx_ep = ep;
+ return 0;
out_destroy:
- rdma_destroy_id(old);
-out:
+ rpcrdma_ep_destroy(ep);
+ rdma_destroy_id(id);
+out_free:
+ kfree(ep);
+ r_xprt->rx_ep = NULL;
return rc;
}
-/*
- * Connect unconnected endpoint.
+/**
+ * rpcrdma_xprt_connect - Connect an unconnected transport
+ * @r_xprt: controlling transport instance
+ *
+ * Returns 0 on success or a negative errno.
*/
-int
-rpcrdma_ep_connect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt)
{
- struct rpcrdma_xprt *r_xprt = container_of(ia, struct rpcrdma_xprt,
- rx_ia);
struct rpc_xprt *xprt = &r_xprt->rx_xprt;
- struct ib_qp_init_attr qp_init_attr;
+ struct rpcrdma_ep *ep;
int rc;
retry:
- memcpy(&qp_init_attr, &ep->rep_attr, sizeof(qp_init_attr));
- switch (ep->rep_connected) {
- case 0:
- rc = rdma_create_qp(ia->ri_id, ia->ri_pd, &qp_init_attr);
- if (rc) {
- rc = -ENETUNREACH;
- goto out_noupdate;
- }
- break;
- case -ENODEV:
- rc = rpcrdma_ep_recreate_xprt(r_xprt, &qp_init_attr);
- if (rc)
- goto out_noupdate;
- break;
- default:
- rc = rpcrdma_ep_reconnect(r_xprt, &qp_init_attr);
- if (rc)
- goto out;
- }
+ rpcrdma_xprt_disconnect(r_xprt);
+ rc = rpcrdma_ep_create(r_xprt);
+ if (rc)
+ return rc;
+ ep = r_xprt->rx_ep;
- ep->rep_connected = 0;
+ ep->re_connect_status = 0;
xprt_clear_connected(xprt);
rpcrdma_reset_cwnd(r_xprt);
@@ -699,64 +529,68 @@ retry:
if (rc)
goto out;
- rc = rdma_connect(ia->ri_id, &ep->rep_remote_cma);
+ rc = rdma_connect(ep->re_id, &ep->re_remote_cma);
if (rc)
goto out;
if (xprt->reestablish_timeout < RPCRDMA_INIT_REEST_TO)
xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
- wait_event_interruptible(ep->rep_connect_wait, ep->rep_connected != 0);
- if (ep->rep_connected <= 0) {
- if (ep->rep_connected == -EAGAIN)
+ wait_event_interruptible(ep->re_connect_wait,
+ ep->re_connect_status != 0);
+ if (ep->re_connect_status <= 0) {
+ if (ep->re_connect_status == -EAGAIN)
goto retry;
- rc = ep->rep_connected;
+ rc = ep->re_connect_status;
goto out;
}
rc = rpcrdma_reqs_setup(r_xprt);
if (rc) {
- rpcrdma_ep_disconnect(ep, ia);
+ rpcrdma_xprt_disconnect(r_xprt);
goto out;
}
rpcrdma_mrs_create(r_xprt);
out:
if (rc)
- ep->rep_connected = rc;
-
-out_noupdate:
+ ep->re_connect_status = rc;
trace_xprtrdma_connect(r_xprt, rc);
return rc;
}
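
For context, a minimal caller-side sketch, assuming the usual delayed-work connect pattern; the function below is illustrative and not part of this patch. The consolidated rpcrdma_xprt_connect() hides the old three-way switch on rep_connected (fresh QP, -ENODEV recreate, reconnect), so a connect worker only needs to check the return value:

/* Illustrative sketch only -- not from this patch. */
static void xprt_rdma_connect_worker_sketch(struct work_struct *work)
{
	struct rpcrdma_xprt *r_xprt = container_of(work, struct rpcrdma_xprt,
						   rx_connect_worker.work);
	int rc;

	rc = rpcrdma_xprt_connect(r_xprt);
	if (rc)
		pr_debug("rpcrdma: connect failed (%d), will retry\n", rc);
}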
/**
- * rpcrdma_ep_disconnect - Disconnect underlying transport
- * @ep: endpoint to disconnect
- * @ia: associated interface adapter
+ * rpcrdma_xprt_disconnect - Disconnect underlying transport
+ * @r_xprt: controlling transport instance
*
* Caller serializes. Either the transport send lock is held,
* or we're being called to destroy the transport.
+ *
+ * On return, @r_xprt is completely divested of all hardware
+ * resources and prepared for the next ->connect operation.
*/
-void
-rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
+void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt)
{
- struct rpcrdma_xprt *r_xprt = container_of(ep, struct rpcrdma_xprt,
- rx_ep);
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
+ struct rdma_cm_id *id;
int rc;
- /* returns without wait if ID is not connected */
- rc = rdma_disconnect(ia->ri_id);
- if (!rc)
- wait_event_interruptible(ep->rep_connect_wait,
- ep->rep_connected != 1);
- else
- ep->rep_connected = rc;
+ if (!ep)
+ return;
+
+ id = ep->re_id;
+ rc = rdma_disconnect(id);
trace_xprtrdma_disconnect(r_xprt, rc);
rpcrdma_xprt_drain(r_xprt);
+ rpcrdma_reps_unmap(r_xprt);
rpcrdma_reqs_reset(r_xprt);
rpcrdma_mrs_destroy(r_xprt);
rpcrdma_sendctxs_destroy(r_xprt);
+
+ if (rpcrdma_ep_destroy(ep))
+ rdma_destroy_id(id);
+
+ r_xprt->rx_ep = NULL;
}
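
Because rpcrdma_xprt_disconnect() returns early when rx_ep is NULL and drops the endpoint's final reference itself, teardown paths can invoke it unconditionally. A minimal sketch, assuming a typical destroy path (the function name and the follow-on buffer teardown call are illustrative, not taken from this patch):

/* Illustrative sketch only. */
static void xprt_rdma_destroy_sketch(struct rpcrdma_xprt *r_xprt)
{
	rpcrdma_xprt_disconnect(r_xprt);		/* no-op if never connected */
	rpcrdma_buffer_destroy(&r_xprt->rx_buf);	/* assumed follow-up teardown */
}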
/* Fixed-size circular FIFO queue. This implementation is wait-free and
@@ -793,7 +627,7 @@ static struct rpcrdma_sendctx *rpcrdma_sendctx_create(struct rpcrdma_ep *ep)
{
struct rpcrdma_sendctx *sc;
- sc = kzalloc(struct_size(sc, sc_sges, ep->rep_attr.cap.max_send_sge),
+ sc = kzalloc(struct_size(sc, sc_sges, ep->re_attr.cap.max_send_sge),
GFP_KERNEL);
if (!sc)
return NULL;
@@ -813,14 +647,14 @@ static int rpcrdma_sendctxs_create(struct rpcrdma_xprt *r_xprt)
* the ->send_request call to fail temporarily before too many
* Sends are posted.
*/
- i = r_xprt->rx_ep.rep_max_requests + RPCRDMA_MAX_BC_REQUESTS;
+ i = r_xprt->rx_ep->re_max_requests + RPCRDMA_MAX_BC_REQUESTS;
buf->rb_sc_ctxs = kcalloc(i, sizeof(sc), GFP_KERNEL);
if (!buf->rb_sc_ctxs)
return -ENOMEM;
buf->rb_sc_last = i - 1;
for (i = 0; i <= buf->rb_sc_last; i++) {
- sc = rpcrdma_sendctx_create(&r_xprt->rx_ep);
+ sc = rpcrdma_sendctx_create(r_xprt->rx_ep);
if (!sc)
return -ENOMEM;
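
The sendctx array sized above backs the "fixed-size circular FIFO" mentioned in the comment. A generic single-producer/single-consumer sketch of that idea, with illustrative field names rather than the xprtrdma ones:

/* Generic circular FIFO sketch; not the xprtrdma implementation. */
struct sc_queue_sketch {
	unsigned long	head;	/* next slot to fill (producer only) */
	unsigned long	tail;	/* next slot to drain (consumer only) */
	unsigned long	last;	/* highest valid index, i.e. size - 1 */
	void		**slots;
};

static void *sc_dequeue_sketch(struct sc_queue_sketch *q)
{
	void *item;

	if (q->head == q->tail)
		return NULL;				/* empty */
	item = q->slots[q->tail];
	q->tail = (q->tail == q->last) ? 0 : q->tail + 1;
	return item;
}

Because each side only ever writes its own index, no lock is needed; sizing the array to re_max_requests plus the backchannel slots gives ->send_request room to fail temporarily before too many Sends are posted, as the comment above notes.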
@@ -924,10 +758,10 @@ static void
rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
unsigned int count;
- for (count = 0; count < ia->ri_max_rdma_segs; count++) {
+ for (count = 0; count < ep->re_max_rdma_segs; count++) {
struct rpcrdma_mr *mr;
int rc;
@@ -935,14 +769,12 @@ rpcrdma_mrs_create(struct rpcrdma_xprt *r_xprt)
if (!mr)
break;
- rc = frwr_init_mr(ia, mr);
+ rc = frwr_mr_init(r_xprt, mr);
if (rc) {
kfree(mr);
break;
}
- mr->mr_xprt = r_xprt;
-
spin_lock(&buf->rb_lock);
rpcrdma_mr_push(mr, &buf->rb_mrs);
list_add(&mr->mr_all, &buf->rb_all_mrs);
@@ -973,12 +805,12 @@ rpcrdma_mr_refresh_worker(struct work_struct *work)
void rpcrdma_mrs_refresh(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
- /* If there is no underlying device, it's no use to
- * wake the refresh worker.
+ /* If there is no underlying connection, it's no use
+ * to wake the refresh worker.
*/
- if (ep->rep_connected != -ENODEV) {
+ if (ep->re_connect_status == 1) {
/* The work is scheduled on a WQ_MEM_RECLAIM
* workqueue in order to prevent MR allocation
* from recursing into NFS during direct reclaim.
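
For readers unfamiliar with that constraint, a generic sketch of the pattern (workqueue and function names here are illustrative, not the sunrpc ones): work needed to make forward progress under memory pressure must run on a WQ_MEM_RECLAIM workqueue, which keeps a rescuer thread so the work item never waits on a new worker being allocated.

#include <linux/workqueue.h>

static struct workqueue_struct *example_wq;	/* hypothetical */

static void example_mr_refresh_fn(struct work_struct *work)
{
	/* allocate and register fresh MRs here */
}

static int example_init(void)
{
	example_wq = alloc_workqueue("example_mr_refresh", WQ_MEM_RECLAIM, 0);
	return example_wq ? 0 : -ENOMEM;
}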
@@ -1042,7 +874,7 @@ int rpcrdma_req_setup(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
/* Compute maximum header buffer size in bytes */
maxhdrsize = rpcrdma_fixed_maxsz + 3 +
- r_xprt->rx_ia.ri_max_rdma_segs * rpcrdma_readchunk_maxsz;
+ r_xprt->rx_ep->re_max_rdma_segs * rpcrdma_readchunk_maxsz;
maxhdrsize *= sizeof(__be32);
rb = rpcrdma_regbuf_alloc(__roundup_pow_of_two(maxhdrsize),
DMA_TO_DEVICE, GFP_KERNEL);
@@ -1120,7 +952,7 @@ struct rpcrdma_rep *rpcrdma_rep_create(struct rpcrdma_xprt *r_xprt,
if (rep == NULL)
goto out;
- rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep.rep_inline_recv,
+ rep->rr_rdmabuf = rpcrdma_regbuf_alloc(r_xprt->rx_ep->re_inline_recv,
DMA_FROM_DEVICE, GFP_KERNEL);
if (!rep->rr_rdmabuf)
goto out_free;
@@ -1345,7 +1177,7 @@ void rpcrdma_mr_put(struct rpcrdma_mr *mr)
if (mr->mr_dir != DMA_NONE) {
trace_xprtrdma_mr_unmap(mr);
- ib_dma_unmap_sg(r_xprt->rx_ia.ri_id->device,
+ ib_dma_unmap_sg(r_xprt->rx_ep->re_id->device,
mr->mr_sg, mr->mr_nents, mr->mr_dir);
mr->mr_dir = DMA_NONE;
}
@@ -1463,7 +1295,7 @@ bool rpcrdma_regbuf_realloc(struct rpcrdma_regbuf *rb, size_t size, gfp_t flags)
bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_regbuf *rb)
{
- struct ib_device *device = r_xprt->rx_ia.ri_id->device;
+ struct ib_device *device = r_xprt->rx_ep->re_id->device;
if (rb->rg_direction == DMA_NONE)
return false;
@@ -1476,7 +1308,7 @@ bool __rpcrdma_regbuf_dma_map(struct rpcrdma_xprt *r_xprt,
}
rb->rg_device = device;
- rb->rg_iov.lkey = r_xprt->rx_ia.ri_pd->local_dma_lkey;
+ rb->rg_iov.lkey = r_xprt->rx_ep->re_pd->local_dma_lkey;
return true;
}
@@ -1502,31 +1334,28 @@ static void rpcrdma_regbuf_free(struct rpcrdma_regbuf *rb)
}
/**
- * rpcrdma_ep_post - Post WRs to a transport's Send Queue
- * @ia: transport's device information
- * @ep: transport's RDMA endpoint information
+ * rpcrdma_post_sends - Post WRs to a transport's Send Queue
+ * @r_xprt: controlling transport instance
* @req: rpcrdma_req containing the Send WR to post
*
* Returns 0 if the post was successful, otherwise -ENOTCONN
* is returned.
*/
-int
-rpcrdma_ep_post(struct rpcrdma_ia *ia,
- struct rpcrdma_ep *ep,
- struct rpcrdma_req *req)
+int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
{
struct ib_send_wr *send_wr = &req->rl_wr;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
int rc;
- if (!ep->rep_send_count || kref_read(&req->rl_kref) > 1) {
+ if (!ep->re_send_count || kref_read(&req->rl_kref) > 1) {
send_wr->send_flags |= IB_SEND_SIGNALED;
- ep->rep_send_count = ep->rep_send_batch;
+ ep->re_send_count = ep->re_send_batch;
} else {
send_wr->send_flags &= ~IB_SEND_SIGNALED;
- --ep->rep_send_count;
+ --ep->re_send_count;
}
- rc = frwr_send(ia, req);
+ rc = frwr_send(r_xprt, req);
trace_xprtrdma_post_send(req, rc);
if (rc)
return -ENOTCONN;
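
The renamed fields keep the existing completion-batching scheme: only every re_send_batch-th Send carries IB_SEND_SIGNALED (or any Send whose request is still referenced elsewhere), which bounds how many unsignaled WRs can sit on the Send Queue between completions. A standalone sketch of the same pattern, with an assumed batch size:

/* Generic signaled-send batching sketch; not the xprtrdma code. */
#include <rdma/ib_verbs.h>

#define SEND_BATCH_SKETCH	16	/* assumed batch size */

static void mark_send_signaled(struct ib_send_wr *wr, unsigned int *countdown)
{
	if (*countdown == 0) {
		wr->send_flags |= IB_SEND_SIGNALED;	/* request a CQE */
		*countdown = SEND_BATCH_SKETCH;		/* restart the window */
	} else {
		wr->send_flags &= ~IB_SEND_SIGNALED;	/* suppress the CQE */
		--(*countdown);
	}
}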
@@ -1542,7 +1371,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
- struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ struct rpcrdma_ep *ep = r_xprt->rx_ep;
struct ib_recv_wr *wr, *bad_wr;
struct rpcrdma_rep *rep;
int needed, count, rc;
@@ -1551,9 +1380,9 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
count = 0;
needed = buf->rb_credits + (buf->rb_bc_srv_max_requests << 1);
- if (likely(ep->rep_receive_count > needed))
+ if (likely(ep->re_receive_count > needed))
goto out;
- needed -= ep->rep_receive_count;
+ needed -= ep->re_receive_count;
if (!temp)
needed += RPCRDMA_MAX_RECV_BATCH;
@@ -1579,7 +1408,7 @@ void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp)
if (!wr)
goto out;
- rc = ib_post_recv(r_xprt->rx_ia.ri_id->qp, wr,
+ rc = ib_post_recv(ep->re_id->qp, wr,
(const struct ib_recv_wr **)&bad_wr);
out:
trace_xprtrdma_post_recvs(r_xprt, count, rc);
@@ -1593,6 +1422,6 @@ out:
--count;
}
}
- ep->rep_receive_count += count;
+ ep->re_receive_count += count;
return;
}
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index 37d5080c250b..0a16fdb09b2c 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -65,43 +65,33 @@
#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ)
/*
- * Interface Adapter -- one per transport instance
+ * RDMA Endpoint -- connection endpoint details
*/
-struct rpcrdma_ia {
- struct rdma_cm_id *ri_id;
- struct ib_pd *ri_pd;
- int ri_async_rc;
- unsigned int ri_max_rdma_segs;
- unsigned int ri_max_frwr_depth;
- bool ri_implicit_roundup;
- enum ib_mr_type ri_mrtype;
- unsigned long ri_flags;
- struct completion ri_done;
- struct completion ri_remove_done;
-};
-
-enum {
- RPCRDMA_IAF_REMOVING = 0,
-};
-
-/*
- * RDMA Endpoint -- one per transport instance
- */
-
struct rpcrdma_ep {
- unsigned int rep_send_count;
- unsigned int rep_send_batch;
- unsigned int rep_max_inline_send;
- unsigned int rep_max_inline_recv;
- int rep_connected;
- struct ib_qp_init_attr rep_attr;
- wait_queue_head_t rep_connect_wait;
- struct rpcrdma_connect_private rep_cm_private;
- struct rdma_conn_param rep_remote_cma;
- unsigned int rep_max_requests; /* depends on device */
- unsigned int rep_inline_send; /* negotiated */
- unsigned int rep_inline_recv; /* negotiated */
- int rep_receive_count;
+ struct kref re_kref;
+ struct rdma_cm_id *re_id;
+ struct ib_pd *re_pd;
+ unsigned int re_max_rdma_segs;
+ unsigned int re_max_fr_depth;
+ bool re_implicit_roundup;
+ enum ib_mr_type re_mrtype;
+ struct completion re_done;
+ unsigned int re_send_count;
+ unsigned int re_send_batch;
+ unsigned int re_max_inline_send;
+ unsigned int re_max_inline_recv;
+ int re_async_rc;
+ int re_connect_status;
+ struct ib_qp_init_attr re_attr;
+ wait_queue_head_t re_connect_wait;
+ struct rpc_xprt *re_xprt;
+ struct rpcrdma_connect_private
+ re_cm_private;
+ struct rdma_conn_param re_remote_cma;
+ int re_receive_count;
+ unsigned int re_max_requests; /* depends on device */
+ unsigned int re_inline_send; /* negotiated */
+ unsigned int re_inline_recv; /* negotiated */
};
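
The merged structure is now reference counted through re_kref, which is why rpcrdma_xprt_disconnect() above only destroys the cm_id when rpcrdma_ep_destroy() reports the final put. A minimal sketch of that lifetime rule (the release function and helper names are illustrative, and the real teardown order in the patch may differ):

#include <linux/kref.h>

static void rpcrdma_ep_release_sketch(struct kref *kref)
{
	struct rpcrdma_ep *ep = container_of(kref, struct rpcrdma_ep, re_kref);

	ib_dealloc_pd(ep->re_pd);	/* assumed: drop the PD with the ep */
	kfree(ep);
}

/* Returns nonzero only on the final put, letting the caller know the
 * endpoint (and anything keyed to it) is really gone.
 */
static int rpcrdma_ep_put_sketch(struct rpcrdma_ep *ep)
{
	return kref_put(&ep->re_kref, rpcrdma_ep_release_sketch);
}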
/* Pre-allocate extra Work Requests for handling backward receives
@@ -422,8 +412,7 @@ struct rpcrdma_stats {
*/
struct rpcrdma_xprt {
struct rpc_xprt rx_xprt;
- struct rpcrdma_ia rx_ia;
- struct rpcrdma_ep rx_ep;
+ struct rpcrdma_ep *rx_ep;
struct rpcrdma_buffer rx_buf;
struct delayed_work rx_connect_worker;
struct rpc_timeout rx_timeout;
@@ -455,22 +444,13 @@ extern int xprt_rdma_pad_optimize;
extern unsigned int xprt_rdma_memreg_strategy;
/*
- * Interface Adapter calls - xprtrdma/verbs.c
- */
-int rpcrdma_ia_open(struct rpcrdma_xprt *xprt);
-void rpcrdma_ia_remove(struct rpcrdma_ia *ia);
-void rpcrdma_ia_close(struct rpcrdma_ia *);
-
-/*
* Endpoint calls - xprtrdma/verbs.c
*/
-int rpcrdma_ep_create(struct rpcrdma_xprt *r_xprt);
-void rpcrdma_ep_destroy(struct rpcrdma_xprt *r_xprt);
-int rpcrdma_ep_connect(struct rpcrdma_ep *, struct rpcrdma_ia *);
-void rpcrdma_ep_disconnect(struct rpcrdma_ep *, struct rpcrdma_ia *);
+void rpcrdma_flush_disconnect(struct ib_cq *cq, struct ib_wc *wc);
+int rpcrdma_xprt_connect(struct rpcrdma_xprt *r_xprt);
+void rpcrdma_xprt_disconnect(struct rpcrdma_xprt *r_xprt);
-int rpcrdma_ep_post(struct rpcrdma_ia *, struct rpcrdma_ep *,
- struct rpcrdma_req *);
+int rpcrdma_post_sends(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
void rpcrdma_post_recvs(struct rpcrdma_xprt *r_xprt, bool temp);
/*
@@ -536,15 +516,14 @@ rpcrdma_data_dir(bool writing)
/* Memory registration calls xprtrdma/frwr_ops.c
*/
void frwr_reset(struct rpcrdma_req *req);
-int frwr_query_device(struct rpcrdma_xprt *r_xprt,
- const struct ib_device *device);
-int frwr_init_mr(struct rpcrdma_ia *ia, struct rpcrdma_mr *mr);
+int frwr_query_device(struct rpcrdma_ep *ep, const struct ib_device *device);
+int frwr_mr_init(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr *mr);
void frwr_release_mr(struct rpcrdma_mr *mr);
struct rpcrdma_mr_seg *frwr_map(struct rpcrdma_xprt *r_xprt,
struct rpcrdma_mr_seg *seg,
int nsegs, bool writing, __be32 xid,
struct rpcrdma_mr *mr);
-int frwr_send(struct rpcrdma_ia *ia, struct rpcrdma_req *req);
+int frwr_send(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
void frwr_reminv(struct rpcrdma_rep *rep, struct list_head *mrs);
void frwr_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
void frwr_unmap_async(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req);
@@ -569,7 +548,7 @@ int rpcrdma_prepare_send_sges(struct rpcrdma_xprt *r_xprt,
enum rpcrdma_chunktype rtype);
void rpcrdma_sendctx_unmap(struct rpcrdma_sendctx *sc);
int rpcrdma_marshal_req(struct rpcrdma_xprt *r_xprt, struct rpc_rqst *rqst);
-void rpcrdma_set_max_header_sizes(struct rpcrdma_xprt *);
+void rpcrdma_set_max_header_sizes(struct rpcrdma_ep *ep);
void rpcrdma_reset_cwnd(struct rpcrdma_xprt *r_xprt);
void rpcrdma_complete_rqst(struct rpcrdma_rep *rep);
void rpcrdma_reply_handler(struct rpcrdma_rep *rep);
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 17cb902e5153..0bda8a73e8a8 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -1861,7 +1861,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport)
struct rpc_xprt *xprt = &transport->xprt;
struct file *filp;
struct socket *sock;
- int status = -EIO;
+ int status;
status = __sock_create(xprt->xprt_net, AF_LOCAL,
SOCK_STREAM, 0, &sock, 1);