From 6278c86a6cc14fdc66988958bbefec3407fa56a0 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 1 Oct 2024 16:33:40 -0400 Subject: NFS: Clean up locking the nfs_versions list This patch replaces the nfs_version_mutex and nfs_version_lock with a single RW lock that protects access to the nfs_versions list. The mutex around request_module() seemed unnecessary to me, and I couldn't find any other callers using a lock around calls to request_module() when I looked. At the same time, I saw fs/filesystems.c using a RW lock to protect their filesystems list. This seems like a better idea than a spinlock to me, so I'm also making that change while I'm here. Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 03ecc7765615..8bcc61a61327 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -55,8 +55,7 @@ #define NFSDBG_FACILITY NFSDBG_CLIENT static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq); -static DEFINE_SPINLOCK(nfs_version_lock); -static DEFINE_MUTEX(nfs_version_mutex); +static DEFINE_RWLOCK(nfs_version_lock); static LIST_HEAD(nfs_versions); /* @@ -79,16 +78,16 @@ const struct rpc_program nfs_program = { static struct nfs_subversion *find_nfs_version(unsigned int version) { struct nfs_subversion *nfs; - spin_lock(&nfs_version_lock); + read_lock(&nfs_version_lock); list_for_each_entry(nfs, &nfs_versions, list) { if (nfs->rpc_ops->version == version) { - spin_unlock(&nfs_version_lock); + read_unlock(&nfs_version_lock); return nfs; } } - spin_unlock(&nfs_version_lock); + read_unlock(&nfs_version_lock); return ERR_PTR(-EPROTONOSUPPORT); } @@ -97,10 +96,8 @@ struct nfs_subversion *get_nfs_version(unsigned int version) struct nfs_subversion *nfs = find_nfs_version(version); if (IS_ERR(nfs)) { - mutex_lock(&nfs_version_mutex); request_module("nfsv%d", version); nfs = find_nfs_version(version); - mutex_unlock(&nfs_version_mutex); } if (!IS_ERR(nfs) && !try_module_get(nfs->owner)) @@ -115,23 +112,23 @@ void put_nfs_version(struct nfs_subversion *nfs) void register_nfs_version(struct nfs_subversion *nfs) { - spin_lock(&nfs_version_lock); + write_lock(&nfs_version_lock); list_add(&nfs->list, &nfs_versions); nfs_version[nfs->rpc_ops->version] = nfs->rpc_vers; - spin_unlock(&nfs_version_lock); + write_unlock(&nfs_version_lock); } EXPORT_SYMBOL_GPL(register_nfs_version); void unregister_nfs_version(struct nfs_subversion *nfs) { - spin_lock(&nfs_version_lock); + write_lock(&nfs_version_lock); nfs_version[nfs->rpc_ops->version] = NULL; list_del(&nfs->list); - spin_unlock(&nfs_version_lock); + write_unlock(&nfs_version_lock); } EXPORT_SYMBOL_GPL(unregister_nfs_version); -- cgit v1.2.3 From 11eb537fd85186cdc8654fd1a15efacb5141f90c Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 1 Oct 2024 16:33:41 -0400 Subject: NFS: Convert the NFS module list into an array Using a linked list here seems unnecessarily complex, especially since possible index values are '2', '3', and '4'. Let's just use an array for direct access. Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 26 ++++++++++++++------------ fs/nfs/nfs.h | 1 - 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 8bcc61a61327..5b547d0afa18 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -56,7 +56,12 @@ static DECLARE_WAIT_QUEUE_HEAD(nfs_client_active_wq); static DEFINE_RWLOCK(nfs_version_lock); -static LIST_HEAD(nfs_versions); + +static struct nfs_subversion *nfs_version_mods[5] = { + [2] = NULL, + [3] = NULL, + [4] = NULL, +}; /* * RPC cruft for NFS @@ -78,17 +83,14 @@ const struct rpc_program nfs_program = { static struct nfs_subversion *find_nfs_version(unsigned int version) { struct nfs_subversion *nfs; - read_lock(&nfs_version_lock); - - list_for_each_entry(nfs, &nfs_versions, list) { - if (nfs->rpc_ops->version == version) { - read_unlock(&nfs_version_lock); - return nfs; - } - } + read_lock(&nfs_version_lock); + nfs = nfs_version_mods[version]; read_unlock(&nfs_version_lock); - return ERR_PTR(-EPROTONOSUPPORT); + + if (nfs == NULL) + return ERR_PTR(-EPROTONOSUPPORT); + return nfs; } struct nfs_subversion *get_nfs_version(unsigned int version) @@ -114,7 +116,7 @@ void register_nfs_version(struct nfs_subversion *nfs) { write_lock(&nfs_version_lock); - list_add(&nfs->list, &nfs_versions); + nfs_version_mods[nfs->rpc_ops->version] = nfs; nfs_version[nfs->rpc_ops->version] = nfs->rpc_vers; write_unlock(&nfs_version_lock); @@ -126,7 +128,7 @@ void unregister_nfs_version(struct nfs_subversion *nfs) write_lock(&nfs_version_lock); nfs_version[nfs->rpc_ops->version] = NULL; - list_del(&nfs->list); + nfs_version_mods[nfs->rpc_ops->version] = NULL; write_unlock(&nfs_version_lock); } diff --git a/fs/nfs/nfs.h b/fs/nfs/nfs.h index 0d3ce0460e35..0329fc3023d0 100644 --- a/fs/nfs/nfs.h +++ b/fs/nfs/nfs.h @@ -19,7 +19,6 @@ struct nfs_subversion { const struct nfs_rpc_ops *rpc_ops; /* NFS operations */ const struct super_operations *sops; /* NFS Super operations */ const struct xattr_handler * const *xattr; /* NFS xattr handlers */ - struct list_head list; /* List of NFS versions */ }; struct nfs_subversion *get_nfs_version(unsigned int); -- cgit v1.2.3 From df50b5ee0564412e4570abae7767d9baa5911f23 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 1 Oct 2024 16:33:42 -0400 Subject: NFS: Rename get_nfs_version() -> find_nfs_version() We have a put_nfs_version() that handles refcounting on the nfs version module, but get_nfs_version() does much more work to find a version module based on version number. Let's change 'get' to 'find' to better match what it's doing. Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 8 ++++---- fs/nfs/fs_context.c | 2 +- fs/nfs/nfs.h | 2 +- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 5b547d0afa18..27f862490f82 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -80,7 +80,7 @@ const struct rpc_program nfs_program = { .pipe_dir_name = NFS_PIPE_DIRNAME, }; -static struct nfs_subversion *find_nfs_version(unsigned int version) +static struct nfs_subversion *__find_nfs_version(unsigned int version) { struct nfs_subversion *nfs; @@ -93,13 +93,13 @@ static struct nfs_subversion *find_nfs_version(unsigned int version) return nfs; } -struct nfs_subversion *get_nfs_version(unsigned int version) +struct nfs_subversion *find_nfs_version(unsigned int version) { - struct nfs_subversion *nfs = find_nfs_version(version); + struct nfs_subversion *nfs = __find_nfs_version(version); if (IS_ERR(nfs)) { request_module("nfsv%d", version); - nfs = find_nfs_version(version); + nfs = __find_nfs_version(version); } if (!IS_ERR(nfs) && !try_module_get(nfs->owner)) diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index 7e000d782e28..d553daa4c09c 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -1467,7 +1467,7 @@ static int nfs_fs_context_validate(struct fs_context *fc) /* Load the NFS protocol module if we haven't done so yet */ if (!ctx->nfs_mod) { - nfs_mod = get_nfs_version(ctx->version); + nfs_mod = find_nfs_version(ctx->version); if (IS_ERR(nfs_mod)) { ret = PTR_ERR(nfs_mod); goto out_version_unavailable; diff --git a/fs/nfs/nfs.h b/fs/nfs/nfs.h index 0329fc3023d0..a30bf8ef79d7 100644 --- a/fs/nfs/nfs.h +++ b/fs/nfs/nfs.h @@ -21,7 +21,7 @@ struct nfs_subversion { const struct xattr_handler * const *xattr; /* NFS xattr handlers */ }; -struct nfs_subversion *get_nfs_version(unsigned int); +struct nfs_subversion *find_nfs_version(unsigned int); void put_nfs_version(struct nfs_subversion *); void register_nfs_version(struct nfs_subversion *); void unregister_nfs_version(struct nfs_subversion *); -- cgit v1.2.3 From 3c91e4b7ae902bd5c05c14ff6fe74acd0bdd237a Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 1 Oct 2024 16:33:43 -0400 Subject: NFS: Clean up find_nfs_version() It's good practice to check the return value of request_module() to see if the module has been found. It's also a little easier to follow the code if __find_nfs_version() doesn't attempt to convert NULL pointers into -EPROTONOSUPPORT. Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 27f862490f82..4c94fe419c40 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -87,9 +87,6 @@ static struct nfs_subversion *__find_nfs_version(unsigned int version) read_lock(&nfs_version_lock); nfs = nfs_version_mods[version]; read_unlock(&nfs_version_lock); - - if (nfs == NULL) - return ERR_PTR(-EPROTONOSUPPORT); return nfs; } @@ -97,13 +94,15 @@ struct nfs_subversion *find_nfs_version(unsigned int version) { struct nfs_subversion *nfs = __find_nfs_version(version); - if (IS_ERR(nfs)) { - request_module("nfsv%d", version); + if (!nfs && request_module("nfsv%d", version) == 0) nfs = __find_nfs_version(version); - } - if (!IS_ERR(nfs) && !try_module_get(nfs->owner)) + if (!nfs) + return ERR_PTR(-EPROTONOSUPPORT); + + if (!try_module_get(nfs->owner)) return ERR_PTR(-EAGAIN); + return nfs; } -- cgit v1.2.3 From 288d7224db0c2a85bda4e2227fad3f6eb89e2874 Mon Sep 17 00:00:00 2001 From: Anna Schumaker Date: Tue, 1 Oct 2024 16:33:44 -0400 Subject: NFS: Implement get_nfs_version() This is a pair for put_nfs_version(), and is used for incrementing the reference count on the nfs version module. I also updated the callers I could find who had this hardcoded up until now. Signed-off-by: Anna Schumaker Signed-off-by: Trond Myklebust --- fs/nfs/client.c | 10 ++++++++-- fs/nfs/fs_context.c | 4 ++-- fs/nfs/namespace.c | 2 +- fs/nfs/nfs.h | 1 + 4 files changed, 12 insertions(+), 5 deletions(-) diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 4c94fe419c40..550ca934c9cf 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -100,12 +100,18 @@ struct nfs_subversion *find_nfs_version(unsigned int version) if (!nfs) return ERR_PTR(-EPROTONOSUPPORT); - if (!try_module_get(nfs->owner)) + if (!get_nfs_version(nfs)) return ERR_PTR(-EAGAIN); return nfs; } +int get_nfs_version(struct nfs_subversion *nfs) +{ + return try_module_get(nfs->owner); +} +EXPORT_SYMBOL_GPL(get_nfs_version); + void put_nfs_version(struct nfs_subversion *nfs) { module_put(nfs->owner); @@ -149,7 +155,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) clp->cl_minorversion = cl_init->minorversion; clp->cl_nfs_mod = cl_init->nfs_mod; - if (!try_module_get(clp->cl_nfs_mod->owner)) + if (!get_nfs_version(clp->cl_nfs_mod)) goto error_dealloc; clp->rpc_ops = clp->cl_nfs_mod->rpc_ops; diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index d553daa4c09c..b069385eea17 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -1541,7 +1541,7 @@ static int nfs_fs_context_dup(struct fs_context *fc, struct fs_context *src_fc) } nfs_copy_fh(ctx->mntfh, src->mntfh); - __module_get(ctx->nfs_mod->owner); + get_nfs_version(ctx->nfs_mod); ctx->client_address = NULL; ctx->mount_server.hostname = NULL; ctx->nfs_server.export_path = NULL; @@ -1633,7 +1633,7 @@ static int nfs_init_fs_context(struct fs_context *fc) } ctx->nfs_mod = nfss->nfs_client->cl_nfs_mod; - __module_get(ctx->nfs_mod->owner); + get_nfs_version(ctx->nfs_mod); } else { /* defaults */ ctx->timeo = NFS_UNSPEC_TIMEO; diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index e7494cdd957e..2d53574da605 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -182,7 +182,7 @@ struct vfsmount *nfs_d_automount(struct path *path) ctx->version = client->rpc_ops->version; ctx->minorversion = client->cl_minorversion; ctx->nfs_mod = client->cl_nfs_mod; - __module_get(ctx->nfs_mod->owner); + get_nfs_version(ctx->nfs_mod); ret = client->rpc_ops->submount(fc, server); if (ret < 0) { diff --git a/fs/nfs/nfs.h b/fs/nfs/nfs.h index a30bf8ef79d7..8a5f51be013a 100644 --- a/fs/nfs/nfs.h +++ b/fs/nfs/nfs.h @@ -22,6 +22,7 @@ struct nfs_subversion { }; struct nfs_subversion *find_nfs_version(unsigned int); +int get_nfs_version(struct nfs_subversion *); void put_nfs_version(struct nfs_subversion *); void register_nfs_version(struct nfs_subversion *); void unregister_nfs_version(struct nfs_subversion *); -- cgit v1.2.3 From fb4e525da1c12d1f7aeff94797385937fd89f40b Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 3 Oct 2024 15:35:01 -0400 Subject: nfs/localio: remove redundant suid/sgid handling nfs_writeback_done() will take care of suid/sgid corner case. Signed-off-by: Mike Snitzer Signed-off-by: Trond Myklebust --- fs/nfs/localio.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 8f0ce82a677e..8d27a55209fc 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -521,12 +521,7 @@ nfs_local_write_done(struct nfs_local_kiocb *iocb, long status) } if (status < 0) nfs_reset_boot_verifier(inode); - else if (nfs_should_remove_suid(inode)) { - /* Deal with the suid/sgid bit corner case */ - spin_lock(&inode->i_lock); - nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE); - spin_unlock(&inode->i_lock); - } + nfs_local_pgio_done(hdr, status); } -- cgit v1.2.3 From 894f5c5593cdb57841318597a800ad1d3cb45a52 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 3 Oct 2024 15:35:02 -0400 Subject: nfs/localio: eliminate unnecessary kref in nfs_local_fsync_ctx nfs_local_commit() doesn't need async cleanup of nfs_local_fsync_ctx, so there is no need to use a kref. Signed-off-by: Mike Snitzer Signed-off-by: Trond Myklebust --- fs/nfs/localio.c | 20 +++----------------- 1 file changed, 3 insertions(+), 17 deletions(-) diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 8d27a55209fc..153e00e4dd90 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -42,7 +42,6 @@ struct nfs_local_fsync_ctx { struct nfsd_file *localio; struct nfs_commit_data *data; struct work_struct work; - struct kref kref; struct completion *done; }; static void nfs_local_fsync_work(struct work_struct *work); @@ -683,30 +682,17 @@ nfs_local_fsync_ctx_alloc(struct nfs_commit_data *data, ctx->localio = localio; ctx->data = data; INIT_WORK(&ctx->work, nfs_local_fsync_work); - kref_init(&ctx->kref); ctx->done = NULL; } return ctx; } -static void -nfs_local_fsync_ctx_kref_free(struct kref *kref) -{ - kfree(container_of(kref, struct nfs_local_fsync_ctx, kref)); -} - -static void -nfs_local_fsync_ctx_put(struct nfs_local_fsync_ctx *ctx) -{ - kref_put(&ctx->kref, nfs_local_fsync_ctx_kref_free); -} - static void nfs_local_fsync_ctx_free(struct nfs_local_fsync_ctx *ctx) { nfs_local_release_commit_data(ctx->localio, ctx->data, ctx->data->task.tk_ops); - nfs_local_fsync_ctx_put(ctx); + kfree(ctx); } static void @@ -739,7 +725,7 @@ int nfs_local_commit(struct nfsd_file *localio, } nfs_local_init_commit(data, call_ops); - kref_get(&ctx->kref); + if (how & FLUSH_SYNC) { DECLARE_COMPLETION_ONSTACK(done); ctx->done = &done; @@ -747,6 +733,6 @@ int nfs_local_commit(struct nfsd_file *localio, wait_for_completion(&done); } else queue_work(nfsiod_workqueue, &ctx->work); - nfs_local_fsync_ctx_put(ctx); + return 0; } -- cgit v1.2.3 From 0978e5b85fc0867f53c5f4e5b7d2a5536a623e16 Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 3 Oct 2024 15:35:03 -0400 Subject: nfs/localio: remove extra indirect nfs_to call to check {read,write}_iter Push the read_iter and write_iter availability checks down to nfs_do_local_read and nfs_do_local_write respectively. This eliminates a redundant nfs_to->nfsd_file_file() call. Signed-off-by: Mike Snitzer Signed-off-by: Trond Myklebust --- fs/nfs/localio.c | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 153e00e4dd90..23f6d2a372dd 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -273,7 +273,7 @@ nfs_local_iocb_free(struct nfs_local_kiocb *iocb) static struct nfs_local_kiocb * nfs_local_iocb_alloc(struct nfs_pgio_header *hdr, - struct nfsd_file *localio, gfp_t flags) + struct file *file, gfp_t flags) { struct nfs_local_kiocb *iocb; @@ -286,9 +286,8 @@ nfs_local_iocb_alloc(struct nfs_pgio_header *hdr, kfree(iocb); return NULL; } - init_sync_kiocb(&iocb->kiocb, nfs_to->nfsd_file_file(localio)); + init_sync_kiocb(&iocb->kiocb, file); iocb->kiocb.ki_pos = hdr->args.offset; - iocb->localio = localio; iocb->hdr = hdr; iocb->kiocb.ki_flags &= ~IOCB_APPEND; return iocb; @@ -389,13 +388,19 @@ nfs_do_local_read(struct nfs_pgio_header *hdr, const struct rpc_call_ops *call_ops) { struct nfs_local_kiocb *iocb; + struct file *file = nfs_to->nfsd_file_file(localio); + + /* Don't support filesystems without read_iter */ + if (!file->f_op->read_iter) + return -EAGAIN; dprintk("%s: vfs_read count=%u pos=%llu\n", __func__, hdr->args.count, hdr->args.offset); - iocb = nfs_local_iocb_alloc(hdr, localio, GFP_KERNEL); + iocb = nfs_local_iocb_alloc(hdr, file, GFP_KERNEL); if (iocb == NULL) return -ENOMEM; + iocb->localio = localio; nfs_local_pgio_init(hdr, call_ops); hdr->res.eof = false; @@ -558,14 +563,20 @@ nfs_do_local_write(struct nfs_pgio_header *hdr, const struct rpc_call_ops *call_ops) { struct nfs_local_kiocb *iocb; + struct file *file = nfs_to->nfsd_file_file(localio); + + /* Don't support filesystems without write_iter */ + if (!file->f_op->write_iter) + return -EAGAIN; dprintk("%s: vfs_write count=%u pos=%llu %s\n", __func__, hdr->args.count, hdr->args.offset, (hdr->args.stable == NFS_UNSTABLE) ? "unstable" : "stable"); - iocb = nfs_local_iocb_alloc(hdr, localio, GFP_NOIO); + iocb = nfs_local_iocb_alloc(hdr, file, GFP_NOIO); if (iocb == NULL) return -ENOMEM; + iocb->localio = localio; switch (hdr->args.stable) { default: @@ -591,16 +602,9 @@ int nfs_local_doio(struct nfs_client *clp, struct nfsd_file *localio, const struct rpc_call_ops *call_ops) { int status = 0; - struct file *filp = nfs_to->nfsd_file_file(localio); if (!hdr->args.count) return 0; - /* Don't support filesystems without read_iter/write_iter */ - if (!filp->f_op->read_iter || !filp->f_op->write_iter) { - nfs_local_disable(clp); - status = -EAGAIN; - goto out; - } switch (hdr->rw_mode) { case FMODE_READ: @@ -614,8 +618,10 @@ int nfs_local_doio(struct nfs_client *clp, struct nfsd_file *localio, hdr->rw_mode); status = -EINVAL; } -out: + if (status != 0) { + if (status == -EAGAIN) + nfs_local_disable(clp); nfs_to_nfsd_file_put_local(localio); hdr->task.tk_status = status; nfs_local_hdr_release(hdr, call_ops); -- cgit v1.2.3 From 79a66e1465561f7baa3ff3daf79800fc241afeeb Mon Sep 17 00:00:00 2001 From: Mike Snitzer Date: Thu, 3 Oct 2024 15:35:04 -0400 Subject: nfs/localio: eliminate need for nfs_local_fsync_work forward declaration Move nfs_local_fsync_ctx_alloc() after nfs_local_fsync_work(). Signed-off-by: Mike Snitzer Signed-off-by: Trond Myklebust --- fs/nfs/localio.c | 31 +++++++++++++++---------------- 1 file changed, 15 insertions(+), 16 deletions(-) diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 23f6d2a372dd..81bb908ab890 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -44,7 +44,6 @@ struct nfs_local_fsync_ctx { struct work_struct work; struct completion *done; }; -static void nfs_local_fsync_work(struct work_struct *work); static bool localio_enabled __read_mostly = true; module_param(localio_enabled, bool, 0644); @@ -678,21 +677,6 @@ nfs_local_release_commit_data(struct nfsd_file *localio, call_ops->rpc_release(data); } -static struct nfs_local_fsync_ctx * -nfs_local_fsync_ctx_alloc(struct nfs_commit_data *data, - struct nfsd_file *localio, gfp_t flags) -{ - struct nfs_local_fsync_ctx *ctx = kmalloc(sizeof(*ctx), flags); - - if (ctx != NULL) { - ctx->localio = localio; - ctx->data = data; - INIT_WORK(&ctx->work, nfs_local_fsync_work); - ctx->done = NULL; - } - return ctx; -} - static void nfs_local_fsync_ctx_free(struct nfs_local_fsync_ctx *ctx) { @@ -717,6 +701,21 @@ nfs_local_fsync_work(struct work_struct *work) nfs_local_fsync_ctx_free(ctx); } +static struct nfs_local_fsync_ctx * +nfs_local_fsync_ctx_alloc(struct nfs_commit_data *data, + struct nfsd_file *localio, gfp_t flags) +{ + struct nfs_local_fsync_ctx *ctx = kmalloc(sizeof(*ctx), flags); + + if (ctx != NULL) { + ctx->localio = localio; + ctx->data = data; + INIT_WORK(&ctx->work, nfs_local_fsync_work); + ctx->done = NULL; + } + return ctx; +} + int nfs_local_commit(struct nfsd_file *localio, struct nfs_commit_data *data, const struct rpc_call_ops *call_ops, int how) -- cgit v1.2.3 From e8e26a0b09f5783be471b5ffb1e31822b1272c1d Mon Sep 17 00:00:00 2001 From: Thorsten Blum Date: Tue, 15 Oct 2024 22:27:31 +0200 Subject: nfs: Annotate struct pnfs_commit_array with __counted_by() Add the __counted_by compiler attribute to the flexible array member buckets to improve access bounds-checking via CONFIG_UBSAN_BOUNDS and CONFIG_FORTIFY_SOURCE. Compile-tested only. Signed-off-by: Thorsten Blum Signed-off-by: Trond Myklebust --- include/linux/nfs_xdr.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 12d8e47bc5a3..559273a0f16d 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1336,7 +1336,7 @@ struct pnfs_commit_array { struct rcu_head rcu; refcount_t refcount; unsigned int nbuckets; - struct pnfs_commit_bucket buckets[]; + struct pnfs_commit_bucket buckets[] __counted_by(nbuckets); }; struct pnfs_ds_commit_info { -- cgit v1.2.3 From 93970b6a143bd9f184ebab37c5862a204fb5690e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 29 Oct 2024 15:21:43 -0400 Subject: sunrpc: remove newlines from tracepoints Tracepoint strings don't require newlines (and in fact, they are undesirable). Signed-off-by: Jeff Layton Acked-by: Chuck Lever Signed-off-by: Trond Myklebust --- include/trace/events/sunrpc.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/include/trace/events/sunrpc.h b/include/trace/events/sunrpc.h index 5e8495216689..b13dc275ef4a 100644 --- a/include/trace/events/sunrpc.h +++ b/include/trace/events/sunrpc.h @@ -719,7 +719,7 @@ TRACE_EVENT(rpc_xdr_overflow, ), TP_printk(SUNRPC_TRACE_TASK_SPECIFIER - " %sv%d %s requested=%zu p=%p end=%p xdr=[%p,%zu]/%u/[%p,%zu]/%u\n", + " %sv%d %s requested=%zu p=%p end=%p xdr=[%p,%zu]/%u/[%p,%zu]/%u", __entry->task_id, __entry->client_id, __get_str(progname), __entry->version, __get_str(procedure), __entry->requested, __entry->p, __entry->end, @@ -777,7 +777,7 @@ TRACE_EVENT(rpc_xdr_alignment, ), TP_printk(SUNRPC_TRACE_TASK_SPECIFIER - " %sv%d %s offset=%zu copied=%u xdr=[%p,%zu]/%u/[%p,%zu]/%u\n", + " %sv%d %s offset=%zu copied=%u xdr=[%p,%zu]/%u/[%p,%zu]/%u", __entry->task_id, __entry->client_id, __get_str(progname), __entry->version, __get_str(procedure), __entry->offset, __entry->copied, -- cgit v1.2.3 From 675d4566e599bab1a225d418bbf7a53100367978 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Mon, 4 Nov 2024 17:11:27 -0500 Subject: SUNRPC: Fix a hang in TLS sock_close if sk_write_pending We've observed an NFS server shrink the TCP window and then reset the TCP connection as part of a HA failover. When the connection has TLS, often the NFS client will hang indefinitely in this stack: wait_woken+0x70/0x80 wait_on_pending_writer+0xe4/0x110 [tls] tls_sk_proto_close+0x368/0x3a0 [tls] inet_release+0x54/0xb0 __sock_release+0x48/0xc8 sock_close+0x20/0x38 __fput+0xe0/0x2f0 __fput_sync+0x58/0x70 xs_reset_transport+0xe8/0x1f8 [sunrpc] xs_tcp_shutdown+0xa4/0x190 [sunrpc] xprt_autoclose+0x68/0x170 [sunrpc] process_one_work+0x180/0x420 worker_thread+0x258/0x368 kthread+0x104/0x118 ret_from_fork+0x10/0x20 This hang prevents the client from closing the socket and reconnecting to the server. Because xs_nospace() elevates sk_write_pending, and sk_sndtimeo is MAX_SCHEDULE_TIMEOUT, tls_sk_proto_close is never able to complete its wait for pending writes to the socket. For this case where we are resetting the transport anyway, we don't expect the socket to ever have write space, so fix this by simply clearing the sock's sndtimeo under the sock's lock. Signed-off-by: Benjamin Coddington Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 1326fbf45a34..d587c261d999 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1278,6 +1278,7 @@ static void xs_reset_transport(struct sock_xprt *transport) transport->file = NULL; sk->sk_user_data = NULL; + sk->sk_sndtimeo = 0; xs_restore_old_callbacks(transport, sk); xprt_clear_connected(xprt); -- cgit v1.2.3 From c968fd23c68e9929ab6cad4faffc8ea603e98e5d Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 8 Nov 2024 11:41:21 -0500 Subject: NFSv4.0: Fix the wake up of the next waiter in nfs_release_seqid() There is no need to wake up another waiter on the seqid list unless the seqid being removed is at the head of the list, and so is relinquishing control of the sequence counter to the next entry. Reviewed-by: Yang Erkun Signed-off-by: Trond Myklebust --- fs/nfs/nfs4state.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c index dafd61186557..9a9f60a2291b 100644 --- a/fs/nfs/nfs4state.c +++ b/fs/nfs/nfs4state.c @@ -1083,14 +1083,12 @@ void nfs_release_seqid(struct nfs_seqid *seqid) return; sequence = seqid->sequence; spin_lock(&sequence->lock); - list_del_init(&seqid->list); - if (!list_empty(&sequence->list)) { - struct nfs_seqid *next; - - next = list_first_entry(&sequence->list, - struct nfs_seqid, list); + if (list_is_first(&seqid->list, &sequence->list) && + !list_is_singular(&sequence->list)) { + struct nfs_seqid *next = list_next_entry(seqid, list); rpc_wake_up_queued_task(&sequence->wait, next->task); } + list_del_init(&seqid->list); spin_unlock(&sequence->lock); } -- cgit v1.2.3 From 2fdb05dc0931250574f0cb0ebeb5ed8e20f4a889 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Fri, 8 Nov 2024 12:13:31 -0500 Subject: NFSv4.0: Fix a use-after-free problem in the asynchronous open() Yang Erkun reports that when two threads are opening files at the same time, and are forced to abort before a reply is seen, then the call to nfs_release_seqid() in nfs4_opendata_free() can result in a use-after-free of the pointer to the defunct rpc task of the other thread. The fix is to ensure that if the RPC call is aborted before the call to nfs_wait_on_sequence() is complete, then we must call nfs_release_seqid() in nfs4_open_release() before the rpc_task is freed. Reported-by: Yang Erkun Fixes: 24ac23ab88df ("NFSv4: Convert open() into an asynchronous RPC call") Reviewed-by: Yang Erkun Signed-off-by: Trond Myklebust --- fs/nfs/nfs4proc.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index 9d40319e063d..405f17e6e0b4 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -2603,12 +2603,14 @@ static void nfs4_open_release(void *calldata) struct nfs4_opendata *data = calldata; struct nfs4_state *state = NULL; + /* In case of error, no cleanup! */ + if (data->rpc_status != 0 || !data->rpc_done) { + nfs_release_seqid(data->o_arg.seqid); + goto out_free; + } /* If this request hasn't been cancelled, do nothing */ if (!data->cancelled) goto out_free; - /* In case of error, no cleanup! */ - if (data->rpc_status != 0 || !data->rpc_done) - goto out_free; /* In case we need an open_confirm, no cleanup! */ if (data->o_res.rflags & NFS4_OPEN_RESULT_CONFIRM) goto out_free; -- cgit v1.2.3 From 650703bc4ed3edf841e851c99ab8e7ba9e5262a3 Mon Sep 17 00:00:00 2001 From: NeilBrown Date: Fri, 8 Nov 2024 18:39:44 -0500 Subject: nfs/localio: must clear res.replen in nfs_local_read_done Otherwise memory corruption can occur due to NFSv3 LOCALIO reads leaving garbage in res.replen: - nfs3_read_done() copies that into server->read_hdrsize; from there nfs3_proc_read_setup() copies it to args.replen in new requests. - nfs3_xdr_enc_read3args() passes that to rpc_prepare_reply_pages() which includes it in hdrsize for xdr_init_pages, so that rq_rcv_buf contains a ridiculous len. - This is copied to rq_private_buf and xs_read_stream_request() eventually passes the kvec to sock_recvmsg() which receives incoming data into entirely the wrong place. This is easily reproduced with NFSv3 LOCALIO that is servicing reads when it is made to pivot back to using normal RPC. This switch back to using normal NFSv3 with RPC can occur for a few reasons but this issue was exposed with a test that stops and then restarts the NFSv3 server while LOCALIO is performing heavy read IO. Fixes: 70ba381e1a43 ("nfs: add LOCALIO support") Reported-by: Mike Snitzer Signed-off-by: NeilBrown Co-developed-by: Mike Snitzer Signed-off-by: Mike Snitzer Signed-off-by: Trond Myklebust --- fs/nfs/localio.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 81bb908ab890..4b8618cf114c 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -351,6 +351,12 @@ nfs_local_read_done(struct nfs_local_kiocb *iocb, long status) nfs_local_pgio_done(hdr, status); + /* + * Must clear replen otherwise NFSv3 data corruption will occur + * if/when switching from LOCALIO back to using normal RPC. + */ + hdr->res.replen = 0; + if (hdr->res.count != hdr->args.count || hdr->args.offset + hdr->res.count >= i_size_read(file_inode(filp))) hdr->res.eof = true; -- cgit v1.2.3 From 8f52caf9d231e77412766b48e5630a647e5ef774 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 4 Nov 2024 21:01:43 -0500 Subject: Revert "fs: nfs: fix missing refcnt by replacing folio_set_private by folio_attach_private" This reverts commit 03e02b94171b1985dd0aa184296fe94425b855a3. As was pointed out during code review, there is no need to use folio_attach_private()/folio_detach_private() when a refcount to the folio is already carried by the struct nfs_page. Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index ead2dc55952d..2da00987d9ed 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -772,7 +772,8 @@ static void nfs_inode_add_request(struct nfs_page *req) nfs_lock_request(req); spin_lock(&mapping->i_private_lock); set_bit(PG_MAPPED, &req->wb_flags); - folio_attach_private(folio, req); + folio_set_private(folio); + folio->private = req; spin_unlock(&mapping->i_private_lock); atomic_long_inc(&nfsi->nrequests); /* this a head request for a page group - mark it as having an @@ -796,7 +797,8 @@ static void nfs_inode_remove_request(struct nfs_page *req) spin_lock(&mapping->i_private_lock); if (likely(folio)) { - folio_detach_private(folio); + folio->private = NULL; + folio_clear_private(folio); clear_bit(PG_MAPPED, &req->wb_head->wb_flags); } spin_unlock(&mapping->i_private_lock); -- cgit v1.2.3 From 66f9dac9077c9c063552e465212abeb8f97d28a7 Mon Sep 17 00:00:00 2001 From: Trond Myklebust Date: Mon, 4 Nov 2024 21:09:21 -0500 Subject: Revert "nfs: don't reuse partially completed requests in nfs_lock_and_join_requests" This reverts commit b571cfcb9dcac187c6d967987792d37cb0688610. This patch appears to assume that if one request is complete, then the others will complete too before unlocking. That is not a valid assumption, since other requests could hit a non-fatal error or a short write that would cause them not to complete. Reported-by: Igor Raits Link: https://bugzilla.kernel.org/show_bug.cgi?id=219508 Fixes: b571cfcb9dca ("nfs: don't reuse partially completed requests in nfs_lock_and_join_requests") Signed-off-by: Trond Myklebust --- fs/nfs/write.c | 49 +++++++++++++++++++++++++++++-------------------- 1 file changed, 29 insertions(+), 20 deletions(-) diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 2da00987d9ed..50fa539611f5 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -144,6 +144,31 @@ static void nfs_io_completion_put(struct nfs_io_completion *ioc) kref_put(&ioc->refcount, nfs_io_completion_release); } +static void +nfs_page_set_inode_ref(struct nfs_page *req, struct inode *inode) +{ + if (!test_and_set_bit(PG_INODE_REF, &req->wb_flags)) { + kref_get(&req->wb_kref); + atomic_long_inc(&NFS_I(inode)->nrequests); + } +} + +static int +nfs_cancel_remove_inode(struct nfs_page *req, struct inode *inode) +{ + int ret; + + if (!test_bit(PG_REMOVE, &req->wb_flags)) + return 0; + ret = nfs_page_group_lock(req); + if (ret) + return ret; + if (test_and_clear_bit(PG_REMOVE, &req->wb_flags)) + nfs_page_set_inode_ref(req, inode); + nfs_page_group_unlock(req); + return 0; +} + /** * nfs_folio_find_head_request - find head request associated with a folio * @folio: pointer to folio @@ -540,7 +565,6 @@ static struct nfs_page *nfs_lock_and_join_requests(struct folio *folio) struct inode *inode = folio->mapping->host; struct nfs_page *head, *subreq; struct nfs_commit_info cinfo; - bool removed; int ret; /* @@ -565,18 +589,18 @@ retry: goto retry; } - ret = nfs_page_group_lock(head); + ret = nfs_cancel_remove_inode(head, inode); if (ret < 0) goto out_unlock; - removed = test_bit(PG_REMOVE, &head->wb_flags); + ret = nfs_page_group_lock(head); + if (ret < 0) + goto out_unlock; /* lock each request in the page group */ for (subreq = head->wb_this_page; subreq != head; subreq = subreq->wb_this_page) { - if (test_bit(PG_REMOVE, &subreq->wb_flags)) - removed = true; ret = nfs_page_group_lock_subreq(head, subreq); if (ret < 0) goto out_unlock; @@ -584,21 +608,6 @@ retry: nfs_page_group_unlock(head); - /* - * If PG_REMOVE is set on any request, I/O on that request has - * completed, but some requests were still under I/O at the time - * we locked the head request. - * - * In that case the above wait for all requests means that all I/O - * has now finished, and we can restart from a clean slate. Let the - * old requests go away and start from scratch instead. - */ - if (removed) { - nfs_unroll_locks(head, head); - nfs_unlock_and_release_request(head); - goto retry; - } - nfs_init_cinfo_from_inode(&cinfo, inode); nfs_join_page_group(head, &cinfo, inode); return head; -- cgit v1.2.3 From 52cb7f8f177878b4f22397b9c4d2c8f743766be3 Mon Sep 17 00:00:00 2001 From: Li Lingfeng Date: Thu, 14 Nov 2024 12:53:03 +0800 Subject: nfs: ignore SB_RDONLY when mounting nfs When exporting only one file system with fsid=0 on the server side, the client alternately uses the ro/rw mount options to perform the mount operation, and a new vfsmount is generated each time. It can be reproduced as follows: [root@localhost ~]# mount /dev/sda /mnt2 [root@localhost ~]# echo "/mnt2 *(rw,no_root_squash,fsid=0)" >/etc/exports [root@localhost ~]# systemctl restart nfs-server [root@localhost ~]# mount -t nfs -o ro,vers=4 127.0.0.1:/ /mnt/sdaa [root@localhost ~]# mount -t nfs -o rw,vers=4 127.0.0.1:/ /mnt/sdaa [root@localhost ~]# mount -t nfs -o ro,vers=4 127.0.0.1:/ /mnt/sdaa [root@localhost ~]# mount -t nfs -o rw,vers=4 127.0.0.1:/ /mnt/sdaa [root@localhost ~]# mount | grep nfs4 127.0.0.1:/ on /mnt/sdaa type nfs4 (ro,relatime,vers=4.2,rsize=1048576,... 127.0.0.1:/ on /mnt/sdaa type nfs4 (rw,relatime,vers=4.2,rsize=1048576,... 127.0.0.1:/ on /mnt/sdaa type nfs4 (ro,relatime,vers=4.2,rsize=1048576,... 127.0.0.1:/ on /mnt/sdaa type nfs4 (rw,relatime,vers=4.2,rsize=1048576,... [root@localhost ~]# We expected that after mounting with the ro option, using the rw option to mount again would return EBUSY, but the actual situation was not the case. As shown above, when mounting for the first time, a superblock with the ro flag will be generated, and at the same time, in do_new_mount_fc --> do_add_mount, it detects that the superblock corresponding to the current target directory is inconsistent with the currently generated one (path->mnt->mnt_sb != newmnt->mnt.mnt_sb), and a new vfsmount will be generated. When mounting with the rw option for the second time, since no matching superblock can be found in the fs_supers list, a new superblock with the rw flag will be generated again. The superblock in use (ro) is different from the newly generated superblock (rw), and a new vfsmount will be generated again. When mounting with the ro option for the third time, the superblock (ro) is found in fs_supers, the superblock in use (rw) is different from the found superblock (ro), and a new vfsmount will be generated again. We can switch between ro/rw through remount, and only one superblock needs to be generated, thus avoiding the problem of repeated generation of vfsmount caused by switching superblocks. Furthermore, This can also resolve the issue described in the link. Fixes: 275a5d24bf56 ("NFS: Error when mounting the same filesystem with different options") Link: https://lore.kernel.org/all/20240604112636.236517-3-lilingfeng@huaweicloud.com/ Signed-off-by: Li Lingfeng Signed-off-by: Trond Myklebust --- fs/nfs/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 430733e3eff2..6bcc4b0e00ab 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -12,7 +12,7 @@ #include #include -#define NFS_SB_MASK (SB_RDONLY|SB_NOSUID|SB_NODEV|SB_NOEXEC|SB_SYNCHRONOUS) +#define NFS_SB_MASK (SB_NOSUID|SB_NODEV|SB_NOEXEC|SB_SYNCHRONOUS) extern const struct export_operations nfs_export_ops; -- cgit v1.2.3 From 4db9ad82a6c823094da27de4825af693a3475d51 Mon Sep 17 00:00:00 2001 From: Liu Jian Date: Fri, 15 Nov 2024 17:38:04 +0800 Subject: sunrpc: clear XPRT_SOCK_UPD_TIMEOUT when reset transport Since transport->sock has been set to NULL during reset transport, XPRT_SOCK_UPD_TIMEOUT also needs to be cleared. Otherwise, the xs_tcp_set_socket_timeouts() may be triggered in xs_tcp_send_request() to dereference the transport->sock that has been set to NULL. Fixes: 7196dbb02ea0 ("SUNRPC: Allow changing of the TCP timeout parameters on the fly") Signed-off-by: Li Lingfeng Signed-off-by: Liu Jian Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 1 + 1 file changed, 1 insertion(+) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index d587c261d999..31bc046e2850 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1198,6 +1198,7 @@ static void xs_sock_reset_state_flags(struct rpc_xprt *xprt) clear_bit(XPRT_SOCK_WAKE_WRITE, &transport->sock_state); clear_bit(XPRT_SOCK_WAKE_DISCONNECT, &transport->sock_state); clear_bit(XPRT_SOCK_NOSPACE, &transport->sock_state); + clear_bit(XPRT_SOCK_UPD_TIMEOUT, &transport->sock_state); } static void xs_run_error_worker(struct sock_xprt *transport, unsigned int nr) -- cgit v1.2.3 From d7bdd849ef1b681da03ac05ca0957b2cbe2d24b6 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 15 Nov 2024 08:59:36 -0500 Subject: SUNRPC: timeout and cancel TLS handshake with -ETIMEDOUT We've noticed a situation where an unstable TCP connection can cause the TLS handshake to timeout waiting for userspace to complete it. When this happens, we don't want to return from xs_tls_handshake_sync() with zero, as this will cause the upper xprt to be set CONNECTED, and subsequent attempts to transmit will be returned with -EPIPE. The sunrpc machine does not recover from this situation and will spin attempting to transmit. The return value of tls_handshake_cancel() can be used to detect a race with completion: * tls_handshake_cancel - cancel a pending handshake * Return values: * %true - Uncompleted handshake request was canceled * %false - Handshake request already completed or not found If true, we do not want the upper xprt to be connected, so return -ETIMEDOUT. If false, its possible the handshake request was lost and that may be the reason for our timeout. Again we do not want the upper xprt to be connected, so return -ETIMEDOUT. Ensure that we alway return an error from xs_tls_handshake_sync() if we call tls_handshake_cancel(). Signed-off-by: Benjamin Coddington Reviewed-by: Chuck Lever Fixes: 75eb6af7acdf ("SUNRPC: Add a TCP-with-TLS RPC transport class") Signed-off-by: Trond Myklebust --- net/sunrpc/xprtsock.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index 31bc046e2850..fecc8b8fa266 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -2616,11 +2616,10 @@ static int xs_tls_handshake_sync(struct rpc_xprt *lower_xprt, struct xprtsec_par rc = wait_for_completion_interruptible_timeout(&lower_transport->handshake_done, XS_TLS_HANDSHAKE_TO); if (rc <= 0) { - if (!tls_handshake_cancel(sk)) { - if (rc == 0) - rc = -ETIMEDOUT; - goto out_put_xprt; - } + tls_handshake_cancel(sk); + if (rc == 0) + rc = -ETIMEDOUT; + goto out_put_xprt; } rc = lower_transport->xprt_err; -- cgit v1.2.3 From 3f23f96528e8fcf8619895c4c916c52653892ec1 Mon Sep 17 00:00:00 2001 From: Liu Jian Date: Tue, 12 Nov 2024 21:54:34 +0800 Subject: sunrpc: fix one UAF issue caused by sunrpc kernel tcp socket BUG: KASAN: slab-use-after-free in tcp_write_timer_handler+0x156/0x3e0 Read of size 1 at addr ffff888111f322cd by task swapper/0/0 CPU: 0 UID: 0 PID: 0 Comm: swapper/0 Not tainted 6.12.0-rc4-dirty #7 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.15.0-1 Call Trace: dump_stack_lvl+0x68/0xa0 print_address_description.constprop.0+0x2c/0x3d0 print_report+0xb4/0x270 kasan_report+0xbd/0xf0 tcp_write_timer_handler+0x156/0x3e0 tcp_write_timer+0x66/0x170 call_timer_fn+0xfb/0x1d0 __run_timers+0x3f8/0x480 run_timer_softirq+0x9b/0x100 handle_softirqs+0x153/0x390 __irq_exit_rcu+0x103/0x120 irq_exit_rcu+0xe/0x20 sysvec_apic_timer_interrupt+0x76/0x90 asm_sysvec_apic_timer_interrupt+0x1a/0x20 RIP: 0010:default_idle+0xf/0x20 Code: 4c 01 c7 4c 29 c2 e9 72 ff ff ff 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 90 f3 0f 1e fa 66 90 0f 00 2d 33 f8 25 00 fb f4 c3 cc cc cc cc 66 66 2e 0f 1f 84 00 00 00 00 00 90 90 90 90 90 RSP: 0018:ffffffffa2007e28 EFLAGS: 00000242 RAX: 00000000000f3b31 RBX: 1ffffffff4400fc7 RCX: ffffffffa09c3196 RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff9f00590f RBP: 0000000000000000 R08: 0000000000000001 R09: ffffed102360835d R10: ffff88811b041aeb R11: 0000000000000001 R12: 0000000000000000 R13: ffffffffa202d7c0 R14: 0000000000000000 R15: 00000000000147d0 default_idle_call+0x6b/0xa0 cpuidle_idle_call+0x1af/0x1f0 do_idle+0xbc/0x130 cpu_startup_entry+0x33/0x40 rest_init+0x11f/0x210 start_kernel+0x39a/0x420 x86_64_start_reservations+0x18/0x30 x86_64_start_kernel+0x97/0xa0 common_startup_64+0x13e/0x141 Allocated by task 595: kasan_save_stack+0x24/0x50 kasan_save_track+0x14/0x30 __kasan_slab_alloc+0x87/0x90 kmem_cache_alloc_noprof+0x12b/0x3f0 copy_net_ns+0x94/0x380 create_new_namespaces+0x24c/0x500 unshare_nsproxy_namespaces+0x75/0xf0 ksys_unshare+0x24e/0x4f0 __x64_sys_unshare+0x1f/0x30 do_syscall_64+0x70/0x180 entry_SYSCALL_64_after_hwframe+0x76/0x7e Freed by task 100: kasan_save_stack+0x24/0x50 kasan_save_track+0x14/0x30 kasan_save_free_info+0x3b/0x60 __kasan_slab_free+0x54/0x70 kmem_cache_free+0x156/0x5d0 cleanup_net+0x5d3/0x670 process_one_work+0x776/0xa90 worker_thread+0x2e2/0x560 kthread+0x1a8/0x1f0 ret_from_fork+0x34/0x60 ret_from_fork_asm+0x1a/0x30 Reproduction script: mkdir -p /mnt/nfsshare mkdir -p /mnt/nfs/netns_1 mkfs.ext4 /dev/sdb mount /dev/sdb /mnt/nfsshare systemctl restart nfs-server chmod 777 /mnt/nfsshare exportfs -i -o rw,no_root_squash *:/mnt/nfsshare ip netns add netns_1 ip link add name veth_1_peer type veth peer veth_1 ifconfig veth_1_peer 11.11.0.254 up ip link set veth_1 netns netns_1 ip netns exec netns_1 ifconfig veth_1 11.11.0.1 ip netns exec netns_1 /root/iptables -A OUTPUT -d 11.11.0.254 -p tcp \ --tcp-flags FIN FIN -j DROP (note: In my environment, a DESTROY_CLIENTID operation is always sent immediately, breaking the nfs tcp connection.) ip netns exec netns_1 timeout -s 9 300 mount -t nfs -o proto=tcp,vers=4.1 \ 11.11.0.254:/mnt/nfsshare /mnt/nfs/netns_1 ip netns del netns_1 The reason here is that the tcp socket in netns_1 (nfs side) has been shutdown and closed (done in xs_destroy), but the FIN message (with ack) is discarded, and the nfsd side keeps sending retransmission messages. As a result, when the tcp sock in netns_1 processes the received message, it sends the message (FIN message) in the sending queue, and the tcp timer is re-established. When the network namespace is deleted, the net structure accessed by tcp's timer handler function causes problems. To fix this problem, let's hold netns refcnt for the tcp kernel socket as done in other modules. This is an ugly hack which can easily be backported to earlier kernels. A proper fix which cleans up the interfaces will follow, but may not be so easy to backport. Fixes: 26abe14379f8 ("net: Modify sk_alloc to not reference count the netns of kernel sockets.") Signed-off-by: Liu Jian Acked-by: Jeff Layton Reviewed-by: Kuniyuki Iwashima Signed-off-by: Trond Myklebust --- net/sunrpc/svcsock.c | 4 ++++ net/sunrpc/xprtsock.c | 7 +++++++ 2 files changed, 11 insertions(+) diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c index 825ec5357691..59e2c46240f5 100644 --- a/net/sunrpc/svcsock.c +++ b/net/sunrpc/svcsock.c @@ -1551,6 +1551,10 @@ static struct svc_xprt *svc_create_socket(struct svc_serv *serv, newlen = error; if (protocol == IPPROTO_TCP) { + __netns_tracker_free(net, &sock->sk->ns_tracker, false); + sock->sk->sk_net_refcnt = 1; + get_net_track(net, &sock->sk->ns_tracker, GFP_KERNEL); + sock_inuse_add(net, 1); if ((error = kernel_listen(sock, 64)) < 0) goto bummer; } diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c index fecc8b8fa266..c60936d8cef7 100644 --- a/net/sunrpc/xprtsock.c +++ b/net/sunrpc/xprtsock.c @@ -1941,6 +1941,13 @@ static struct socket *xs_create_sock(struct rpc_xprt *xprt, goto out; } + if (protocol == IPPROTO_TCP) { + __netns_tracker_free(xprt->xprt_net, &sock->sk->ns_tracker, false); + sock->sk->sk_net_refcnt = 1; + get_net_track(xprt->xprt_net, &sock->sk->ns_tracker, GFP_KERNEL); + sock_inuse_add(xprt->xprt_net, 1); + } + filp = sock_alloc_file(sock, O_NONBLOCK, NULL); if (IS_ERR(filp)) return ERR_CAST(filp); -- cgit v1.2.3 From 3a4ce14d9a6b868e0787e4582420b721c04ee41e Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 22 Nov 2024 10:11:11 -0500 Subject: nfs/blocklayout: Don't attempt unregister for invalid block device Since commit d869da91cccb ("nfs/blocklayout: Fix premature PR key unregistration") an unmount of a pNFS SCSI layout-enabled NFS may dereference a NULL block_device in: bl_unregister_scsi+0x16/0xe0 [blocklayoutdriver] bl_free_device+0x70/0x80 [blocklayoutdriver] bl_free_deviceid_node+0x12/0x30 [blocklayoutdriver] nfs4_put_deviceid_node+0x60/0xc0 [nfsv4] nfs4_deviceid_purge_client+0x132/0x190 [nfsv4] unset_pnfs_layoutdriver+0x59/0x60 [nfsv4] nfs4_destroy_server+0x36/0x70 [nfsv4] nfs_free_server+0x23/0xe0 [nfs] deactivate_locked_super+0x30/0xb0 cleanup_mnt+0xba/0x150 task_work_run+0x59/0x90 syscall_exit_to_user_mode+0x217/0x220 do_syscall_64+0x8e/0x160 This happens because even though we were able to create the nfs4_deviceid_node, the lookup for the device was unable to attach the block device to the pnfs_block_dev. If we never found a block device to register, we can avoid this case with the PNFS_BDEV_REGISTERED flag. Move the deref behind the test for the flag. Fixes: d869da91cccb ("nfs/blocklayout: Fix premature PR key unregistration") Signed-off-by: Benjamin Coddington Reviewed-by: Christoph Hellwig Reviewed-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/dev.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/fs/nfs/blocklayout/dev.c b/fs/nfs/blocklayout/dev.c index 6252f4447945..cab8809f0e0f 100644 --- a/fs/nfs/blocklayout/dev.c +++ b/fs/nfs/blocklayout/dev.c @@ -20,9 +20,6 @@ static void bl_unregister_scsi(struct pnfs_block_dev *dev) const struct pr_ops *ops = bdev->bd_disk->fops->pr_ops; int status; - if (!test_and_clear_bit(PNFS_BDEV_REGISTERED, &dev->flags)) - return; - status = ops->pr_register(bdev, dev->pr_key, 0, false); if (status) trace_bl_pr_key_unreg_err(bdev, dev->pr_key, status); @@ -58,7 +55,8 @@ static void bl_unregister_dev(struct pnfs_block_dev *dev) return; } - if (dev->type == PNFS_BLOCK_VOLUME_SCSI) + if (dev->type == PNFS_BLOCK_VOLUME_SCSI && + test_and_clear_bit(PNFS_BDEV_REGISTERED, &dev->flags)) bl_unregister_scsi(dev); } -- cgit v1.2.3 From 614733f9441ed53bb442d4734112ec1e24bd6da7 Mon Sep 17 00:00:00 2001 From: Benjamin Coddington Date: Fri, 22 Nov 2024 10:11:12 -0500 Subject: nfs/blocklayout: Limit repeat device registration on failure Every pNFS SCSI IO wants to do LAYOUTGET, then within the layout find the device which can drive GETDEVINFO, then finally may need to prep the device with a reservation. This slow work makes a mess of IO latencies if one of the later steps is going to fail for awhile. If we're unable to register a SCSI device, ensure we mark the device as unavailable so that it will timeout and be re-added via GETDEVINFO. This avoids repeated doomed attempts to register a device in the IO path. Add some clarifying comments as well. Fixes: d869da91cccb ("nfs/blocklayout: Fix premature PR key unregistration") Signed-off-by: Benjamin Coddington Reviewed-by: Christoph Hellwig Reviewed-by: Chuck Lever Signed-off-by: Trond Myklebust --- fs/nfs/blocklayout/blocklayout.c | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/fs/nfs/blocklayout/blocklayout.c b/fs/nfs/blocklayout/blocklayout.c index 0becdec12970..47189476b553 100644 --- a/fs/nfs/blocklayout/blocklayout.c +++ b/fs/nfs/blocklayout/blocklayout.c @@ -571,19 +571,32 @@ retry: if (!node) return ERR_PTR(-ENODEV); + /* + * Devices that are marked unavailable are left in the cache with a + * timeout to avoid sending GETDEVINFO after every LAYOUTGET, or + * constantly attempting to register the device. Once marked as + * unavailable they must be deleted and never reused. + */ if (test_bit(NFS_DEVICEID_UNAVAILABLE, &node->flags)) { unsigned long end = jiffies; unsigned long start = end - PNFS_DEVICE_RETRY_TIMEOUT; if (!time_in_range(node->timestamp_unavailable, start, end)) { + /* Uncork subsequent GETDEVINFO operations for this device */ nfs4_delete_deviceid(node->ld, node->nfs_client, id); goto retry; } goto out_put; } - if (!bl_register_dev(container_of(node, struct pnfs_block_dev, node))) + if (!bl_register_dev(container_of(node, struct pnfs_block_dev, node))) { + /* + * If we cannot register, treat this device as transient: + * Make a negative cache entry for the device + */ + nfs4_mark_deviceid_unavailable(node); goto out_put; + } return node; -- cgit v1.2.3 From 38a125b31504f91bf6fdd3cfc3a3e9a721e6c97a Mon Sep 17 00:00:00 2001 From: Max Kellermann Date: Thu, 21 Nov 2024 14:53:51 +0100 Subject: fs/nfs/io: make nfs_start_io_*() killable This allows killing processes that wait for a lock when one process is stuck waiting for the NFS server. This aims to complete the coverage of NFS operations being killable, like nfs_direct_wait() does, for example. Signed-off-by: Max Kellermann Signed-off-by: Trond Myklebust --- fs/nfs/direct.c | 21 ++++++++++++++++++--- fs/nfs/file.c | 14 +++++++++++--- fs/nfs/internal.h | 7 ++++--- fs/nfs/io.c | 44 +++++++++++++++++++++++++++++++++----------- 4 files changed, 66 insertions(+), 20 deletions(-) diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c index 90079ca134dd..b08dbe96bc57 100644 --- a/fs/nfs/direct.c +++ b/fs/nfs/direct.c @@ -454,8 +454,16 @@ ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter, if (user_backed_iter(iter)) dreq->flags = NFS_ODIRECT_SHOULD_DIRTY; - if (!swap) - nfs_start_io_direct(inode); + if (!swap) { + result = nfs_start_io_direct(inode); + if (result) { + /* release the reference that would usually be + * consumed by nfs_direct_read_schedule_iovec() + */ + nfs_direct_req_release(dreq); + goto out_release; + } + } NFS_I(inode)->read_io += count; requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos); @@ -1007,7 +1015,14 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter, requested = nfs_direct_write_schedule_iovec(dreq, iter, pos, FLUSH_STABLE); } else { - nfs_start_io_direct(inode); + result = nfs_start_io_direct(inode); + if (result) { + /* release the reference that would usually be + * consumed by nfs_direct_write_schedule_iovec() + */ + nfs_direct_req_release(dreq); + goto out_release; + } requested = nfs_direct_write_schedule_iovec(dreq, iter, pos, FLUSH_COND_STABLE); diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 6800ee92d742..1bb646752e46 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -166,7 +166,10 @@ nfs_file_read(struct kiocb *iocb, struct iov_iter *to) iocb->ki_filp, iov_iter_count(to), (unsigned long) iocb->ki_pos); - nfs_start_io_read(inode); + result = nfs_start_io_read(inode); + if (result) + return result; + result = nfs_revalidate_mapping(inode, iocb->ki_filp->f_mapping); if (!result) { result = generic_file_read_iter(iocb, to); @@ -187,7 +190,10 @@ nfs_file_splice_read(struct file *in, loff_t *ppos, struct pipe_inode_info *pipe dprintk("NFS: splice_read(%pD2, %zu@%llu)\n", in, len, *ppos); - nfs_start_io_read(inode); + result = nfs_start_io_read(inode); + if (result) + return result; + result = nfs_revalidate_mapping(inode, in->f_mapping); if (!result) { result = filemap_splice_read(in, ppos, pipe, len, flags); @@ -668,7 +674,9 @@ ssize_t nfs_file_write(struct kiocb *iocb, struct iov_iter *from) nfs_clear_invalid_mapping(file->f_mapping); since = filemap_sample_wb_err(file->f_mapping); - nfs_start_io_write(inode); + error = nfs_start_io_write(inode); + if (error) + return error; result = generic_write_checks(iocb, from); if (result > 0) result = generic_perform_write(iocb, from); diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 6bcc4b0e00ab..e564bd11ba60 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -6,6 +6,7 @@ #include "nfs4_fs.h" #include #include +#include #include #include #include @@ -516,11 +517,11 @@ extern const struct netfs_request_ops nfs_netfs_ops; #endif /* io.c */ -extern void nfs_start_io_read(struct inode *inode); +extern __must_check int nfs_start_io_read(struct inode *inode); extern void nfs_end_io_read(struct inode *inode); -extern void nfs_start_io_write(struct inode *inode); +extern __must_check int nfs_start_io_write(struct inode *inode); extern void nfs_end_io_write(struct inode *inode); -extern void nfs_start_io_direct(struct inode *inode); +extern __must_check int nfs_start_io_direct(struct inode *inode); extern void nfs_end_io_direct(struct inode *inode); static inline bool nfs_file_io_is_buffered(struct nfs_inode *nfsi) diff --git a/fs/nfs/io.c b/fs/nfs/io.c index b5551ed8f648..3388faf2acb9 100644 --- a/fs/nfs/io.c +++ b/fs/nfs/io.c @@ -39,19 +39,28 @@ static void nfs_block_o_direct(struct nfs_inode *nfsi, struct inode *inode) * Note that buffered writes and truncates both take a write lock on * inode->i_rwsem, meaning that those are serialised w.r.t. the reads. */ -void +int nfs_start_io_read(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); + int err; + /* Be an optimist! */ - down_read(&inode->i_rwsem); + err = down_read_killable(&inode->i_rwsem); + if (err) + return err; if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) == 0) - return; + return 0; up_read(&inode->i_rwsem); + /* Slow path.... */ - down_write(&inode->i_rwsem); + err = down_write_killable(&inode->i_rwsem); + if (err) + return err; nfs_block_o_direct(nfsi, inode); downgrade_write(&inode->i_rwsem); + + return 0; } /** @@ -74,11 +83,15 @@ nfs_end_io_read(struct inode *inode) * Declare that a buffered read operation is about to start, and ensure * that we block all direct I/O. */ -void +int nfs_start_io_write(struct inode *inode) { - down_write(&inode->i_rwsem); - nfs_block_o_direct(NFS_I(inode), inode); + int err; + + err = down_write_killable(&inode->i_rwsem); + if (!err) + nfs_block_o_direct(NFS_I(inode), inode); + return err; } /** @@ -119,19 +132,28 @@ static void nfs_block_buffered(struct nfs_inode *nfsi, struct inode *inode) * Note that buffered writes and truncates both take a write lock on * inode->i_rwsem, meaning that those are serialised w.r.t. O_DIRECT. */ -void +int nfs_start_io_direct(struct inode *inode) { struct nfs_inode *nfsi = NFS_I(inode); + int err; + /* Be an optimist! */ - down_read(&inode->i_rwsem); + err = down_read_killable(&inode->i_rwsem); + if (err) + return err; if (test_bit(NFS_INO_ODIRECT, &nfsi->flags) != 0) - return; + return 0; up_read(&inode->i_rwsem); + /* Slow path.... */ - down_write(&inode->i_rwsem); + err = down_write_killable(&inode->i_rwsem); + if (err) + return err; nfs_block_buffered(nfsi, inode); downgrade_write(&inode->i_rwsem); + + return 0; } /** -- cgit v1.2.3