From 006ff7498fe89bd9dfb891101f02557d5cfcf427 Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Thu, 24 Apr 2025 01:45:05 -0400
Subject: saner calling conventions for ->d_automount()

Currently the calling conventions for ->d_automount() instances have
an odd wart - returned new mount to be attached is expected to have
refcount 2.

That kludge is intended to make sure that mark_mounts_for_expiry() called
before we get around to attaching that new mount to the tree won't decide
to take it out.  finish_automount() drops the extra reference after it's
done with attaching mount to the tree - or drops the reference twice in
case of error.  ->d_automount() instances have rather counterintuitive
boilerplate in them.

There's a much simpler approach: have mark_mounts_for_expiry() skip the
mounts that are yet to be mounted.  And to hell with grabbing/dropping
those extra references.  Makes for simpler correctness analysis, at that...

Reviewed-by: Christian Brauner <brauner@kernel.org>
Reviewed-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Paulo Alcantara (Red Hat) <pc@manguebit.com>
Acked-by: David Howells <dhowells@redhat.com>
Tested-by: David Howells <dhowells@redhat.com>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 Documentation/filesystems/porting.rst | 7 +++++++
 Documentation/filesystems/vfs.rst     | 4 +---
 fs/afs/mntpt.c                        | 1 -
 fs/fuse/dir.c                         | 3 ---
 fs/namespace.c                        | 9 +++------
 fs/nfs/namespace.c                    | 1 -
 fs/smb/client/namespace.c             | 1 -
 kernel/trace/trace.c                  | 2 --
 8 files changed, 11 insertions(+), 17 deletions(-)

diff --git a/Documentation/filesystems/porting.rst b/Documentation/filesystems/porting.rst
index 767b2927c762..749637231773 100644
--- a/Documentation/filesystems/porting.rst
+++ b/Documentation/filesystems/porting.rst
@@ -1203,3 +1203,10 @@ should use d_drop();d_splice_alias() and return the result of the latter.
 If a positive dentry cannot be returned for some reason, in-kernel
 clients such as cachefiles, nfsd, smb/server may not perform ideally but
 will fail-safe.
+
+---
+
+**mandatory**
+
+Calling conventions for ->d_automount() have changed; we should *not* grab
+an extra reference to new mount - it should be returned with refcount 1.
diff --git a/Documentation/filesystems/vfs.rst b/Documentation/filesystems/vfs.rst
index ae79c30b6c0c..cc0a58e96770 100644
--- a/Documentation/filesystems/vfs.rst
+++ b/Documentation/filesystems/vfs.rst
@@ -1411,9 +1411,7 @@ defined:
 
 	If a vfsmount is returned, the caller will attempt to mount it
 	on the mountpoint and will remove the vfsmount from its
-	expiration list in the case of failure.  The vfsmount should be
-	returned with 2 refs on it to prevent automatic expiration - the
-	caller will clean up the additional ref.
+	expiration list in the case of failure.
 
 	This function is only used if DCACHE_NEED_AUTOMOUNT is set on
 	the dentry.  This is set by __d_instantiate() if S_AUTOMOUNT is
diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c
index 45cee6534122..9434a5399f2b 100644
--- a/fs/afs/mntpt.c
+++ b/fs/afs/mntpt.c
@@ -189,7 +189,6 @@ struct vfsmount *afs_d_automount(struct path *path)
 	if (IS_ERR(newmnt))
 		return newmnt;
 
-	mntget(newmnt); /* prevent immediate expiration */
 	mnt_set_expiry(newmnt, &afs_vfsmounts);
 	queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer,
 			   afs_mntpt_expiry_timeout * HZ);
diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c
index 83ac192e7fdd..05d8584fd3b9 100644
--- a/fs/fuse/dir.c
+++ b/fs/fuse/dir.c
@@ -319,9 +319,6 @@ static struct vfsmount *fuse_dentry_automount(struct path *path)
 
 	/* Create the submount */
 	mnt = fc_mount(fsc);
-	if (!IS_ERR(mnt))
-		mntget(mnt);
-
 	put_fs_context(fsc);
 	return mnt;
 }
diff --git a/fs/namespace.c b/fs/namespace.c
index 98a5cd756e9a..018e95fe5459 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -3902,10 +3902,6 @@ int finish_automount(struct vfsmount *m, const struct path *path)
 		return PTR_ERR(m);
 
 	mnt = real_mount(m);
-	/* The new mount record should have at least 2 refs to prevent it being
-	 * expired before we get a chance to add it
-	 */
-	BUG_ON(mnt_get_count(mnt) < 2);
 
 	if (m->mnt_sb == path->mnt->mnt_sb &&
 	    m->mnt_root == dentry) {
@@ -3938,7 +3934,6 @@ int finish_automount(struct vfsmount *m, const struct path *path)
 	unlock_mount(mp);
 	if (unlikely(err))
 		goto discard;
-	mntput(m);
 	return 0;
 
 discard_locked:
@@ -3952,7 +3947,6 @@ discard:
 		namespace_unlock();
 	}
 	mntput(m);
-	mntput(m);
 	return err;
 }
 
@@ -3989,11 +3983,14 @@ void mark_mounts_for_expiry(struct list_head *mounts)
 
 	/* extract from the expiration list every vfsmount that matches the
 	 * following criteria:
+	 * - already mounted
 	 * - only referenced by its parent vfsmount
 	 * - still marked for expiry (marked on the last call here; marks are
 	 *   cleared by mntput())
 	 */
 	list_for_each_entry_safe(mnt, next, mounts, mnt_expire) {
+		if (!is_mounted(&mnt->mnt))
+			continue;
 		if (!xchg(&mnt->mnt_expiry_mark, 1) ||
 			propagate_mount_busy(mnt, 1))
 			continue;
diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c
index 973aed9cc5fe..7f1ec9c67ff2 100644
--- a/fs/nfs/namespace.c
+++ b/fs/nfs/namespace.c
@@ -195,7 +195,6 @@ struct vfsmount *nfs_d_automount(struct path *path)
 	if (IS_ERR(mnt))
 		goto out_fc;
 
-	mntget(mnt); /* prevent immediate expiration */
 	if (timeout <= 0)
 		goto out_fc;
 
diff --git a/fs/smb/client/namespace.c b/fs/smb/client/namespace.c
index e3f9213131c4..778daf11f1db 100644
--- a/fs/smb/client/namespace.c
+++ b/fs/smb/client/namespace.c
@@ -283,7 +283,6 @@ struct vfsmount *cifs_d_automount(struct path *path)
 		return newmnt;
 	}
 
-	mntget(newmnt); /* prevent immediate expiration */
 	mnt_set_expiry(newmnt, &cifs_automount_list);
 	schedule_delayed_work(&cifs_automount_task,
 			      cifs_mountpoint_expiry_timeout);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index 5b8db27fb6ef..e22aacb0028a 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -10088,8 +10088,6 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
 	put_filesystem(type);
 	if (IS_ERR(mnt))
 		return NULL;
-	mntget(mnt);
-
 	return mnt;
 }
 
-- 
cgit v1.2.3


From 2dbf6e0df447d1542f8fd158b17a06d2e8ede15e Mon Sep 17 00:00:00 2001
From: Al Viro <viro@zeniv.linux.org.uk>
Date: Sun, 4 May 2025 21:04:08 -0400
Subject: kill vfs_submount()

The last remaining user of vfs_submount() (tracefs) is easy to convert
to fs_context_for_submount(); do that and bury that thing, along with
SB_SUBMOUNT

Reviewed-by: Jan Kara <jack@suse.cz>
Acked-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Tested-by: Steven Rostedt (Google) <rostedt@goodmis.org>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
---
 fs/namespace.c        | 15 ---------------
 fs/super.c            |  9 +--------
 include/linux/fs.h    |  1 -
 include/linux/mount.h |  3 ---
 kernel/trace/trace.c  | 19 ++++++++++++++++---
 5 files changed, 17 insertions(+), 30 deletions(-)

diff --git a/fs/namespace.c b/fs/namespace.c
index 018e95fe5459..0577a9fb6050 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -1329,21 +1329,6 @@ struct vfsmount *vfs_kern_mount(struct file_system_type *type,
 }
 EXPORT_SYMBOL_GPL(vfs_kern_mount);
 
-struct vfsmount *
-vfs_submount(const struct dentry *mountpoint, struct file_system_type *type,
-	     const char *name, void *data)
-{
-	/* Until it is worked out how to pass the user namespace
-	 * through from the parent mount to the submount don't support
-	 * unprivileged mounts with submounts.
-	 */
-	if (mountpoint->d_sb->s_user_ns != &init_user_ns)
-		return ERR_PTR(-EPERM);
-
-	return vfs_kern_mount(type, SB_SUBMOUNT, name, data);
-}
-EXPORT_SYMBOL_GPL(vfs_submount);
-
 static struct mount *clone_mnt(struct mount *old, struct dentry *root,
 					int flag)
 {
diff --git a/fs/super.c b/fs/super.c
index 97a17f9d9023..1886e4c930e0 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -823,13 +823,6 @@ struct super_block *sget(struct file_system_type *type,
 	struct super_block *old;
 	int err;
 
-	/* We don't yet pass the user namespace of the parent
-	 * mount through to here so always use &init_user_ns
-	 * until that changes.
-	 */
-	if (flags & SB_SUBMOUNT)
-		user_ns = &init_user_ns;
-
 retry:
 	spin_lock(&sb_lock);
 	if (test) {
@@ -849,7 +842,7 @@ retry:
 	}
 	if (!s) {
 		spin_unlock(&sb_lock);
-		s = alloc_super(type, (flags & ~SB_SUBMOUNT), user_ns);
+		s = alloc_super(type, flags, user_ns);
 		if (!s)
 			return ERR_PTR(-ENOMEM);
 		goto retry;
diff --git a/include/linux/fs.h b/include/linux/fs.h
index 016b0fe1536e..515e702d98ae 100644
--- a/include/linux/fs.h
+++ b/include/linux/fs.h
@@ -1240,7 +1240,6 @@ extern int send_sigurg(struct file *file);
 /* These sb flags are internal to the kernel */
 #define SB_DEAD         BIT(21)
 #define SB_DYING        BIT(24)
-#define SB_SUBMOUNT     BIT(26)
 #define SB_FORCE        BIT(27)
 #define SB_NOSEC        BIT(28)
 #define SB_BORN         BIT(29)
diff --git a/include/linux/mount.h b/include/linux/mount.h
index dcc17ce8a959..d4eb90a367af 100644
--- a/include/linux/mount.h
+++ b/include/linux/mount.h
@@ -98,9 +98,6 @@ extern struct vfsmount *vfs_create_mount(struct fs_context *fc);
 extern struct vfsmount *vfs_kern_mount(struct file_system_type *type,
 				      int flags, const char *name,
 				      void *data);
-extern struct vfsmount *vfs_submount(const struct dentry *mountpoint,
-				     struct file_system_type *type,
-				     const char *name, void *data);
 
 extern void mnt_set_expiry(struct vfsmount *mnt, struct list_head *expiry_list);
 extern void mark_mounts_for_expiry(struct list_head *mounts);
diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c
index e22aacb0028a..936a615e8c56 100644
--- a/kernel/trace/trace.c
+++ b/kernel/trace/trace.c
@@ -51,6 +51,7 @@
 #include <linux/workqueue.h>
 #include <linux/sort.h>
 #include <linux/io.h> /* vmap_page_range() */
+#include <linux/fs_context.h>
 
 #include <asm/setup.h> /* COMMAND_LINE_SIZE */
 
@@ -10075,6 +10076,8 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
 {
 	struct vfsmount *mnt;
 	struct file_system_type *type;
+	struct fs_context *fc;
+	int ret;
 
 	/*
 	 * To maintain backward compatibility for tools that mount
@@ -10084,10 +10087,20 @@ static struct vfsmount *trace_automount(struct dentry *mntpt, void *ingore)
 	type = get_fs_type("tracefs");
 	if (!type)
 		return NULL;
-	mnt = vfs_submount(mntpt, type, "tracefs", NULL);
+
+	fc = fs_context_for_submount(type, mntpt);
 	put_filesystem(type);
-	if (IS_ERR(mnt))
-		return NULL;
+	if (IS_ERR(fc))
+		return ERR_CAST(fc);
+
+	ret = vfs_parse_fs_string(fc, "source",
+				  "tracefs", strlen("tracefs"));
+	if (!ret)
+		mnt = fc_mount(fc);
+	else
+		mnt = ERR_PTR(ret);
+
+	put_fs_context(fc);
 	return mnt;
 }
 
-- 
cgit v1.2.3