diff options
author | Kent Overstreet <kent.overstreet@linux.dev> | 2024-09-28 21:31:10 -0400 |
---|---|---|
committer | Kent Overstreet <kent.overstreet@linux.dev> | 2024-10-01 19:40:47 -0400 |
commit | 9ec9b917b3f6cf676226f074fdcdeceebe1a0b29 (patch) | |
tree | 7c71ca81faa20a1e44624f7489fae1264db8e269 | |
parent | 32cb8103ecfacdd5ed8e1eb390221c3f8339de6f (diff) |
vfs: use fast_list for superblock's inode list
Use the new fast_list for super_block.s_inodes.
This gives similar performance to Dave's dlock list approach [1]; lock
contention is now moved to the lru_list locks.
Iteration is now fully lockless - instead we iterate under
rcu_read_lock(), which means we must take care to handle races with removal.
Generally this is already handled - code that iterates over s_inodes
takes i_lock and checks i_state, skipping inodes that are
I_WILL_FREE|I_FREEING. However, code may also check for nonzero
i_sb_list_idx if it wishes to iterate over precisely the inodes that are
on the s_inodes list.
[1]: https://lore.kernel.org/linux-fsdevel/20231206060629.2827226-4-david@fromorbit.com/
Cc: Christian Brauner <brauner@kernel.org>
Cc: Dave Chinner <dchinner@redhat.com>
Cc: Waiman Long <longman@redhat.com>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev>
-rw-r--r-- | block/bdev.c | 32 | ||||
-rw-r--r-- | fs/drop_caches.c | 21 | ||||
-rw-r--r-- | fs/gfs2/ops_fstype.c | 22 | ||||
-rw-r--r-- | fs/inode.c | 60 | ||||
-rw-r--r-- | fs/notify/fsnotify.c | 25 | ||||
-rw-r--r-- | fs/quota/dquot.c | 53 | ||||
-rw-r--r-- | fs/super.c | 9 | ||||
-rw-r--r-- | include/linux/fs.h | 7 | ||||
-rw-r--r-- | security/landlock/fs.c | 41 |
9 files changed, 142 insertions, 128 deletions
diff --git a/block/bdev.c b/block/bdev.c index 738e3c8457e7..210ad4aee905 100644 --- a/block/bdev.c +++ b/block/bdev.c @@ -474,13 +474,19 @@ void bdev_drop(struct block_device *bdev) long nr_blockdev_pages(void) { - struct inode *inode; + struct genradix_iter iter; + void **i; long ret = 0; - spin_lock(&blockdev_superblock->s_inode_list_lock); - list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) + rcu_read_lock(); + genradix_for_each(&blockdev_superblock->s_inodes.items, iter, i) { + struct inode *inode = *((struct inode **) i); + if (!inode) + continue; + ret += inode->i_mapping->nrpages; - spin_unlock(&blockdev_superblock->s_inode_list_lock); + } + rcu_read_unlock(); return ret; } @@ -1216,10 +1222,16 @@ EXPORT_SYMBOL_GPL(bdev_mark_dead); void sync_bdevs(bool wait) { - struct inode *inode, *old_inode = NULL; + struct genradix_iter iter; + void **i; + struct inode *old_inode = NULL; + + rcu_read_lock(); + genradix_for_each(&blockdev_superblock->s_inodes.items, iter, i) { + struct inode *inode = *((struct inode **) i); + if (!inode) + continue; - spin_lock(&blockdev_superblock->s_inode_list_lock); - list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) { struct address_space *mapping = inode->i_mapping; struct block_device *bdev; @@ -1231,7 +1243,7 @@ void sync_bdevs(bool wait) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&blockdev_superblock->s_inode_list_lock); + rcu_read_unlock(); /* * We hold a reference to 'inode' so it couldn't have been * removed from s_inodes list while we dropped the @@ -1260,9 +1272,9 @@ void sync_bdevs(bool wait) } mutex_unlock(&bdev->bd_disk->open_mutex); - spin_lock(&blockdev_superblock->s_inode_list_lock); + rcu_read_lock(); } - spin_unlock(&blockdev_superblock->s_inode_list_lock); + rcu_read_unlock(); iput(old_inode); } diff --git a/fs/drop_caches.c b/fs/drop_caches.c index d45ef541d848..72c59fb22c81 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -18,10 +18,15 @@ int 
sysctl_drop_caches; static void drop_pagecache_sb(struct super_block *sb, void *unused) { - struct inode *inode, *toput_inode = NULL; + struct genradix_iter iter; + void **i; + + rcu_read_lock(); + genradix_for_each(&sb->s_inodes.items, iter, i) { + struct inode *inode = *((struct inode **) i); + if (!inode) + continue; - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); /* * We must skip inodes in unusual state. We may also skip @@ -35,17 +40,15 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&sb->s_inode_list_lock); + rcu_read_unlock(); invalidate_mapping_pages(inode->i_mapping, 0, -1); - iput(toput_inode); - toput_inode = inode; + iput(inode); cond_resched(); - spin_lock(&sb->s_inode_list_lock); + rcu_read_lock(); } - spin_unlock(&sb->s_inode_list_lock); - iput(toput_inode); + rcu_read_unlock(); } int drop_caches_sysctl_handler(const struct ctl_table *table, int write, diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index e83d293c3614..9cb222e68154 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -1736,13 +1736,18 @@ static int gfs2_meta_init_fs_context(struct fs_context *fc) */ static void gfs2_evict_inodes(struct super_block *sb) { - struct inode *inode, *toput_inode = NULL; + struct genradix_iter iter; + void **i; struct gfs2_sbd *sdp = sb->s_fs_info; set_bit(SDF_EVICTING, &sdp->sd_flags); - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + rcu_read_lock(); + genradix_for_each(&sb->s_inodes.items, iter, i) { + struct inode *inode = *((struct inode **) i); + if (!inode) + continue; + spin_lock(&inode->i_lock); if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) && !need_resched()) { @@ -1751,16 +1756,13 @@ static void gfs2_evict_inodes(struct super_block *sb) } atomic_inc(&inode->i_count); spin_unlock(&inode->i_lock); - 
spin_unlock(&sb->s_inode_list_lock); - - iput(toput_inode); - toput_inode = inode; + rcu_read_unlock(); + iput(inode); cond_resched(); - spin_lock(&sb->s_inode_list_lock); + rcu_read_lock(); } - spin_unlock(&sb->s_inode_list_lock); - iput(toput_inode); + rcu_read_unlock(); } static void gfs2_kill_sb(struct super_block *sb) diff --git a/fs/inode.c b/fs/inode.c index 471ae4a31549..1b6a2f5cede4 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -230,8 +230,15 @@ int inode_init_always(struct super_block *sb, struct inode *inode) #endif inode->i_flctx = NULL; - if (unlikely(security_inode_alloc(inode))) + int idx = fast_list_get_idx(&sb->s_inodes); + if (idx < 0) return -ENOMEM; + inode->i_sb_list_idx = idx; + + if (unlikely(security_inode_alloc(inode))) { + fast_list_remove(&sb->s_inodes, idx); + return -ENOMEM; + } this_cpu_inc(nr_inodes); @@ -425,7 +432,6 @@ void inode_init_once(struct inode *inode) INIT_LIST_HEAD(&inode->i_io_list); INIT_LIST_HEAD(&inode->i_wb_list); INIT_LIST_HEAD(&inode->i_lru); - INIT_LIST_HEAD(&inode->i_sb_list); __address_space_init_once(&inode->i_data); i_size_ordered_init(inode); } @@ -540,19 +546,14 @@ static void inode_wait_for_lru_isolating(struct inode *inode) */ void inode_sb_list_add(struct inode *inode) { - spin_lock(&inode->i_sb->s_inode_list_lock); - list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); - spin_unlock(&inode->i_sb->s_inode_list_lock); + *genradix_ptr_inlined(&inode->i_sb->s_inodes.items, inode->i_sb_list_idx) = inode; } EXPORT_SYMBOL_GPL(inode_sb_list_add); static inline void inode_sb_list_del(struct inode *inode) { - if (!list_empty(&inode->i_sb_list)) { - spin_lock(&inode->i_sb->s_inode_list_lock); - list_del_init(&inode->i_sb_list); - spin_unlock(&inode->i_sb->s_inode_list_lock); - } + *genradix_ptr(&inode->i_sb->s_inodes.items, inode->i_sb_list_idx) = NULL; + inode->i_sb_list_idx = 0; } static unsigned long hash(struct super_block *sb, unsigned long hashval) @@ -785,12 +786,16 @@ static void dispose_list(struct list_head 
*head) */ void evict_inodes(struct super_block *sb) { - struct inode *inode, *next; + struct genradix_iter iter; + void **i; LIST_HEAD(dispose); - again: - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { + rcu_read_lock(); + genradix_for_each(&sb->s_inodes.items, iter, i) { + struct inode *inode = *((struct inode **) i); + if (!inode) + continue; + if (atomic_read(&inode->i_count)) continue; @@ -815,13 +820,13 @@ again: * bit so we don't livelock. */ if (need_resched()) { - spin_unlock(&sb->s_inode_list_lock); + rcu_read_unlock(); cond_resched(); dispose_list(&dispose); goto again; } } - spin_unlock(&sb->s_inode_list_lock); + rcu_read_unlock(); dispose_list(&dispose); } @@ -835,12 +840,16 @@ EXPORT_SYMBOL_GPL(evict_inodes); */ void invalidate_inodes(struct super_block *sb) { - struct inode *inode, *next; + struct genradix_iter iter; + void **i; LIST_HEAD(dispose); - again: - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { + rcu_read_lock(); + genradix_for_each(&sb->s_inodes.items, iter, i) { + struct inode *inode = *((struct inode **) i); + if (!inode) + continue; + spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { spin_unlock(&inode->i_lock); @@ -856,13 +865,13 @@ again: spin_unlock(&inode->i_lock); list_add(&inode->i_lru, &dispose); if (need_resched()) { - spin_unlock(&sb->s_inode_list_lock); + rcu_read_unlock(); cond_resched(); dispose_list(&dispose); goto again; } } - spin_unlock(&sb->s_inode_list_lock); + rcu_read_unlock(); dispose_list(&dispose); } @@ -1290,12 +1299,7 @@ again: hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); - /* - * Add inode to the sb list if it's not already. It has I_NEW at this - * point, so it should be safe to test i_sb_list locklessly. 
- */ - if (list_empty(&inode->i_sb_list)) - inode_sb_list_add(inode); + inode_sb_list_add(inode); unlock: spin_unlock(&inode_hash_lock); diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index 272c8a1dab3c..9a8da7a02a6f 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -33,14 +33,19 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt) * @sb: superblock being unmounted. * * Called during unmount with no locks held, so needs to be safe against - * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block. + * concurrent modifiers. Can block. */ static void fsnotify_unmount_inodes(struct super_block *sb) { - struct inode *inode, *iput_inode = NULL; + struct genradix_iter iter; + void **i; + + rcu_read_lock(); + genradix_for_each(&sb->s_inodes.items, iter, i) { + struct inode *inode = *((struct inode **) i); + if (!inode) + continue; - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { /* * We cannot __iget() an inode in state I_FREEING, * I_WILL_FREE, or I_NEW which is fine because by that point @@ -68,23 +73,19 @@ static void fsnotify_unmount_inodes(struct super_block *sb) __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&sb->s_inode_list_lock); - - iput(iput_inode); + rcu_read_unlock(); /* for each watch, send FS_UNMOUNT and then remove it */ fsnotify_inode(inode, FS_UNMOUNT); fsnotify_inode_delete(inode); - iput_inode = inode; + iput(inode); cond_resched(); - spin_lock(&sb->s_inode_list_lock); + rcu_read_lock(); } - spin_unlock(&sb->s_inode_list_lock); - - iput(iput_inode); + rcu_read_unlock(); } void fsnotify_sb_delete(struct super_block *sb) diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index b40410cd39af..1d5d08787cce 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -1020,14 +1020,19 @@ static int dqinit_needed(struct inode *inode, int type) /* This routine is guarded by s_umount semaphore */ static int add_dquot_ref(struct super_block *sb, int type) { - 
struct inode *inode, *old_inode = NULL; + void **i; + struct genradix_iter iter; #ifdef CONFIG_QUOTA_DEBUG int reserved = 0; #endif int err = 0; - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + rcu_read_lock(); + genradix_for_each(&sb->s_inodes.items, iter, i) { + struct inode *inode = *((struct inode **) i); + if (!inode) + continue; + spin_lock(&inode->i_lock); if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || !atomic_read(&inode->i_writecount) || @@ -1037,33 +1042,21 @@ static int add_dquot_ref(struct super_block *sb, int type) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&sb->s_inode_list_lock); + rcu_read_unlock(); #ifdef CONFIG_QUOTA_DEBUG if (unlikely(inode_get_rsv_space(inode) > 0)) reserved = 1; #endif - iput(old_inode); err = __dquot_initialize(inode, type); - if (err) { - iput(inode); + iput(inode); + if (err) goto out; - } - /* - * We hold a reference to 'inode' so it couldn't have been - * removed from s_inodes list while we dropped the - * s_inode_list_lock. We cannot iput the inode now as we can be - * holding the last reference and we cannot iput it under - * s_inode_list_lock. So we keep the reference and iput it - * later. 
- */ - old_inode = inode; cond_resched(); - spin_lock(&sb->s_inode_list_lock); + rcu_read_lock(); } - spin_unlock(&sb->s_inode_list_lock); - iput(old_inode); + rcu_read_unlock(); out: #ifdef CONFIG_QUOTA_DEBUG if (reserved) { @@ -1077,13 +1070,25 @@ out: static void remove_dquot_ref(struct super_block *sb, int type) { - struct inode *inode; + struct genradix_iter iter; + void **i; #ifdef CONFIG_QUOTA_DEBUG int reserved = 0; #endif - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + rcu_read_lock(); + genradix_for_each(&sb->s_inodes.items, iter, i) { + struct inode *inode = *((struct inode **) i); + if (!inode) + continue; + + spin_lock(&inode->i_lock); + bool on_list = inode->i_sb_list_idx != 0; + spin_unlock(&inode->i_lock); + + if (!on_list) + continue; + /* * We have to scan also I_NEW inodes because they can already * have quota pointer initialized. Luckily, we need to touch @@ -1107,7 +1112,7 @@ static void remove_dquot_ref(struct super_block *sb, int type) } spin_unlock(&dq_data_lock); } - spin_unlock(&sb->s_inode_list_lock); + rcu_read_unlock(); #ifdef CONFIG_QUOTA_DEBUG if (reserved) { printk(KERN_WARNING "VFS (%s): Writes happened after quota" diff --git a/fs/super.c b/fs/super.c index 1db230432960..132af76cf5c3 100644 --- a/fs/super.c +++ b/fs/super.c @@ -274,6 +274,7 @@ static void destroy_super_work(struct work_struct *work) { struct super_block *s = container_of(work, struct super_block, destroy_work); + fast_list_exit(&s->s_inodes); fsnotify_sb_free(s); security_sb_free(s); put_user_ns(s->s_user_ns); @@ -359,8 +360,6 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags, INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_roots); mutex_init(&s->s_sync_lock); - INIT_LIST_HEAD(&s->s_inodes); - spin_lock_init(&s->s_inode_list_lock); INIT_LIST_HEAD(&s->s_inodes_wb); spin_lock_init(&s->s_inode_wblist_lock); @@ -375,6 +374,9 @@ static struct super_block *alloc_super(struct 
file_system_type *type, int flags, s->s_time_min = TIME64_MIN; s->s_time_max = TIME64_MAX; + if (fast_list_init(&s->s_inodes)) + goto fail; + s->s_shrink = shrinker_alloc(SHRINKER_NUMA_AWARE | SHRINKER_MEMCG_AWARE, "sb-%s", type->name); if (!s->s_shrink) @@ -646,7 +648,7 @@ void generic_shutdown_super(struct super_block *sb) * the fscrypt keyring can be destroyed. */ fscrypt_destroy_keyring(sb); - +#if 0 if (CHECK_DATA_CORRUPTION(!list_empty(&sb->s_inodes), "VFS: Busy inodes after unmount of %s (%s)", sb->s_id, sb->s_type->name)) { @@ -665,6 +667,7 @@ void generic_shutdown_super(struct super_block *sb) } spin_unlock(&sb->s_inode_list_lock); } +#endif } /* * Broadcast to everyone that grabbed a temporary reference to this diff --git a/include/linux/fs.h b/include/linux/fs.h index e3c603d01337..755ff1f15baa 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -45,6 +45,7 @@ #include <linux/slab.h> #include <linux/maple_tree.h> #include <linux/rw_hint.h> +#include <linux/fast_list.h> #include <asm/byteorder.h> #include <uapi/linux/fs.h> @@ -700,7 +701,6 @@ struct inode { u16 i_wb_frn_history; #endif struct list_head i_lru; /* inode LRU list */ - struct list_head i_sb_list; struct list_head i_wb_list; /* backing dev writeback list */ union { struct hlist_head i_dentry; @@ -714,6 +714,7 @@ struct inode { #if defined(CONFIG_IMA) || defined(CONFIG_FILE_LOCKING) atomic_t i_readcount; /* struct files open RO */ #endif + unsigned i_sb_list_idx; union { const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ void (*free_inode)(struct inode *); @@ -1384,9 +1385,7 @@ struct super_block { */ int s_stack_depth; - /* s_inode_list_lock protects s_inodes */ - spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp; - struct list_head s_inodes; /* all inodes */ + struct fast_list s_inodes; /* all inodes */ spinlock_t s_inode_wblist_lock; struct list_head s_inodes_wb; /* writeback inodes */ diff --git a/security/landlock/fs.c b/security/landlock/fs.c index 
7d79fc8abe21..9e4d3bd56e3d 100644 --- a/security/landlock/fs.c +++ b/security/landlock/fs.c @@ -1228,13 +1228,18 @@ static void hook_inode_free_security_rcu(void *inode_security) */ static void hook_sb_delete(struct super_block *const sb) { - struct inode *inode, *prev_inode = NULL; + struct genradix_iter iter; + void **i; if (!landlock_initialized) return; - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { + rcu_read_lock(); + genradix_for_each(&sb->s_inodes.items, iter, i) { + struct inode *inode = *((struct inode **) i); + if (!inode) + continue; + struct landlock_object *object; /* Only handles referenced inodes. */ @@ -1258,10 +1263,8 @@ static void hook_sb_delete(struct super_block *const sb) continue; } - rcu_read_lock(); object = rcu_dereference(landlock_inode(inode)->object); if (!object) { - rcu_read_unlock(); spin_unlock(&inode->i_lock); continue; } @@ -1278,7 +1281,6 @@ static void hook_sb_delete(struct super_block *const sb) if (object->underobj == inode) { object->underobj = NULL; spin_unlock(&object->lock); - rcu_read_unlock(); /* * Because object->underobj was not NULL, @@ -1299,32 +1301,15 @@ static void hook_sb_delete(struct super_block *const sb) iput(inode); } else { spin_unlock(&object->lock); - rcu_read_unlock(); } - if (prev_inode) { - /* - * At this point, we still own the __iget() reference - * that we just set in this loop walk. Therefore we - * can drop the list lock and know that the inode won't - * disappear from under us until the next loop walk. - */ - spin_unlock(&sb->s_inode_list_lock); - /* - * We can now actually put the inode reference from the - * previous loop walk, which is not needed anymore. 
- */ - iput(prev_inode); - cond_resched(); - spin_lock(&sb->s_inode_list_lock); - } - prev_inode = inode; + rcu_read_unlock(); + iput(inode); + cond_resched(); + rcu_read_lock(); } - spin_unlock(&sb->s_inode_list_lock); + rcu_read_unlock(); - /* Puts the inode reference from the last loop walk, if any. */ - if (prev_inode) - iput(prev_inode); /* Waits for pending iput() in release_inode(). */ wait_var_event(&landlock_superblock(sb)->inode_refs, !atomic_long_read(&landlock_superblock(sb)->inode_refs)); |