summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorKent Overstreet <kent.overstreet@linux.dev>2023-11-22 18:15:33 -0500
committerKent Overstreet <kent.overstreet@linux.dev>2025-04-28 22:43:00 -0400
commit3350ac74b3aa53332f5cf8528ce930308d9c464d (patch)
tree2664ac8c8bb38e874ddbb658707c801af35c7414
parent6c15786b3ec758557167367e9663f5caa1fe93c9 (diff)
mm: shrinker: Add a .to_text() method for shrinkers
This adds a new callback method to shrinkers which they can use to describe anything relevant to memory reclaim about their internal state, for example object dirtyness. This patch also adds shrinkers_to_text(), which reports on the top 10 shrinkers - by object count - in sorted order, to be used in OOM reporting. Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Qi Zheng <zhengqi.arch@bytedance.com> Cc: Roman Gushchin <roman.gushchin@linux.dev> Cc: linux-mm@kvack.org Signed-off-by: Kent Overstreet <kent.overstreet@linux.dev> From david@fromorbit.com Tue Aug 27 23:32:26 2024 > > + if (!mutex_trylock(&shrinker_mutex)) { > > + seq_buf_puts(out, "(couldn't take shrinker lock)"); > > + return; > > + } > > Please don't use the shrinker_mutex like this. There can be tens of > thousands of entries in the shrinker list (because memcgs) and > holding the shrinker_mutex for long running traversals like this is > known to cause latency problems for memcg reaping. If we are at > ENOMEM, the last thing we want to be doing is preventing memcgs from > being reaped. > > > + list_for_each_entry(shrinker, &shrinker_list, list) { > > + struct shrink_control sc = { .gfp_mask = GFP_KERNEL, }; > > This iteration and counting setup is neither node or memcg aware. > For node aware shrinkers, this will only count the items freeable > on node 0, and ignore all the other memory in the system. For memcg > systems, it will also only scan the root memcg and so miss counting > any memory in memcg owned caches. > > IOWs, the shrinker iteration mechanism needs to iterate both by NUMA > node and by memcg. On large machines with multiple nodes and hosting > thousands of memcgs, a total shrinker state iteration is has to walk > a -lot- of structures. > > And example of this is drop_slab() - called from > /proc/sys/vm/drop_caches(). 
It does this to iterate all the > shrinkers for all the nodes and memcgs in the system: > > static unsigned long drop_slab_node(int nid) > { > unsigned long freed = 0; > struct mem_cgroup *memcg = NULL; > > memcg = mem_cgroup_iter(NULL, NULL, NULL); > do { > freed += shrink_slab(GFP_KERNEL, nid, memcg, 0); > } while ((memcg = mem_cgroup_iter(NULL, memcg, NULL)) != NULL); > > return freed; > } > > void drop_slab(void) > { > int nid; > int shift = 0; > unsigned long freed; > > do { > freed = 0; > for_each_online_node(nid) { > if (fatal_signal_pending(current)) > return; > > freed += drop_slab_node(nid); > } > } while ((freed >> shift++) > 1); > } > > Hence any iteration for finding the 10 largest shrinkable caches in > the system needs to do something similar. Only, it needs to iterate > memcgs first and then aggregate object counts across all nodes for > shrinkers that are NUMA aware. > > Because it needs direct access to the shrinkers, it will need to use > the RCU lock + refcount method of traversal because that's the only > safe way to go from memcg to shrinker instance. IOWs, it > needs to mirror the code in shrink_slab/shrink_slab_memcg to obtain > a safe reference to the relevant shrinker so it can call > ->count_objects() and store a refcounted pointer to the shrinker(s) > that will get printed out after the scan is done.... > > Once the shrinker iteration is sorted out, I'll look further at the > rest of the code in this patch... > > -Dave.
-rw-r--r--include/linux/shrinker.h7
-rw-r--r--mm/shrinker.c73
2 files changed, 78 insertions, 2 deletions
diff --git a/include/linux/shrinker.h b/include/linux/shrinker.h
index 1a00be90d93a..6193612617a1 100644
--- a/include/linux/shrinker.h
+++ b/include/linux/shrinker.h
@@ -24,6 +24,8 @@ struct shrinker_info {
struct shrinker_info_unit *unit[];
};
+struct seq_buf;
+
/*
* This struct is used to pass information from page reclaim to the shrinkers.
* We consolidate the values for easier extension later.
@@ -80,10 +82,12 @@ struct shrink_control {
* @flags determine the shrinker abilities, like numa awareness
*/
struct shrinker {
+ const char *name;
unsigned long (*count_objects)(struct shrinker *,
struct shrink_control *sc);
unsigned long (*scan_objects)(struct shrinker *,
struct shrink_control *sc);
+ void (*to_text)(struct seq_buf *, struct shrinker *);
long batch; /* reclaim batch size, 0 = default */
int seeks; /* seeks to recreate an obj */
@@ -110,7 +114,6 @@ struct shrinker {
#endif
#ifdef CONFIG_SHRINKER_DEBUG
int debugfs_id;
- const char *name;
struct dentry *debugfs_entry;
#endif
/* objs pending delete, per node */
@@ -135,6 +138,8 @@ __printf(2, 3)
struct shrinker *shrinker_alloc(unsigned int flags, const char *fmt, ...);
void shrinker_register(struct shrinker *shrinker);
void shrinker_free(struct shrinker *shrinker);
+void shrinker_to_text(struct seq_buf *, struct shrinker *);
+void shrinkers_to_text(struct seq_buf *);
static inline bool shrinker_try_get(struct shrinker *shrinker)
{
diff --git a/mm/shrinker.c b/mm/shrinker.c
index 4a93fd433689..82d2161d6b4b 100644
--- a/mm/shrinker.c
+++ b/mm/shrinker.c
@@ -1,8 +1,9 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/memcontrol.h>
+#include <linux/rculist.h>
#include <linux/rwsem.h>
+#include <linux/seq_buf.h>
#include <linux/shrinker.h>
-#include <linux/rculist.h>
#include <trace/events/vmscan.h>
#include "internal.h"
@@ -809,3 +810,73 @@ void shrinker_free(struct shrinker *shrinker)
call_rcu(&shrinker->rcu, shrinker_free_rcu_cb);
}
EXPORT_SYMBOL_GPL(shrinker_free);
+
+/*
+ * shrinker_to_text - describe one shrinker's state in @out
+ *
+ * Prints the shrinker's name and its current freeable-object count, then
+ * invokes the optional ->to_text() hook so the shrinker can report any
+ * extra internal state (e.g. object dirtiness, per the commit message).
+ *
+ * NOTE(review): @sc only sets .gfp_mask, so ->count_objects() sees
+ * .nid == 0 and .memcg == NULL — per the quoted review this counts only
+ * node 0 / the root memcg for NUMA- and memcg-aware shrinkers; confirm
+ * before relying on the absolute numbers.
+ */
+void shrinker_to_text(struct seq_buf *out, struct shrinker *shrinker)
+{
+ struct shrink_control sc = { .gfp_mask = GFP_KERNEL, };
+
+ seq_buf_puts(out, shrinker->name);
+ seq_buf_printf(out, " objects: %lu\n", shrinker->count_objects(shrinker, &sc));
+
+ if (shrinker->to_text) {
+ shrinker->to_text(out, shrinker);
+ seq_buf_puts(out, "\n");
+ }
+}
+
+/**
+ * shrinkers_to_text - Report on shrinkers with highest usage
+ * @out: seq_buf the report is written to
+ *
+ * This reports on the shrinkers with the highest object counts, in sorted
+ * order (largest first): intended to be used for OOM reporting.  The report
+ * is limited to ARRAY_SIZE(shrinkers_by_mem) entries — currently 4.
+ */
+void shrinkers_to_text(struct seq_buf *out)
+{
+ struct shrinker *shrinker;
+ /* Fixed-size scratch table, kept sorted by @mem in ascending order. */
+ struct shrinker_by_mem {
+ struct shrinker *shrinker;
+ unsigned long mem;
+ } shrinkers_by_mem[4];
+ int i, nr = 0;
+
+ /*
+ * Likely called at/near OOM: don't block on the lock, just say we
+ * couldn't take it.
+ *
+ * NOTE(review): holding shrinker_mutex across a full list walk is
+ * known to cause latency problems for memcg reaping; the quoted
+ * review asks for the RCU + refcount traversal used by shrink_slab()
+ * instead — TODO confirm before merging.
+ */
+ if (!mutex_trylock(&shrinker_mutex)) {
+ seq_buf_puts(out, "(couldn't take shrinker lock)");
+ return;
+ }
+
+ list_for_each_entry(shrinker, &shrinker_list, list) {
+ /*
+ * Only .gfp_mask is set, so .nid is 0 and .memcg is NULL.
+ * NOTE(review): this counts node 0 / root memcg only and so
+ * undercounts NUMA- and memcg-aware shrinkers.
+ */
+ struct shrink_control sc = { .gfp_mask = GFP_KERNEL, };
+ unsigned long mem = shrinker->count_objects(shrinker, &sc);
+
+ /* Skip empty shrinkers and the error/empty sentinels. */
+ if (!mem || mem == SHRINK_STOP || mem == SHRINK_EMPTY)
+ continue;
+
+ /* Insertion point: first slot holding a larger count. */
+ for (i = 0; i < nr; i++)
+ if (mem < shrinkers_by_mem[i].mem)
+ break;
+
+ if (nr < ARRAY_SIZE(shrinkers_by_mem)) {
+ /* Room left: shift larger entries up, insert at @i. */
+ memmove(&shrinkers_by_mem[i + 1],
+ &shrinkers_by_mem[i],
+ sizeof(shrinkers_by_mem[0]) * (nr - i));
+ nr++;
+ } else if (i) {
+ /*
+ * Table full but new count beats the smallest entry:
+ * evict slot 0, shift the rest down, insert at i - 1.
+ */
+ i--;
+ memmove(&shrinkers_by_mem[0],
+ &shrinkers_by_mem[1],
+ sizeof(shrinkers_by_mem[0]) * i);
+ } else {
+ /* Smaller than everything already recorded: ignore. */
+ continue;
+ }
+
+ shrinkers_by_mem[i] = (struct shrinker_by_mem) {
+ .shrinker = shrinker,
+ .mem = mem,
+ };
+ }
+
+ /* Table is sorted ascending; emit largest first. */
+ for (i = nr - 1; i >= 0; --i)
+ shrinker_to_text(out, shrinkers_by_mem[i].shrinker);
+
+ mutex_unlock(&shrinker_mutex);
+}