From b14ff274e8aa5517ff86c94d682bf26bf8b5dcc8 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 3 Feb 2025 10:28:47 +0100 Subject: slab, rcu: move TINY_RCU variant of kvfree_rcu() to SLAB Following the move of TREE_RCU implementation, let's move also the TINY_RCU one for consistency and subsequent refactoring. For simplicity, remove the separate inline __kvfree_call_rcu() as TINY_RCU is not meant for high-performance hardware anyway. Declare kvfree_call_rcu() in rcupdate.h to avoid header dependency issues. Also move the kvfree_rcu_barrier() declaration to slab.h Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: Joel Fernandes (Google) Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Paul E. McKenney Signed-off-by: Vlastimil Babka --- include/linux/rcupdate.h | 5 +++++ include/linux/rcutiny.h | 36 ------------------------------------ include/linux/rcutree.h | 3 --- include/linux/slab.h | 14 ++++++++++++++ kernel/rcu/tiny.c | 11 ----------- mm/slab_common.c | 19 +++++++++++++++++++ 6 files changed, 38 insertions(+), 50 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 48e5c03df1dd..3f70d1c81444 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -1082,6 +1082,11 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) #define kfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr) #define kvfree_rcu_mightsleep(ptr) kvfree_rcu_arg_1(ptr) +/* + * In mm/slab_common.c, no suitable header to include here. + */ +void kvfree_call_rcu(struct rcu_head *head, void *ptr); + #define kvfree_rcu_arg_2(ptr, rhf) \ do { \ typeof (ptr) ___p = (ptr); \ diff --git a/include/linux/rcutiny.h b/include/linux/rcutiny.h index fe42315f667f..f519cd680228 100644 --- a/include/linux/rcutiny.h +++ b/include/linux/rcutiny.h @@ -90,41 +90,6 @@ static inline void synchronize_rcu_expedited(void) synchronize_rcu(); } -/* - * Add one more declaration of kvfree() here. 
It is - * not so straight forward to just include - * where it is defined due to getting many compile - * errors caused by that include. - */ -extern void kvfree(const void *addr); - -static inline void __kvfree_call_rcu(struct rcu_head *head, void *ptr) -{ - if (head) { - call_rcu(head, (rcu_callback_t) ((void *) head - ptr)); - return; - } - - // kvfree_rcu(one_arg) call. - might_sleep(); - synchronize_rcu(); - kvfree(ptr); -} - -static inline void kvfree_rcu_barrier(void) -{ - rcu_barrier(); -} - -#ifdef CONFIG_KASAN_GENERIC -void kvfree_call_rcu(struct rcu_head *head, void *ptr); -#else -static inline void kvfree_call_rcu(struct rcu_head *head, void *ptr) -{ - __kvfree_call_rcu(head, ptr); -} -#endif - void rcu_qs(void); static inline void rcu_softirq_qs(void) @@ -164,7 +129,6 @@ static inline void rcu_end_inkernel_boot(void) { } static inline bool rcu_inkernel_boot_has_ended(void) { return true; } static inline bool rcu_is_watching(void) { return true; } static inline void rcu_momentary_eqs(void) { } -static inline void kfree_rcu_scheduler_running(void) { } /* Avoid RCU read-side critical sections leaking across. 
*/ static inline void rcu_all_qs(void) { barrier(); } diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h index 27d86d912781..dbe77b5fe06e 100644 --- a/include/linux/rcutree.h +++ b/include/linux/rcutree.h @@ -34,12 +34,9 @@ static inline void rcu_virt_note_context_switch(void) } void synchronize_rcu_expedited(void); -void kvfree_call_rcu(struct rcu_head *head, void *ptr); -void kvfree_rcu_barrier(void); void rcu_barrier(void); void rcu_momentary_eqs(void); -void kfree_rcu_scheduler_running(void); struct rcu_gp_oldstate { unsigned long rgos_norm; diff --git a/include/linux/slab.h b/include/linux/slab.h index 09eedaecf120..bcc62e5656c3 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -1082,6 +1083,19 @@ extern void kvfree_sensitive(const void *addr, size_t len); unsigned int kmem_cache_size(struct kmem_cache *s); +#ifdef CONFIG_TINY_RCU +static inline void kvfree_rcu_barrier(void) +{ + rcu_barrier(); +} + +static inline void kfree_rcu_scheduler_running(void) { } +#else +void kvfree_rcu_barrier(void); + +void kfree_rcu_scheduler_running(void); +#endif + /** * kmalloc_size_roundup - Report allocation bucket size for the given size * diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 4b3f31911465..0ec27093d0e1 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -246,17 +246,6 @@ bool poll_state_synchronize_rcu(unsigned long oldstate) } EXPORT_SYMBOL_GPL(poll_state_synchronize_rcu); -#ifdef CONFIG_KASAN_GENERIC -void kvfree_call_rcu(struct rcu_head *head, void *ptr) -{ - if (head) - kasan_record_aux_stack(ptr); - - __kvfree_call_rcu(head, ptr); -} -EXPORT_SYMBOL_GPL(kvfree_call_rcu); -#endif - void __init rcu_init(void) { open_softirq(RCU_SOFTIRQ, rcu_process_callbacks); diff --git a/mm/slab_common.c b/mm/slab_common.c index 4030907b6b7d..81a0ce77b11c 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1284,6 +1284,25 @@ 
EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); EXPORT_TRACEPOINT_SYMBOL(kfree); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); +#ifdef CONFIG_TINY_RCU + +void kvfree_call_rcu(struct rcu_head *head, void *ptr) +{ + if (head) { + kasan_record_aux_stack(ptr); + call_rcu(head, (rcu_callback_t) ((void *) head - ptr)); + return; + } + + // kvfree_rcu(one_arg) call. + might_sleep(); + synchronize_rcu(); + kvfree(ptr); +} +EXPORT_SYMBOL_GPL(kvfree_call_rcu); + +#endif + /* * This rcu parameter is runtime-read-only. It reflects * a minimum allowed number of objects which can be cached -- cgit v1.2.3 From 7f4b19ef3129e1f2e1856b3ee475a02c0be34891 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 3 Feb 2025 10:28:48 +0100 Subject: rcu: remove trace_rcu_kvfree_callback Tree RCU does not handle kvfree_rcu() by queueing individual objects by call_rcu() anymore, thus the tracepoint and associated __is_kvfree_rcu_offset() check is dead code now. Remove it. Reviewed-by: Joel Fernandes (Google) Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Paul E. McKenney Signed-off-by: Vlastimil Babka --- include/trace/events/rcu.h | 34 ---------------------------------- kernel/rcu/tree.c | 9 ++------- 2 files changed, 2 insertions(+), 41 deletions(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index e81431deaa50..ac3b28b8939b 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -560,40 +560,6 @@ TRACE_EVENT_RCU(rcu_segcb_stats, ); -/* - * Tracepoint for the registration of a single RCU callback of the special - * kvfree() form. The first argument is the RCU type, the second argument - * is a pointer to the RCU callback, the third argument is the offset - * of the callback within the enclosing RCU-protected data structure, - * the fourth argument is the number of lazy callbacks queued, and the - * fifth argument is the total number of callbacks queued. 
- */ -TRACE_EVENT_RCU(rcu_kvfree_callback, - - TP_PROTO(const char *rcuname, struct rcu_head *rhp, unsigned long offset, - long qlen), - - TP_ARGS(rcuname, rhp, offset, qlen), - - TP_STRUCT__entry( - __field(const char *, rcuname) - __field(void *, rhp) - __field(unsigned long, offset) - __field(long, qlen) - ), - - TP_fast_assign( - __entry->rcuname = rcuname; - __entry->rhp = rhp; - __entry->offset = offset; - __entry->qlen = qlen; - ), - - TP_printk("%s rhp=%p func=%ld %ld", - __entry->rcuname, __entry->rhp, __entry->offset, - __entry->qlen) -); - /* * Tracepoint for marking the beginning rcu_do_batch, performed to start * RCU callback invocation. The first argument is the RCU flavor, diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 475f31deed14..5dbc4189037c 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -2931,13 +2931,8 @@ static int __init rcu_spawn_core_kthreads(void) static void rcutree_enqueue(struct rcu_data *rdp, struct rcu_head *head, rcu_callback_t func) { rcu_segcblist_enqueue(&rdp->cblist, head); - if (__is_kvfree_rcu_offset((unsigned long)func)) - trace_rcu_kvfree_callback(rcu_state.name, head, - (unsigned long)func, - rcu_segcblist_n_cbs(&rdp->cblist)); - else - trace_rcu_callback(rcu_state.name, head, - rcu_segcblist_n_cbs(&rdp->cblist)); + trace_rcu_callback(rcu_state.name, head, + rcu_segcblist_n_cbs(&rdp->cblist)); trace_rcu_segcb_stats(&rdp->cblist, TPS("SegCBQueued")); } -- cgit v1.2.3 From 49d5377b38aa127451cf5dc6d6ea5d9da7f465a4 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 3 Feb 2025 10:28:49 +0100 Subject: rcu, slab: use a regular callback function for kvfree_rcu RCU has been special-casing callback function pointers that are integers lower than 4096 as offsets of rcu_head for kvfree() instead. The tree RCU implementation no longer does that as the batched kvfree_rcu() is not a simple call_rcu(). The tiny RCU still does, and the plan is also to make tree RCU use call_rcu() for SLUB_TINY configurations. 
Instead of teaching tree RCU again to special-case the offsets, let's remove the special casing completely. Since there's no SLOB anymore, it is possible to create a callback function that can take a pointer to the middle of a slab object with unknown offset and determine the object's pointer before freeing it, so implement that as kvfree_rcu_cb(). Large kmalloc and vmalloc allocations are handled simply by aligning down to page size. For that we retain the requirement that the offset is smaller than 4096. But we can remove __is_kvfree_rcu_offset() completely and instead just open-code the condition in the BUILD_BUG_ON() check. Reviewed-by: Joel Fernandes (Google) Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Paul E. McKenney Signed-off-by: Vlastimil Babka --- include/linux/rcupdate.h | 28 +++++++++++++--------------- kernel/rcu/tiny.c | 14 -------------- mm/slab.h | 2 ++ mm/slab_common.c | 5 ++--- mm/slub.c | 46 ++++++++++++++++++++++++++++++++++++++++++++++ 5 files changed, 63 insertions(+), 32 deletions(-) diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 3f70d1c81444..23bcf71ffb06 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -1025,12 +1025,6 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) #define RCU_POINTER_INITIALIZER(p, v) \ .p = RCU_INITIALIZER(v) -/* - * Does the specified offset indicate that the corresponding rcu_head - * structure can be handled by kvfree_rcu()? - */ -#define __is_kvfree_rcu_offset(offset) ((offset) < 4096) - /** * kfree_rcu() - kfree an object after a grace period. * @ptr: pointer to kfree for double-argument invocations. @@ -1041,11 +1035,11 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) * when they are used in a kernel module, that module must invoke the * high-latency rcu_barrier() function at module-unload time. * - * The kfree_rcu() function handles this issue. 
Rather than encoding a - * function address in the embedded rcu_head structure, kfree_rcu() instead - * encodes the offset of the rcu_head structure within the base structure. - * Because the functions are not allowed in the low-order 4096 bytes of - * kernel virtual memory, offsets up to 4095 bytes can be accommodated. + * The kfree_rcu() function handles this issue. In order to have a universal + * callback function handling different offsets of rcu_head, the callback needs + * to determine the starting address of the freed object, which can be a large + * kmalloc or vmalloc allocation. To allow simply aligning the pointer down to + * page boundary for those, only offsets up to 4095 bytes can be accommodated. * If the offset is larger than 4095 bytes, a compile-time error will * be generated in kvfree_rcu_arg_2(). If this error is triggered, you can * either fall back to use of call_rcu() or rearrange the structure to @@ -1087,14 +1081,18 @@ static inline notrace void rcu_read_unlock_sched_notrace(void) */ void kvfree_call_rcu(struct rcu_head *head, void *ptr); +/* + * The BUILD_BUG_ON() makes sure the rcu_head offset can be handled. See the + * comment of kfree_rcu() for details. 
+ */ #define kvfree_rcu_arg_2(ptr, rhf) \ do { \ typeof (ptr) ___p = (ptr); \ \ - if (___p) { \ - BUILD_BUG_ON(!__is_kvfree_rcu_offset(offsetof(typeof(*(ptr)), rhf))); \ - kvfree_call_rcu(&((___p)->rhf), (void *) (___p)); \ - } \ + if (___p) { \ + BUILD_BUG_ON(offsetof(typeof(*(ptr)), rhf) >= 4096); \ + kvfree_call_rcu(&((___p)->rhf), (void *) (___p)); \ + } \ } while (0) #define kvfree_rcu_arg_1(ptr) \ diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c index 0ec27093d0e1..7a34a99d4664 100644 --- a/kernel/rcu/tiny.c +++ b/kernel/rcu/tiny.c @@ -85,15 +85,8 @@ void rcu_sched_clock_irq(int user) static inline bool rcu_reclaim_tiny(struct rcu_head *head) { rcu_callback_t f; - unsigned long offset = (unsigned long)head->func; rcu_lock_acquire(&rcu_callback_map); - if (__is_kvfree_rcu_offset(offset)) { - trace_rcu_invoke_kvfree_callback("", head, offset); - kvfree((void *)head - offset); - rcu_lock_release(&rcu_callback_map); - return true; - } trace_rcu_invoke_callback("", head); f = head->func; @@ -159,10 +152,6 @@ void synchronize_rcu(void) } EXPORT_SYMBOL_GPL(synchronize_rcu); -static void tiny_rcu_leak_callback(struct rcu_head *rhp) -{ -} - /* * Post an RCU callback to be invoked after the end of an RCU grace * period. But since we have but one CPU, that would be after any @@ -178,9 +167,6 @@ void call_rcu(struct rcu_head *head, rcu_callback_t func) pr_err("%s(): Double-freed CB %p->%pS()!!! 
", __func__, head, head->func); mem_dump_obj(head); } - - if (!__is_kvfree_rcu_offset((unsigned long)head->func)) - WRITE_ONCE(head->func, tiny_rcu_leak_callback); return; } diff --git a/mm/slab.h b/mm/slab.h index e9fd9bf0bfa6..2f01c7317988 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -604,6 +604,8 @@ void __memcg_slab_free_hook(struct kmem_cache *s, struct slab *slab, void **p, int objects, struct slabobj_ext *obj_exts); #endif +void kvfree_rcu_cb(struct rcu_head *head); + size_t __ksize(const void *objp); static inline size_t slab_ksize(const struct kmem_cache *s) diff --git a/mm/slab_common.c b/mm/slab_common.c index 81a0ce77b11c..6438a38aa5dc 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1290,7 +1290,7 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr) { if (head) { kasan_record_aux_stack(ptr); - call_rcu(head, (rcu_callback_t) ((void *) head - ptr)); + call_rcu(head, kvfree_rcu_cb); return; } @@ -1551,8 +1551,7 @@ kvfree_rcu_list(struct rcu_head *head) rcu_lock_acquire(&rcu_callback_map); trace_rcu_invoke_kvfree_callback("slab", head, offset); - if (!WARN_ON_ONCE(!__is_kvfree_rcu_offset(offset))) - kvfree(ptr); + kvfree(ptr); rcu_lock_release(&rcu_callback_map); cond_resched_tasks_rcu_qs(); diff --git a/mm/slub.c b/mm/slub.c index 1f50129dcfb3..e8273f286569 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -19,6 +19,7 @@ #include #include #include "slab.h" +#include #include #include #include @@ -4728,6 +4729,51 @@ static void free_large_kmalloc(struct folio *folio, void *object) folio_put(folio); } +/* + * Given an rcu_head embedded within an object obtained from kvmalloc at an + * offset < 4k, free the object in question. 
+ */ +void kvfree_rcu_cb(struct rcu_head *head) +{ + void *obj = head; + struct folio *folio; + struct slab *slab; + struct kmem_cache *s; + void *slab_addr; + + if (is_vmalloc_addr(obj)) { + obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj); + vfree(obj); + return; + } + + folio = virt_to_folio(obj); + if (!folio_test_slab(folio)) { + /* + * rcu_head offset can be only less than page size so no need to + * consider folio order + */ + obj = (void *) PAGE_ALIGN_DOWN((unsigned long)obj); + free_large_kmalloc(folio, obj); + return; + } + + slab = folio_slab(folio); + s = slab->slab_cache; + slab_addr = folio_address(folio); + + if (is_kfence_address(obj)) { + obj = kfence_object_start(obj); + } else { + unsigned int idx = __obj_to_index(s, slab_addr, obj); + + obj = slab_addr + s->size * idx; + obj = fixup_red_left(s, obj); + } + + slab_free(s, slab, obj, _RET_IP_); +} + /** * kfree - free previously allocated memory * @object: pointer returned by kmalloc() or kmem_cache_alloc() -- cgit v1.2.3 From c9f8f1242a4c3e48adc6c3cf6b31c1ffbaa49943 Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Mon, 3 Feb 2025 10:28:50 +0100 Subject: slab: don't batch kvfree_rcu() with SLUB_TINY kvfree_rcu() is batched for better performance except on TINY_RCU, which is a simple implementation for small UP systems. Similarly SLUB_TINY is an option intended for small systems, whether or not used together with TINY_RCU. In case SLUB_TINY is used with !TINY_RCU, it arguably makes sense to not do the batching and limit the memory footprint. It's also suboptimal to have RCU-specific #ifdefs in slab code. With that, add CONFIG_KVFREE_RCU_BATCHED to determine whether the batching kvfree_rcu() implementation is used. It is not set by a user prompt, but enabled by default and disabled in case TINY_RCU or SLUB_TINY are enabled. Use the new config for #ifdef's in slab code and extend their scope to cover all code used by the batched kvfree_rcu(). 
For example there's no need to perform kvfree_rcu_init() if the batching is disabled. Reviewed-by: Uladzislau Rezki (Sony) Reviewed-by: Joel Fernandes (Google) Reviewed-by: Hyeonggon Yoo <42.hyeyoo@gmail.com> Tested-by: Paul E. McKenney Signed-off-by: Vlastimil Babka --- include/linux/slab.h | 2 +- mm/Kconfig | 4 ++++ mm/slab_common.c | 15 +++++++++------ 3 files changed, 14 insertions(+), 7 deletions(-) diff --git a/include/linux/slab.h b/include/linux/slab.h index bcc62e5656c3..7686054dd494 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -1083,7 +1083,7 @@ extern void kvfree_sensitive(const void *addr, size_t len); unsigned int kmem_cache_size(struct kmem_cache *s); -#ifdef CONFIG_TINY_RCU +#ifndef CONFIG_KVFREE_RCU_BATCHED static inline void kvfree_rcu_barrier(void) { rcu_barrier(); diff --git a/mm/Kconfig b/mm/Kconfig index 1b501db06417..0b7f4bb5cb80 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -242,6 +242,10 @@ menu "Slab allocator options" config SLUB def_bool y +config KVFREE_RCU_BATCHED + def_bool y + depends on !SLUB_TINY && !TINY_RCU + config SLUB_TINY bool "Configure for minimal memory footprint" depends on EXPERT diff --git a/mm/slab_common.c b/mm/slab_common.c index 6438a38aa5dc..46d0a4cd33b5 100644 --- a/mm/slab_common.c +++ b/mm/slab_common.c @@ -1284,7 +1284,7 @@ EXPORT_TRACEPOINT_SYMBOL(kmem_cache_alloc); EXPORT_TRACEPOINT_SYMBOL(kfree); EXPORT_TRACEPOINT_SYMBOL(kmem_cache_free); -#ifdef CONFIG_TINY_RCU +#ifndef CONFIG_KVFREE_RCU_BATCHED void kvfree_call_rcu(struct rcu_head *head, void *ptr) { @@ -1301,7 +1301,11 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr) } EXPORT_SYMBOL_GPL(kvfree_call_rcu); -#endif +void __init kvfree_rcu_init(void) +{ +} + +#else /* CONFIG_KVFREE_RCU_BATCHED */ /* * This rcu parameter is runtime-read-only. 
It reflects @@ -1879,8 +1883,6 @@ add_ptr_to_bulk_krc_lock(struct kfree_rcu_cpu **krcp, return true; } -#if !defined(CONFIG_TINY_RCU) - static enum hrtimer_restart schedule_page_work_fn(struct hrtimer *t) { @@ -2089,8 +2091,6 @@ void kvfree_rcu_barrier(void) } EXPORT_SYMBOL_GPL(kvfree_rcu_barrier); -#endif /* #if !defined(CONFIG_TINY_RCU) */ - static unsigned long kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) { @@ -2180,3 +2180,6 @@ void __init kvfree_rcu_init(void) shrinker_register(kfree_rcu_shrinker); } + +#endif /* CONFIG_KVFREE_RCU_BATCHED */ + -- cgit v1.2.3