-rw-r--r--  .mailmap                                         |   3
-rw-r--r--  Documentation/RCU/whatisRCU.rst                  |   6
-rw-r--r--  Documentation/admin-guide/kernel-parameters.txt  |  14
-rw-r--r--  MAINTAINERS                                      |   2
-rw-r--r--  arch/Kconfig                                     |   4
-rw-r--r--  include/linux/rcupdate.h                         |  22
-rw-r--r--  include/linux/rcupdate_wait.h                    |  18
-rw-r--r--  include/linux/srcutiny.h                         |   2
-rw-r--r--  include/trace/events/rcu.h                       |  27
-rw-r--r--  kernel/bpf/Kconfig                               |   2
-rw-r--r--  kernel/bpf/trampoline.c                          |   2
-rw-r--r--  kernel/rcu/Kconfig                               |   8
-rw-r--r--  kernel/rcu/srcutiny.c                            |  31
-rw-r--r--  kernel/rcu/sync.c                                |   8
-rw-r--r--  kernel/rcu/tasks.h                               |  23
-rw-r--r--  kernel/rcu/tiny.c                                |   4
-rw-r--r--  kernel/rcu/tree.c                                | 417
-rw-r--r--  kernel/rcu/tree.h                                |  24
-rw-r--r--  kernel/rcu/tree_exp.h                            |   2
-rw-r--r--  kernel/rcu/tree_plugin.h                         |   4
-rw-r--r--  kernel/rcu/tree_stall.h                          |  11
-rw-r--r--  kernel/rcu/update.c                              |   4
-rw-r--r--  kernel/trace/Kconfig                             |   4
-rw-r--r--  kernel/trace/ftrace.c                            |   3
24 files changed, 575 insertions(+), 70 deletions(-)
diff --git a/.mailmap b/.mailmap
--- a/.mailmap
+++ b/.mailmap
@@ -445,7 +445,8 @@ Nadav Amit <nadav.amit@gmail.com> <namit@cs.technion.ac.il>
 Nadia Yvette Chambers <nyc@holomorphy.com> William Lee Irwin III <wli@holomorphy.com>
 Naoya Horiguchi <naoya.horiguchi@nec.com> <n-horiguchi@ah.jp.nec.com>
 Nathan Chancellor <nathan@kernel.org> <natechancellor@gmail.com>
-Neeraj Upadhyay <quic_neeraju@quicinc.com> <neeraju@codeaurora.org>
+Neeraj Upadhyay <neeraj.upadhyay@kernel.org> <quic_neeraju@quicinc.com>
+Neeraj Upadhyay <neeraj.upadhyay@kernel.org> <neeraju@codeaurora.org>
 Neil Armstrong <neil.armstrong@linaro.org> <narmstrong@baylibre.com>
 Nguyen Anh Quynh <aquynh@gmail.com>
 Nicholas Piggin <npiggin@gmail.com> <npiggen@suse.de>
diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst
index 872ac665223f..94838c65c7d9 100644
--- a/Documentation/RCU/whatisRCU.rst
+++ b/Documentation/RCU/whatisRCU.rst
@@ -427,7 +427,7 @@ their assorted primitives.
 
 This section shows a simple use of the core RCU API to protect a
 global pointer to a dynamically allocated structure.  More-typical
-uses of RCU may be found in listRCU.rst, arrayRCU.rst, and NMI-RCU.rst.
+uses of RCU may be found in listRCU.rst and NMI-RCU.rst.
 ::
 
 	struct foo {
@@ -510,8 +510,8 @@ So, to sum up:
 	data item.
 
 See checklist.rst for additional rules to follow when using RCU.
-And again, more-typical uses of RCU may be found in listRCU.rst,
-arrayRCU.rst, and NMI-RCU.rst.
+And again, more-typical uses of RCU may be found in listRCU.rst
+and NMI-RCU.rst.
 
 .. _4_whatisRCU:
diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt
index bb884c14b2f6..0a3b0fd1910e 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -5091,6 +5091,20 @@
 			delay, memory pressure or callback list growing too
 			big.
 
+	rcutree.rcu_normal_wake_from_gp= [KNL]
+			Reduces the latency of synchronize_rcu() calls.  This
+			approach maintains its own list of synchronize_rcu()
+			callers, so it does not interact with regular callbacks
+			because it does not use a call_rcu[_hurry]() path.
+			Please note that this applies to normal grace
+			periods only.
+
+			How to enable it:
+
+			echo 1 > /sys/module/rcutree/parameters/rcu_normal_wake_from_gp
+			or pass a boot parameter "rcutree.rcu_normal_wake_from_gp=1"
+
+			Default is 0.
+
 	rcuscale.gp_async= [KNL]
 			Measure performance of asynchronous
 			grace-period primitives such as call_rcu().
diff --git a/MAINTAINERS b/MAINTAINERS
index 7c121493f43d..0370e571f312 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -18591,7 +18591,7 @@ F:	tools/testing/selftests/resctrl/
 READ-COPY UPDATE (RCU)
 M:	"Paul E. McKenney" <paulmck@kernel.org>
 M:	Frederic Weisbecker <frederic@kernel.org> (kernel/rcu/tree_nocb.h)
-M:	Neeraj Upadhyay <quic_neeraju@quicinc.com> (kernel/rcu/tasks.h)
+M:	Neeraj Upadhyay <neeraj.upadhyay@kernel.org> (kernel/rcu/tasks.h)
 M:	Joel Fernandes <joel@joelfernandes.org>
 M:	Josh Triplett <josh@joshtriplett.org>
 M:	Boqun Feng <boqun.feng@gmail.com>
diff --git a/arch/Kconfig b/arch/Kconfig
index 9f066785bb71..ae4a4f37bbf0 100644
--- a/arch/Kconfig
+++ b/arch/Kconfig
@@ -55,7 +55,7 @@ config KPROBES
 	depends on MODULES
 	depends on HAVE_KPROBES
 	select KALLSYMS
-	select TASKS_RCU if PREEMPTION
+	select NEED_TASKS_RCU
 	help
 	  Kprobes allows you to trap at almost any kernel address and
 	  execute a callback function.  register_kprobe() establishes
@@ -104,7 +104,7 @@ config STATIC_CALL_SELFTEST
 config OPTPROBES
 	def_bool y
 	depends on KPROBES && HAVE_OPTPROBES
-	select TASKS_RCU if PREEMPTION
+	select NEED_TASKS_RCU
 
 config KPROBES_ON_FTRACE
 	def_bool y
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 17d7ed5f3ae6..dfd2399f2cde 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -401,15 +401,15 @@ static inline int debug_lockdep_rcu_enabled(void)
 		}							\
 	} while (0)
 
-#if defined(CONFIG_PROVE_RCU) && !defined(CONFIG_PREEMPT_RCU)
+#ifndef CONFIG_PREEMPT_RCU
 static inline void rcu_preempt_sleep_check(void)
 {
 	RCU_LOCKDEP_WARN(lock_is_held(&rcu_lock_map),
 			 "Illegal context switch in RCU read-side critical section");
 }
-#else /* #ifdef CONFIG_PROVE_RCU */
+#else // #ifndef CONFIG_PREEMPT_RCU
 static inline void rcu_preempt_sleep_check(void) { }
-#endif /* #else #ifdef CONFIG_PROVE_RCU */
+#endif // #else // #ifndef CONFIG_PREEMPT_RCU
 
 #define rcu_sleep_check()						\
 	do {								\
@@ -809,9 +809,9 @@ static inline void rcu_read_unlock(void)
 {
 	RCU_LOCKDEP_WARN(!rcu_is_watching(),
 			 "rcu_read_unlock() used illegally while idle");
+	rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */
 	__release(RCU);
 	__rcu_read_unlock();
-	rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */
 }
 
 /**
@@ -1090,6 +1090,18 @@ rcu_head_after_call_rcu(struct rcu_head *rhp, rcu_callback_t f)
 extern int rcu_expedited;
 extern int rcu_normal;
 
-DEFINE_LOCK_GUARD_0(rcu, rcu_read_lock(), rcu_read_unlock())
+DEFINE_LOCK_GUARD_0(rcu,
+	do {
+		rcu_read_lock();
+		/*
+		 * sparse doesn't call the cleanup function,
+		 * so just release immediately and don't track
+		 * the context. We don't need to anyway, since
+		 * the whole point of the guard is to not need
+		 * the explicit unlock.
+		 */
+		__release(RCU);
+	} while (0),
+	rcu_read_unlock())
 
 #endif /* __LINUX_RCUPDATE_H */
diff --git a/include/linux/rcupdate_wait.h b/include/linux/rcupdate_wait.h
index d07f0848802e..303ab9bee155 100644
--- a/include/linux/rcupdate_wait.h
+++ b/include/linux/rcupdate_wait.h
@@ -19,18 +19,18 @@ struct rcu_synchronize {
 };
 void wakeme_after_rcu(struct rcu_head *head);
-void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
+void __wait_rcu_gp(bool checktiny, unsigned int state, int n, call_rcu_func_t *crcu_array,
 		   struct rcu_synchronize *rs_array);
 
-#define _wait_rcu_gp(checktiny, ...) \
-do {									\
-	call_rcu_func_t __crcu_array[] = { __VA_ARGS__ };		\
-	struct rcu_synchronize __rs_array[ARRAY_SIZE(__crcu_array)];	\
-	__wait_rcu_gp(checktiny, ARRAY_SIZE(__crcu_array),		\
-		      __crcu_array, __rs_array);			\
+#define _wait_rcu_gp(checktiny, state, ...) \
+do {										\
+	call_rcu_func_t __crcu_array[] = { __VA_ARGS__ };			\
+	struct rcu_synchronize __rs_array[ARRAY_SIZE(__crcu_array)];		\
+	__wait_rcu_gp(checktiny, state, ARRAY_SIZE(__crcu_array), __crcu_array, __rs_array); \
 } while (0)
 
-#define wait_rcu_gp(...) _wait_rcu_gp(false, __VA_ARGS__)
+#define wait_rcu_gp(...) _wait_rcu_gp(false, TASK_UNINTERRUPTIBLE, __VA_ARGS__)
+#define wait_rcu_gp_state(state, ...) _wait_rcu_gp(false, state, __VA_ARGS__)
 
 /**
  * synchronize_rcu_mult - Wait concurrently for multiple grace periods
@@ -54,7 +54,7 @@ do {										\
  * grace period.
  */
 #define synchronize_rcu_mult(...) \
-	_wait_rcu_gp(IS_ENABLED(CONFIG_TINY_RCU), __VA_ARGS__)
+	_wait_rcu_gp(IS_ENABLED(CONFIG_TINY_RCU), TASK_UNINTERRUPTIBLE, __VA_ARGS__)
 
 static inline void cond_resched_rcu(void)
 {
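[Editor's note: the DEFINE_LOCK_GUARD_0() change above only teaches sparse about the
implicit unlock; callers are unaffected. A minimal usage sketch, not part of the
patch -- struct foo, gp_ptr, and read_a() are illustrative names:

	#include <linux/cleanup.h>
	#include <linux/rcupdate.h>

	struct foo { int a; };
	static struct foo __rcu *gp_ptr;	/* hypothetical RCU-protected pointer */

	static int read_a(void)
	{
		guard(rcu)();			/* rcu_read_lock() here ... */
		struct foo *p = rcu_dereference(gp_ptr);

		return p ? p->a : -1;
	}					/* ... rcu_read_unlock() at scope exit. */

The guard removes the explicit rcu_read_unlock() on every return path, which is
exactly why the cleanup body releases the sparse context up front.]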
diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h
index 447133171d95..4d96bbdb45f0 100644
--- a/include/linux/srcutiny.h
+++ b/include/linux/srcutiny.h
@@ -64,8 +64,10 @@ static inline int __srcu_read_lock(struct srcu_struct *ssp)
 {
 	int idx;
 
+	preempt_disable();  // Needed for PREEMPT_AUTO
 	idx = ((READ_ONCE(ssp->srcu_idx) + 1) & 0x2) >> 1;
 	WRITE_ONCE(ssp->srcu_lock_nesting[idx], READ_ONCE(ssp->srcu_lock_nesting[idx]) + 1);
+	preempt_enable();
 	return idx;
 }
 
diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h
index 2ef9c719772a..31b3e0d3e65f 100644
--- a/include/trace/events/rcu.h
+++ b/include/trace/events/rcu.h
@@ -708,6 +708,33 @@ TRACE_EVENT_RCU(rcu_invoke_kfree_bulk_callback,
 );
 
 /*
+ * Tracepoint for normal synchronize_rcu() states.  The first argument
+ * is the RCU flavor, the second argument is a pointer to the rcu_head,
+ * and the last one is an event.
+ */
+TRACE_EVENT_RCU(rcu_sr_normal,
+
+	TP_PROTO(const char *rcuname, struct rcu_head *rhp, const char *srevent),
+
+	TP_ARGS(rcuname, rhp, srevent),
+
+	TP_STRUCT__entry(
+		__field(const char *, rcuname)
+		__field(void *, rhp)
+		__field(const char *, srevent)
+	),
+
+	TP_fast_assign(
+		__entry->rcuname = rcuname;
+		__entry->rhp = rhp;
+		__entry->srevent = srevent;
+	),
+
+	TP_printk("%s rhp=0x%p event=%s",
+		__entry->rcuname, __entry->rhp, __entry->srevent)
+);
+
+/*
  * Tracepoint for exiting rcu_do_batch after RCU callbacks have been
  * invoked.  The first argument is the name of the RCU flavor,
  * the second argument is number of callbacks actually invoked,
diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig
index bc25f5098a25..4100df44c665 100644
--- a/kernel/bpf/Kconfig
+++ b/kernel/bpf/Kconfig
@@ -28,7 +28,7 @@ config BPF_SYSCALL
 	bool "Enable bpf() system call"
 	select BPF
 	select IRQ_WORK
-	select TASKS_RCU if PREEMPTION
+	select NEED_TASKS_RCU
 	select TASKS_TRACE_RCU
 	select BINARY_PRINTF
 	select NET_SOCK_MSG if NET
diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c
index db7599c59c78..88673a4267eb 100644
--- a/kernel/bpf/trampoline.c
+++ b/kernel/bpf/trampoline.c
@@ -333,7 +333,7 @@ static void bpf_tramp_image_put(struct bpf_tramp_image *im)
 		int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP,
 					     NULL, im->ip_epilogue);
 		WARN_ON(err);
-		if (IS_ENABLED(CONFIG_PREEMPTION))
+		if (IS_ENABLED(CONFIG_TASKS_RCU))
 			call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks);
 		else
 			percpu_ref_kill(&im->pcref);
diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig
index e7d2dd267593..3e079de0f5b4 100644
--- a/kernel/rcu/Kconfig
+++ b/kernel/rcu/Kconfig
@@ -31,7 +31,7 @@ config PREEMPT_RCU
 
 config TINY_RCU
 	bool
-	default y if !PREEMPTION && !SMP
+	default y if !PREEMPT_RCU && !SMP
 	help
 	  This option selects the RCU implementation that is
 	  designed for UP systems from which real-time response
@@ -85,9 +85,13 @@ config FORCE_TASKS_RCU
 	  idle, and user-mode execution as quiescent states.  Not for
 	  manual selection in most cases.
 
-config TASKS_RCU
+config NEED_TASKS_RCU
 	bool
 	default n
+
+config TASKS_RCU
+	bool
+	default NEED_TASKS_RCU && (PREEMPTION || PREEMPT_AUTO)
 	select IRQ_WORK
 
 config FORCE_TASKS_RUDE_RCU
diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c
index c38e5933a5d6..5afd5cf494db 100644
--- a/kernel/rcu/srcutiny.c
+++ b/kernel/rcu/srcutiny.c
@@ -96,9 +96,12 @@ EXPORT_SYMBOL_GPL(cleanup_srcu_struct);
  */
 void __srcu_read_unlock(struct srcu_struct *ssp, int idx)
 {
-	int newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1;
+	int newval;
 
+	preempt_disable();  // Needed for PREEMPT_AUTO
+	newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1;
 	WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval);
+	preempt_enable();
 	if (!newval && READ_ONCE(ssp->srcu_gp_waiting) && in_task())
 		swake_up_one(&ssp->srcu_wq);
 }
@@ -117,8 +120,11 @@ void srcu_drive_gp(struct work_struct *wp)
 	struct srcu_struct *ssp;
 
 	ssp = container_of(wp, struct srcu_struct, srcu_work);
-	if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
+	preempt_disable();  // Needed for PREEMPT_AUTO
+	if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) {
+		preempt_enable();
 		return; /* Already running or nothing to do. */
+	}
 
 	/* Remove recently arrived callbacks and wait for readers. */
 	WRITE_ONCE(ssp->srcu_gp_running, true);
@@ -130,9 +136,12 @@ void srcu_drive_gp(struct work_struct *wp)
 	idx = (ssp->srcu_idx & 0x2) / 2;
 	WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
 	WRITE_ONCE(ssp->srcu_gp_waiting, true);  /* srcu_read_unlock() wakes! */
+	preempt_enable();
 	swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx]));
+	preempt_disable();  // Needed for PREEMPT_AUTO
 	WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */
 	WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1);
+	preempt_enable();
 
 	/* Invoke the callbacks we removed above. */
 	while (lh) {
@@ -150,8 +159,11 @@ void srcu_drive_gp(struct work_struct *wp)
 	 * at interrupt level, but the ->srcu_gp_running checks will
 	 * straighten that out.
 	 */
+	preempt_disable();  // Needed for PREEMPT_AUTO
 	WRITE_ONCE(ssp->srcu_gp_running, false);
-	if (ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)))
+	idx = ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max));
+	preempt_enable();
+	if (idx)
 		schedule_work(&ssp->srcu_work);
 }
 EXPORT_SYMBOL_GPL(srcu_drive_gp);
@@ -160,9 +172,12 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp)
 {
 	unsigned long cookie;
 
+	preempt_disable();  // Needed for PREEMPT_AUTO
 	cookie = get_state_synchronize_srcu(ssp);
-	if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie))
+	if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) {
+		preempt_enable();
 		return;
+	}
 	WRITE_ONCE(ssp->srcu_idx_max, cookie);
 	if (!READ_ONCE(ssp->srcu_gp_running)) {
 		if (likely(srcu_init_done))
@@ -170,6 +185,7 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp)
 		else if (list_empty(&ssp->srcu_work.entry))
 			list_add(&ssp->srcu_work.entry, &srcu_boot_list);
 	}
+	preempt_enable();
 }
 
 /*
@@ -183,11 +199,13 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp,
 	rhp->func = func;
 	rhp->next = NULL;
 
+	preempt_disable();  // Needed for PREEMPT_AUTO
 	local_irq_save(flags);
 	*ssp->srcu_cb_tail = rhp;
 	ssp->srcu_cb_tail = &rhp->next;
 	local_irq_restore(flags);
 	srcu_gp_start_if_needed(ssp);
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(call_srcu);
 
@@ -241,9 +259,12 @@ EXPORT_SYMBOL_GPL(get_state_synchronize_srcu);
  */
 unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp)
 {
-	unsigned long ret = get_state_synchronize_srcu(ssp);
+	unsigned long ret;
 
+	preempt_disable();  // Needed for PREEMPT_AUTO
+	ret = get_state_synchronize_srcu(ssp);
 	srcu_gp_start_if_needed(ssp);
+	preempt_enable();
 	return ret;
 }
 EXPORT_SYMBOL_GPL(start_poll_synchronize_srcu);
diff --git a/kernel/rcu/sync.c b/kernel/rcu/sync.c
index 86df878a2fee..6c2bd9001adc 100644
--- a/kernel/rcu/sync.c
+++ b/kernel/rcu/sync.c
@@ -122,7 +122,7 @@ void rcu_sync_enter(struct rcu_sync *rsp)
 		 * we are called at early boot time but this shouldn't happen.
 		 */
 	}
-	rsp->gp_count++;
+	WRITE_ONCE(rsp->gp_count, rsp->gp_count + 1);
 	spin_unlock_irq(&rsp->rss_lock);
 
 	if (gp_state == GP_IDLE) {
@@ -151,11 +151,15 @@ void rcu_sync_enter(struct rcu_sync *rsp)
  */
 void rcu_sync_exit(struct rcu_sync *rsp)
 {
+	int gpc;
+
 	WARN_ON_ONCE(READ_ONCE(rsp->gp_state) == GP_IDLE);
 	WARN_ON_ONCE(READ_ONCE(rsp->gp_count) == 0);
 
 	spin_lock_irq(&rsp->rss_lock);
-	if (!--rsp->gp_count) {
+	gpc = rsp->gp_count - 1;
+	WRITE_ONCE(rsp->gp_count, gpc);
+	if (!gpc) {
 		if (rsp->gp_state == GP_PASSED) {
 			WRITE_ONCE(rsp->gp_state, GP_EXIT);
 			rcu_sync_call(rsp);
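[Editor's note: the preempt_disable()/preempt_enable() pairs above protect Tiny
SRCU's non-atomic read-modify-write updates, which PREEMPT_AUTO can now preempt
even in kernels built without CONFIG_PREEMPT. Reader-side code is unchanged; a
minimal sketch for context -- my_srcu, my_flag, and read_flag() are illustrative
names, not part of this patch:

	#include <linux/srcu.h>

	DEFINE_STATIC_SRCU(my_srcu);		/* hypothetical SRCU domain */
	static int my_flag;

	static int read_flag(void)
	{
		int idx, val;

		idx = srcu_read_lock(&my_srcu);	 /* __srcu_read_lock() above */
		val = READ_ONCE(my_flag);
		srcu_read_unlock(&my_srcu, idx); /* __srcu_read_unlock() above */
		return val;
	}

Without the new preemption protection, a PREEMPT_AUTO preemption between the
READ_ONCE() and WRITE_ONCE() of ->srcu_lock_nesting[] could lose a count.]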
diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h
index a1af7dadc0f7..e1bf33018e6d 100644
--- a/kernel/rcu/tasks.h
+++ b/kernel/rcu/tasks.h
@@ -74,6 +74,7 @@ struct rcu_tasks_percpu {
 * @holdouts_func: This flavor's holdout-list scan function (optional).
 * @postgp_func: This flavor's post-grace-period function (optional).
 * @call_func: This flavor's call_rcu()-equivalent function.
+ * @wait_state: Task state for synchronous grace-period waits (default TASK_UNINTERRUPTIBLE).
 * @rtpcpu: This flavor's rcu_tasks_percpu structure.
 * @percpu_enqueue_shift: Shift down CPU ID this much when enqueuing callbacks.
 * @percpu_enqueue_lim: Number of per-CPU callback queues in use for enqueuing.
@@ -107,6 +108,7 @@ struct rcu_tasks {
 	holdouts_func_t holdouts_func;
 	postgp_func_t postgp_func;
 	call_rcu_func_t call_func;
+	unsigned int wait_state;
 	struct rcu_tasks_percpu __percpu *rtpcpu;
 	int percpu_enqueue_shift;
 	int percpu_enqueue_lim;
@@ -134,6 +136,7 @@ static struct rcu_tasks rt_name =					\
 	.tasks_gp_mutex = __MUTEX_INITIALIZER(rt_name.tasks_gp_mutex),	\
 	.gp_func = gp,							\
 	.call_func = call,						\
+	.wait_state = TASK_UNINTERRUPTIBLE,				\
 	.rtpcpu = &rt_name ## __percpu,					\
 	.lazy_jiffies = DIV_ROUND_UP(HZ, 4),				\
 	.name = n,							\
@@ -147,7 +150,7 @@ static struct rcu_tasks rt_name =					\
 
 #ifdef CONFIG_TASKS_RCU
 
-/* Report delay in synchronize_srcu() completion in rcu_tasks_postscan(). */
+/* Report delay of scan exiting tasklist in rcu_tasks_postscan(). */
 static void tasks_rcu_exit_srcu_stall(struct timer_list *unused);
 static DEFINE_TIMER(tasks_rcu_exit_srcu_stall_timer, tasks_rcu_exit_srcu_stall);
 #endif
@@ -638,7 +641,7 @@ static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp)
 
 	// If the grace-period kthread is running, use it.
 	if (READ_ONCE(rtp->kthread_ptr)) {
-		wait_rcu_gp(rtp->call_func);
+		wait_rcu_gp_state(rtp->wait_state, rtp->call_func);
 		return;
 	}
 	rcu_tasks_one_gp(rtp, true);
@@ -1160,6 +1163,7 @@ static int __init rcu_spawn_tasks_kthread(void)
 	rcu_tasks.postscan_func = rcu_tasks_postscan;
 	rcu_tasks.holdouts_func = check_all_holdout_tasks;
 	rcu_tasks.postgp_func = rcu_tasks_postgp;
+	rcu_tasks.wait_state = TASK_IDLE;
 	rcu_spawn_tasks_kthread_generic(&rcu_tasks);
 	return 0;
 }
@@ -1206,8 +1210,7 @@ void exit_tasks_rcu_start(void)
 	rtpcp = this_cpu_ptr(rcu_tasks.rtpcpu);
 	t->rcu_tasks_exit_cpu = smp_processor_id();
 	raw_spin_lock_irqsave_rcu_node(rtpcp, flags);
-	if (!rtpcp->rtp_exit_list.next)
-		INIT_LIST_HEAD(&rtpcp->rtp_exit_list);
+	WARN_ON_ONCE(!rtpcp->rtp_exit_list.next);
 	list_add(&t->rcu_tasks_exit_list, &rtpcp->rtp_exit_list);
 	raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags);
 	preempt_enable();
@@ -1471,6 +1474,7 @@ static void rcu_st_need_qs(struct task_struct *t, u8 v)
 /*
  * Do a cmpxchg() on ->trc_reader_special.b.need_qs, allowing for
  * the four-byte operand-size restriction of some platforms.
+ *
  * Returns the old value, which is often ignored.
  */
 u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new)
@@ -1482,7 +1486,14 @@ u8 rcu_trc_cmpxchg_need_qs(struct task_struct *t, u8 old, u8 new)
 	if (trs_old.b.need_qs != old)
 		return trs_old.b.need_qs;
 	trs_new.b.need_qs = new;
-	ret.s = cmpxchg(&t->trc_reader_special.s, trs_old.s, trs_new.s);
+
+	// Although cmpxchg() appears to KCSAN to update all four bytes,
+	// only the .b.need_qs byte actually changes.
+	instrument_atomic_read_write(&t->trc_reader_special.b.need_qs,
+				     sizeof(t->trc_reader_special.b.need_qs));
+	// Avoid false-positive KCSAN failures.
+	ret.s = data_race(cmpxchg(&t->trc_reader_special.s, trs_old.s, trs_new.s));
+
 	return ret.b.need_qs;
 }
 EXPORT_SYMBOL_GPL(rcu_trc_cmpxchg_need_qs);
@@ -2008,7 +2019,7 @@ void show_rcu_tasks_trace_gp_kthread(void)
 {
 	char buf[64];
 
-	sprintf(buf, "N%lu h:%lu/%lu/%lu",
+	snprintf(buf, sizeof(buf), "N%lu h:%lu/%lu/%lu",
 		data_race(n_trc_holdouts),
 		data_race(n_heavy_reader_ofl_updates),
 		data_race(n_heavy_reader_updates),
diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index 705c0d16850a..4402d6f5f857 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -130,9 +130,7 @@ static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused)
 		next = list->next;
 		prefetch(next);
 		debug_rcu_head_unqueue(list);
-		local_bh_disable();
 		rcu_reclaim_tiny(list);
-		local_bh_enable();
 		list = next;
 	}
 }
@@ -155,7 +153,9 @@ void synchronize_rcu(void)
 			 lock_is_held(&rcu_lock_map) ||
 			 lock_is_held(&rcu_sched_lock_map),
 			 "Illegal synchronize_rcu() in RCU read-side critical section");
+	preempt_disable();
 	WRITE_ONCE(rcu_ctrlblk.gp_seq, rcu_ctrlblk.gp_seq + 2);
+	preempt_enable();
 }
 EXPORT_SYMBOL_GPL(synchronize_rcu);
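[Editor's note: the net effect of the ->wait_state plumbing above is that
synchronize_rcu_tasks() now sleeps in TASK_IDLE rather than TASK_UNINTERRUPTIBLE,
so a long Tasks-RCU grace period no longer trips the hung-task detector or
inflates load average. Callers need no change; a hedged sketch, with
free_old_trampoline() and my_tramp as illustrative names:

	#include <linux/rcupdate.h>
	#include <linux/slab.h>

	static void *my_tramp;			/* hypothetical trampoline */

	static void free_old_trampoline(void)
	{
		/* Unpublish my_tramp so no new task can enter it, then
		 * wait (now in TASK_IDLE) for all tasks to leave it. */
		synchronize_rcu_tasks();
		kfree(my_tramp);		/* now safe to free */
	}
]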
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 60e79ed73700..28c7031711a3 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -75,6 +75,7 @@
 #define MODULE_PARAM_PREFIX "rcutree."
 
 /* Data structures. */
+static void rcu_sr_normal_gp_cleanup_work(struct work_struct *);
 
 static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = {
 	.gpwrap = true,
@@ -93,6 +94,8 @@ static struct rcu_state rcu_state = {
 	.exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex),
 	.exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex),
 	.ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED,
+	.srs_cleanup_work = __WORK_INITIALIZER(rcu_state.srs_cleanup_work,
+		rcu_sr_normal_gp_cleanup_work),
 };
 
 /* Dump rcu_node combining tree at boot to verify correct setup. */
@@ -240,8 +243,36 @@ static long rcu_get_n_cbs_cpu(int cpu)
 	return 0;
 }
 
+/**
+ * rcu_softirq_qs - Provide a set of RCU quiescent states in softirq processing
+ *
+ * Mark a quiescent state for RCU, Tasks RCU, and Tasks Trace RCU.
+ * This is a special-purpose function to be used in the softirq
+ * infrastructure and perhaps the occasional long-running softirq
+ * handler.
+ *
+ * Note that from RCU's viewpoint, a call to rcu_softirq_qs() is
+ * equivalent to momentarily completely enabling preemption.  For
+ * example, given this code::
+ *
+ *	local_bh_disable();
+ *	do_something();
+ *	rcu_softirq_qs();  // A
+ *	do_something_else();
+ *	local_bh_enable();  // B
+ *
+ * A call to synchronize_rcu() that began concurrently with the
+ * call to do_something() would be guaranteed to wait only until
+ * execution reached statement A.  Without that rcu_softirq_qs(),
+ * that same synchronize_rcu() would instead be guaranteed to wait
+ * until execution reached statement B.
+ */
 void rcu_softirq_qs(void)
 {
+	RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map) ||
+			 lock_is_held(&rcu_lock_map) ||
+			 lock_is_held(&rcu_sched_lock_map),
+			 "Illegal rcu_softirq_qs() in RCU read-side critical section");
 	rcu_qs();
 	rcu_preempt_deferred_qs(current);
 	rcu_tasks_qs(current, false);
@@ -806,8 +837,8 @@ static int rcu_implicit_dynticks_qs(struct rcu_data *rdp)
 			__func__, rnp1->grplo, rnp1->grphi, rnp1->qsmask, rnp1->qsmaskinit, rnp1->qsmaskinitnext, rnp1->rcu_gp_init_mask);
 		pr_info("%s %d: %c online: %ld(%d) offline: %ld(%d)\n",
 			__func__, rdp->cpu, ".o"[rcu_rdp_cpu_online(rdp)],
-			(long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
-			(long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
+			(long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_state,
+			(long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_state);
 		return 1; /* Break things loose after complaining. */
 	}
 
@@ -1416,6 +1447,305 @@ static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap)
 }
 
 /*
+ * There is a single llist, which is used for handling
+ * synchronize_rcu() users' enqueued rcu_synchronize nodes.
+ * Within this llist, there are two tail pointers:
+ *
+ * wait tail: Tracks the set of nodes, which need to
+ *            wait for the current GP to complete.
+ * done tail: Tracks the set of nodes, for which the grace
+ *            period has elapsed.  These nodes are processed
+ *            as part of the cleanup-work execution by a
+ *            kworker.
+ *
+ * At every grace period init, a new wait node is added
+ * to the llist.  This wait node is used as the wait tail
+ * for this new grace period.  Given that there are a fixed
+ * number of wait nodes, if all wait nodes are in use
+ * (which can happen when kworker callback processing
+ * is delayed) and an additional grace period is requested,
+ * the system is slow in processing callbacks.
+ *
+ * TODO: If slow processing is detected, the first node
+ * in the llist should be used as the wait tail for this
+ * grace period, so that users who should wait due to the
+ * slow processing are handled by _this_ grace period
+ * and not the next one.
+ *
+ * Below is an illustration of how the done and wait
+ * tail pointers move from one set of rcu_synchronize nodes
+ * to the other, as grace periods start and finish and
+ * nodes are processed by the kworker.
+ *
+ *
+ * a. Initial llist callbacks list:
+ *
+ * +----------+           +--------+          +-------+
+ * |          |           |        |          |       |
+ * |   head   |---------> |   cb2  |--------->| cb1   |
+ * |          |           |        |          |       |
+ * +----------+           +--------+          +-------+
+ *
+ *
+ *
+ * b. New GP1 Start:
+ *
+ *                    WAIT TAIL
+ *                      |
+ *                      |
+ *                      v
+ * +----------+     +--------+      +--------+        +-------+
+ * |          |     |        |      |        |        |       |
+ * |   head   ------> wait   |------>   cb2  |------> |  cb1  |
+ * |          |     | head1  |      |        |        |       |
+ * +----------+     +--------+      +--------+        +-------+
+ *
+ *
+ *
+ * c. GP completion:
+ *
+ * WAIT_TAIL == DONE_TAIL
+ *
+ *                   DONE TAIL
+ *                     |
+ *                     |
+ *                     v
+ * +----------+     +--------+      +--------+        +-------+
+ * |          |     |        |      |        |        |       |
+ * |   head   ------> wait   |------>   cb2  |------> |  cb1  |
+ * |          |     | head1  |      |        |        |       |
+ * +----------+     +--------+      +--------+        +-------+
+ *
+ *
+ *
+ * d. New callbacks and GP2 start:
+ *
+ *                    WAIT TAIL                          DONE TAIL
+ *                      |                                 |
+ *                      |                                 |
+ *                      v                                 v
+ * +----------+     +------+    +------+    +------+    +-----+    +-----+    +-----+
+ * |          |     |      |    |      |    |      |    |     |    |     |    |     |
+ * |   head   ------> wait |--->|  cb4 |--->| cb3  |--->|wait |--->| cb2 |--->| cb1 |
+ * |          |     | head2|    |      |    |      |    |head1|    |     |    |     |
+ * +----------+     +------+    +------+    +------+    +-----+    +-----+    +-----+
+ *
+ *
+ *
+ * e. GP2 completion:
+ *
+ * WAIT_TAIL == DONE_TAIL
+ *                   DONE TAIL
+ *                      |
+ *                      |
+ *                      v
+ * +----------+     +------+    +------+    +------+    +-----+    +-----+    +-----+
+ * |          |     |      |    |      |    |      |    |     |    |     |    |     |
+ * |   head   ------> wait |--->|  cb4 |--->| cb3  |--->|wait |--->| cb2 |--->| cb1 |
+ * |          |     | head2|    |      |    |      |    |head1|    |     |    |     |
+ * +----------+     +------+    +------+    +------+    +-----+    +-----+    +-----+
+ *
+ *
+ * While the llist state transitions from d to e, a kworker
+ * can start executing rcu_sr_normal_gp_cleanup_work() and
+ * can observe either the old done tail (@c) or the new
+ * done tail (@e).  So, done-tail updates and reads need
+ * to use rel-acq semantics.  If the concurrent kworker
+ * observes the old done tail, the newly queued work
+ * execution will process the updated done tail.  If the
+ * concurrent kworker observes the new done tail, then
+ * the newly queued work will skip processing the done
+ * tail, as workqueue semantics guarantee that the new
+ * work is executed only after the previous one completes.
+ *
+ * f. kworker callbacks processing complete:
+ *
+ *
+ *                   DONE TAIL
+ *                     |
+ *                     |
+ *                     v
+ * +----------+     +--------+
+ * |          |     |        |
+ * |   head   ------> wait   |
+ * |          |     | head2  |
+ * +----------+     +--------+
+ *
+ */
+static bool rcu_sr_is_wait_head(struct llist_node *node)
+{
+	return &(rcu_state.srs_wait_nodes)[0].node <= node &&
+		node <= &(rcu_state.srs_wait_nodes)[SR_NORMAL_GP_WAIT_HEAD_MAX - 1].node;
+}
+
+static struct llist_node *rcu_sr_get_wait_head(void)
+{
+	struct sr_wait_node *sr_wn;
+	int i;
+
+	for (i = 0; i < SR_NORMAL_GP_WAIT_HEAD_MAX; i++) {
+		sr_wn = &(rcu_state.srs_wait_nodes)[i];
+
+		if (!atomic_cmpxchg_acquire(&sr_wn->inuse, 0, 1))
+			return &sr_wn->node;
+	}
+
+	return NULL;
+}
+
+static void rcu_sr_put_wait_head(struct llist_node *node)
+{
+	struct sr_wait_node *sr_wn = container_of(node, struct sr_wait_node, node);
+
+	atomic_set_release(&sr_wn->inuse, 0);
+}
+
+/* Disabled by default. */
+static int rcu_normal_wake_from_gp;
+module_param(rcu_normal_wake_from_gp, int, 0644);
+static struct workqueue_struct *sync_wq;
+
+static void rcu_sr_normal_complete(struct llist_node *node)
+{
+	struct rcu_synchronize *rs = container_of(
+		(struct rcu_head *) node, struct rcu_synchronize, head);
+	unsigned long oldstate = (unsigned long) rs->head.func;
+
+	WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) &&
+		!poll_state_synchronize_rcu(oldstate),
+		"A full grace period is not passed yet: %lu",
+		rcu_seq_diff(get_state_synchronize_rcu(), oldstate));
+
+	/* Finally. */
+	complete(&rs->completion);
+}
+
+static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work)
+{
+	struct llist_node *done, *rcu, *next, *head;
+
+	/*
+	 * This work can potentially execute while a new done
+	 * tail is being updated by the grace-period kthread
+	 * in rcu_sr_normal_gp_cleanup().  So, reads and updates
+	 * of the done tail need to follow acq-rel semantics.
+	 *
+	 * Given that wq semantics guarantee that a single work
+	 * item cannot be executed concurrently by multiple
+	 * kworkers, the done-tail list manipulations are
+	 * protected here.
+	 */
+	done = smp_load_acquire(&rcu_state.srs_done_tail);
+	if (!done)
+		return;
+
+	WARN_ON_ONCE(!rcu_sr_is_wait_head(done));
+	head = done->next;
+	done->next = NULL;
+
+	/*
+	 * The dummy node, which is pointed to by the done
+	 * tail that is acq-read above, is not removed here.
+	 * This allows lockless additions of new rcu_synchronize
+	 * nodes in rcu_sr_normal_add_req() while the cleanup
+	 * work executes.  The dummy node is removed in the
+	 * next round of cleanup-work execution.
+	 */
+	llist_for_each_safe(rcu, next, head) {
+		if (!rcu_sr_is_wait_head(rcu)) {
+			rcu_sr_normal_complete(rcu);
+			continue;
+		}
+
+		rcu_sr_put_wait_head(rcu);
+	}
+}
+
+/*
+ * Helper function for rcu_gp_cleanup().
+ */
+static void rcu_sr_normal_gp_cleanup(void)
+{
+	struct llist_node *wait_tail, *next, *rcu;
+	int done = 0;
+
+	wait_tail = rcu_state.srs_wait_tail;
+	if (wait_tail == NULL)
+		return;
+
+	rcu_state.srs_wait_tail = NULL;
+	ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_wait_tail);
+	WARN_ON_ONCE(!rcu_sr_is_wait_head(wait_tail));
+
+	/*
+	 * Process (a) and (d) cases.  See the illustration above.
+	 */
+	llist_for_each_safe(rcu, next, wait_tail->next) {
+		if (rcu_sr_is_wait_head(rcu))
+			break;
+
+		rcu_sr_normal_complete(rcu);
+		// It can be last, so update the next pointer on this step.
+		wait_tail->next = next;
+
+		if (++done == SR_MAX_USERS_WAKE_FROM_GP)
+			break;
+	}
+
+	// Concurrent sr_normal_gp_cleanup work might observe this update.
+	smp_store_release(&rcu_state.srs_done_tail, wait_tail);
+	ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_done_tail);
+
+	/*
+	 * We schedule a work item in order to perform final processing
+	 * of outstanding users (if any are left) and to release the
+	 * wait-heads added by the rcu_sr_normal_gp_init() call.
+	 */
+	queue_work(sync_wq, &rcu_state.srs_cleanup_work);
+}
+
+/*
+ * Helper function for rcu_gp_init().
+ */
+static bool rcu_sr_normal_gp_init(void)
+{
+	struct llist_node *first;
+	struct llist_node *wait_head;
+	bool start_new_poll = false;
+
+	first = READ_ONCE(rcu_state.srs_next.first);
+	if (!first || rcu_sr_is_wait_head(first))
+		return start_new_poll;
+
+	wait_head = rcu_sr_get_wait_head();
+	if (!wait_head) {
+		// Kick another GP to retry.
+		start_new_poll = true;
+		return start_new_poll;
+	}
+
+	/* Inject a wait-dummy-node. */
+	llist_add(wait_head, &rcu_state.srs_next);
+
+	/*
+	 * The waiting list of rcu_synchronize nodes should be empty at
+	 * this step, since the GP kthread, rcu_gp_init() -> gp_cleanup(),
+	 * rolls it over.  If not, it is a BUG; warn the user.
+	 */
+	WARN_ON_ONCE(rcu_state.srs_wait_tail != NULL);
+	rcu_state.srs_wait_tail = wait_head;
+	ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_wait_tail);
+
+	return start_new_poll;
+}
+
+static void rcu_sr_normal_add_req(struct rcu_synchronize *rs)
+{
+	llist_add((struct llist_node *) &rs->head, &rcu_state.srs_next);
+}
+
+/*
 * Initialize a new grace period.  Return false if no grace period required.
 */
 static noinline_for_stack bool rcu_gp_init(void)
@@ -1425,10 +1755,11 @@ static noinline_for_stack bool rcu_gp_init(void)
 	unsigned long mask;
 	struct rcu_data *rdp;
 	struct rcu_node *rnp = rcu_get_root();
+	bool start_new_poll;
 
 	WRITE_ONCE(rcu_state.gp_activity, jiffies);
 	raw_spin_lock_irq_rcu_node(rnp);
-	if (!READ_ONCE(rcu_state.gp_flags)) {
+	if (!rcu_state.gp_flags) {
 		/* Spurious wakeup, tell caller to go back to sleep.  */
 		raw_spin_unlock_irq_rcu_node(rnp);
 		return false;
@@ -1449,11 +1780,25 @@ static noinline_for_stack bool rcu_gp_init(void)
 	/* Record GP times before starting GP, hence rcu_seq_start(). */
 	rcu_seq_start(&rcu_state.gp_seq);
 	ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq);
+	start_new_poll = rcu_sr_normal_gp_init();
 	trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start"));
 	rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap);
 	raw_spin_unlock_irq_rcu_node(rnp);
 
 	/*
+	 * The "start_new_poll" is set to true only when this GP is not able
+	 * to handle anything and there are outstanding users.  This happens
+	 * when the rcu_sr_normal_gp_init() function was not able to insert
+	 * a dummy separator into the llist, because no dummy nodes were left.
+	 *
+	 * The number of dummy nodes is fixed; we may run out of them, in
+	 * which case we start a new poll request to retry.  This is rare
+	 * and means that the system is slow in processing callbacks.
+	 */
+	if (start_new_poll)
+		(void) start_poll_synchronize_rcu();
+
+	/*
 	 * Apply per-leaf buffered online and offline operations to
 	 * the rcu_node tree. Note that this new grace period need not
 	 * wait for subsequent online CPUs, and that RCU hooks in the CPU
@@ -1613,8 +1958,7 @@ static void rcu_gp_fqs(bool first_time)
 	/* Clear flag to prevent immediate re-entry. */
 	if (READ_ONCE(rcu_state.gp_flags) & RCU_GP_FLAG_FQS) {
 		raw_spin_lock_irq_rcu_node(rnp);
-		WRITE_ONCE(rcu_state.gp_flags,
-			   READ_ONCE(rcu_state.gp_flags) & ~RCU_GP_FLAG_FQS);
+		WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags & ~RCU_GP_FLAG_FQS);
 		raw_spin_unlock_irq_rcu_node(rnp);
 	}
 }
@@ -1818,6 +2162,9 @@ static noinline void rcu_gp_cleanup(void)
 	}
 	raw_spin_unlock_irq_rcu_node(rnp);
 
+	// Make synchronize_rcu() users aware of the end of old grace period.
+	rcu_sr_normal_gp_cleanup();
+
 	// If strict, make all CPUs aware of the end of the old grace period.
 	if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD))
 		on_each_cpu(rcu_strict_gp_boundary, NULL, 0);
@@ -1875,8 +2222,7 @@ static void rcu_report_qs_rsp(unsigned long flags)
 {
 	raw_lockdep_assert_held_rcu_node(rcu_get_root());
 	WARN_ON_ONCE(!rcu_gp_in_progress());
-	WRITE_ONCE(rcu_state.gp_flags,
-		   READ_ONCE(rcu_state.gp_flags) | RCU_GP_FLAG_FQS);
+	WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_FQS);
 	raw_spin_unlock_irqrestore_rcu_node(rcu_get_root(), flags);
 	rcu_gp_kthread_wake();
 }
@@ -2391,8 +2737,7 @@ void rcu_force_quiescent_state(void)
 		raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
 		return;  /* Someone beat us to it. */
 	}
-	WRITE_ONCE(rcu_state.gp_flags,
-		   READ_ONCE(rcu_state.gp_flags) | RCU_GP_FLAG_FQS);
+	WRITE_ONCE(rcu_state.gp_flags, rcu_state.gp_flags | RCU_GP_FLAG_FQS);
 	raw_spin_unlock_irqrestore_rcu_node(rnp_old, flags);
 	rcu_gp_kthread_wake();
 }
@@ -3552,6 +3897,43 @@ static int rcu_blocking_is_gp(void)
 	return true;
 }
 
+/*
+ * Helper function for the synchronize_rcu() API.
+ */
+static void synchronize_rcu_normal(void)
+{
+	struct rcu_synchronize rs;
+
+	trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("request"));
+
+	if (!READ_ONCE(rcu_normal_wake_from_gp)) {
+		wait_rcu_gp(call_rcu_hurry);
+		goto trace_complete_out;
+	}
+
+	init_rcu_head_on_stack(&rs.head);
+	init_completion(&rs.completion);
+
+	/*
+	 * This code might be preempted, therefore take a GP
+	 * snapshot before adding a request.
+	 */
+	if (IS_ENABLED(CONFIG_PROVE_RCU))
+		rs.head.func = (void *) get_state_synchronize_rcu();
+
+	rcu_sr_normal_add_req(&rs);
+
+	/* Kick a GP and start waiting. */
+	(void) start_poll_synchronize_rcu();
+
+	/* Now we can wait. */
+	wait_for_completion(&rs.completion);
+	destroy_rcu_head_on_stack(&rs.head);
+
+trace_complete_out:
+	trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("complete"));
+}
+
 /**
 * synchronize_rcu - wait until a grace period has elapsed.
 *
@@ -3603,7 +3985,7 @@ void synchronize_rcu(void)
 		if (rcu_gp_is_expedited())
 			synchronize_rcu_expedited();
 		else
-			wait_rcu_gp(call_rcu_hurry);
+			synchronize_rcu_normal();
 		return;
 	}
 
@@ -4296,7 +4678,7 @@ EXPORT_SYMBOL_GPL(rcu_lockdep_current_cpu_online);
 // whether spinlocks may be acquired safely.
 static bool rcu_init_invoked(void)
 {
-	return !!rcu_state.n_online_cpus;
+	return !!READ_ONCE(rcu_state.n_online_cpus);
 }
 
 /*
@@ -4388,9 +4770,9 @@ rcu_boot_init_percpu_data(int cpu)
 	WARN_ON_ONCE(rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu)));
 	rdp->barrier_seq_snap = rcu_state.barrier_sequence;
 	rdp->rcu_ofl_gp_seq = rcu_state.gp_seq;
-	rdp->rcu_ofl_gp_flags = RCU_GP_CLEANED;
+	rdp->rcu_ofl_gp_state = RCU_GP_CLEANED;
 	rdp->rcu_onl_gp_seq = rcu_state.gp_seq;
-	rdp->rcu_onl_gp_flags = RCU_GP_CLEANED;
+	rdp->rcu_onl_gp_state = RCU_GP_CLEANED;
 	rdp->last_sched_clock = jiffies;
 	rdp->cpu = cpu;
 	rcu_boot_init_nocb_percpu_data(rdp);
@@ -4506,6 +4888,7 @@ int rcutree_prepare_cpu(unsigned int cpu)
 	raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
 	rcu_spawn_rnp_kthreads(rnp);
 	rcu_spawn_cpu_nocb_kthread(cpu);
+	ASSERT_EXCLUSIVE_WRITER(rcu_state.n_online_cpus);
 	WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus + 1);
 
 	return 0;
@@ -4649,7 +5032,7 @@ void rcutree_report_cpu_starting(unsigned int cpu)
 	ASSERT_EXCLUSIVE_WRITER(rcu_state.ncpus);
 	rcu_gpnum_ovf(rnp, rdp); /* Offline-induced counter wrap? */
 	rdp->rcu_onl_gp_seq = READ_ONCE(rcu_state.gp_seq);
-	rdp->rcu_onl_gp_flags = READ_ONCE(rcu_state.gp_flags);
+	rdp->rcu_onl_gp_state = READ_ONCE(rcu_state.gp_state);
 
 	/* An incoming CPU should never be blocking a grace period. */
 	if (WARN_ON_ONCE(rnp->qsmask & mask)) { /* RCU waiting on incoming CPU? */
@@ -4700,7 +5083,7 @@ void rcutree_report_cpu_dead(void)
 	arch_spin_lock(&rcu_state.ofl_lock);
 	raw_spin_lock_irqsave_rcu_node(rnp, flags); /* Enforce GP memory-order guarantee. */
 	rdp->rcu_ofl_gp_seq = READ_ONCE(rcu_state.gp_seq);
-	rdp->rcu_ofl_gp_flags = READ_ONCE(rcu_state.gp_flags);
+	rdp->rcu_ofl_gp_state = READ_ONCE(rcu_state.gp_state);
 	if (rnp->qsmask & mask) { /* RCU waiting on outgoing CPU? */
 		/* Report quiescent state -before- changing ->qsmaskinitnext! */
 		rcu_disable_urgency_upon_qs(rdp);
@@ -4774,6 +5157,7 @@ void rcutree_migrate_callbacks(int cpu)
 */
 int rcutree_dead_cpu(unsigned int cpu)
 {
+	ASSERT_EXCLUSIVE_WRITER(rcu_state.n_online_cpus);
 	WRITE_ONCE(rcu_state.n_online_cpus, rcu_state.n_online_cpus - 1);
 	// Stop-machine done, so allow nohz_full to disable tick.
 	tick_dep_clear(TICK_DEP_BIT_RCU);
@@ -5222,6 +5606,9 @@ void __init rcu_init(void)
 	rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0);
 	WARN_ON(!rcu_gp_wq);
 
+	sync_wq = alloc_workqueue("sync_wq", WQ_MEM_RECLAIM, 0);
+	WARN_ON(!sync_wq);
+
 	/* Fill in default value for rcutree.qovld boot parameter. */
 	/* -After- the rcu_node ->lock fields are initialized! */
 	if (qovld < 0)
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index df48160b3136..bae7925c497f 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -273,9 +273,9 @@ struct rcu_data {
 	bool rcu_iw_pending;		/* Is ->rcu_iw pending? */
 	unsigned long rcu_iw_gp_seq;	/* ->gp_seq associated with ->rcu_iw. */
 	unsigned long rcu_ofl_gp_seq;	/* ->gp_seq at last offline. */
-	short rcu_ofl_gp_flags;		/* ->gp_flags at last offline. */
+	short rcu_ofl_gp_state;		/* ->gp_state at last offline. */
 	unsigned long rcu_onl_gp_seq;	/* ->gp_seq at last online. */
-	short rcu_onl_gp_flags;		/* ->gp_flags at last online. */
+	short rcu_onl_gp_state;		/* ->gp_state at last online. */
 	unsigned long last_fqs_resched;	/* Time of last rcu_resched(). */
 	unsigned long last_sched_clock;	/* Jiffies of last rcu_sched_clock_irq(). */
 	struct rcu_snap_record snap_record; /* Snapshot of core stats at half of */
@@ -316,6 +316,19 @@ do {									\
 } while (0)
 
 /*
+ * A max threshold for synchronize_rcu() users that are awakened
+ * directly by the rcu_gp_kthread().  The remainder are deferred
+ * to the main worker.
+ */
+#define SR_MAX_USERS_WAKE_FROM_GP 5
+#define SR_NORMAL_GP_WAIT_HEAD_MAX 5
+
+struct sr_wait_node {
+	atomic_t inuse;
+	struct llist_node node;
+};
+
+/*
 * RCU global state, including node hierarchy.  This hierarchy is
 * represented in "heap" form in a dense array.  The root (first level)
 * of the hierarchy is in ->node[0] (referenced by ->level[0]), the second
@@ -400,6 +413,13 @@ struct rcu_state {
 						/* Synchronize offline with */
 						/*  GP pre-initialization. */
 	int nocb_is_setup;			/* nocb is setup from boot */
+
+	/* synchronize_rcu() part. */
+	struct llist_head srs_next;	/* request a GP users. */
+	struct llist_node *srs_wait_tail; /* wait for GP users. */
+	struct llist_node *srs_done_tail; /* ready for GP users. */
+	struct sr_wait_node srs_wait_nodes[SR_NORMAL_GP_WAIT_HEAD_MAX];
+	struct work_struct srs_cleanup_work;
 };
 
 /* Values for rcu_state structure's gp_flags field. */
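[Editor's note: taken together, the tree.c and tree.h changes give each
synchronize_rcu() caller the following life cycle when
rcutree.rcu_normal_wake_from_gp=1. This is a condensed sketch mirroring
synchronize_rcu_normal() above, not a new API:

	struct rcu_synchronize rs;

	init_rcu_head_on_stack(&rs.head);
	init_completion(&rs.completion);
	rcu_sr_normal_add_req(&rs);	     /* llist_add() onto rcu_state.srs_next */
	(void) start_poll_synchronize_rcu(); /* make sure a GP is in flight */
	wait_for_completion(&rs.completion); /* the GP kthread directly wakes up to
					      * SR_MAX_USERS_WAKE_FROM_GP waiters;
					      * the rest are woken from sync_wq */
	destroy_rcu_head_on_stack(&rs.head);

The stack-allocated rcu_synchronize node rides the srs_next llist instead of a
call_rcu() callback list, which is why this path does not interact with regular
callback processing.]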
diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
index 6b83537480b1..8a1d9c8bd9f7 100644
--- a/kernel/rcu/tree_exp.h
+++ b/kernel/rcu/tree_exp.h
@@ -930,7 +930,7 @@ void synchronize_rcu_expedited(void)
 
 	/* If expedited grace periods are prohibited, fall back to normal. */
 	if (rcu_gp_is_normal()) {
-		wait_rcu_gp(call_rcu_hurry);
+		synchronize_rcu_normal();
 		return;
 	}
 
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 36a8b5dbf5b5..340bbefe5f65 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -805,8 +805,8 @@ dump_blkd_tasks(struct rcu_node *rnp, int ncheck)
 		rdp = per_cpu_ptr(&rcu_data, cpu);
 		pr_info("\t%d: %c online: %ld(%d) offline: %ld(%d)\n",
 			cpu, ".o"[rcu_rdp_cpu_online(rdp)],
-			(long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_flags,
-			(long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_flags);
+			(long)rdp->rcu_onl_gp_seq, rdp->rcu_onl_gp_state,
+			(long)rdp->rcu_ofl_gp_seq, rdp->rcu_ofl_gp_state);
 	}
 }
 
diff --git a/kernel/rcu/tree_stall.h b/kernel/rcu/tree_stall.h
index 5d666428546b..460efecd077b 100644
--- a/kernel/rcu/tree_stall.h
+++ b/kernel/rcu/tree_stall.h
@@ -504,7 +504,8 @@ static void print_cpu_stall_info(int cpu)
 			rcu_dynticks_in_eqs(rcu_dynticks_snap(cpu));
 	rcuc_starved = rcu_is_rcuc_kthread_starving(rdp, &j);
 	if (rcuc_starved)
-		sprintf(buf, " rcuc=%ld jiffies(starved)", j);
+		// Print signed value, as negative values indicate a probable bug.
+		snprintf(buf, sizeof(buf), " rcuc=%ld jiffies(starved)", j);
 	pr_err("\t%d-%c%c%c%c: (%lu %s) idle=%04x/%ld/%#lx softirq=%u/%u fqs=%ld%s%s\n",
 	       cpu,
 	       "O."[!!cpu_online(cpu)],
@@ -579,7 +580,7 @@ static void rcu_check_gp_kthread_expired_fqs_timer(void)
 		pr_err("%s kthread timer wakeup didn't happen for %ld jiffies! g%ld f%#x %s(%d) ->state=%#x\n",
 		       rcu_state.name, (jiffies - jiffies_fqs),
 		       (long)rcu_seq_current(&rcu_state.gp_seq),
-		       data_race(rcu_state.gp_flags),
+		       data_race(READ_ONCE(rcu_state.gp_flags)), // Diagnostic read
 		       gp_state_getname(RCU_GP_WAIT_FQS), RCU_GP_WAIT_FQS,
 		       data_race(READ_ONCE(gpk->__state)));
 		pr_err("\tPossible timer handling issue on cpu=%d timer-softirq=%u\n",
@@ -628,7 +629,8 @@ static void print_other_cpu_stall(unsigned long gp_seq, unsigned long gps)
 			totqlen += rcu_get_n_cbs_cpu(cpu);
 	pr_err("\t(detected by %d, t=%ld jiffies, g=%ld, q=%lu ncpus=%d)\n",
 	       smp_processor_id(), (long)(jiffies - gps),
-	       (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
+	       (long)rcu_seq_current(&rcu_state.gp_seq), totqlen,
+	       data_race(rcu_state.n_online_cpus)); // Diagnostic read
 	if (ndetected) {
 		rcu_dump_cpu_stacks();
 
@@ -689,7 +691,8 @@ static void print_cpu_stall(unsigned long gps)
 		totqlen += rcu_get_n_cbs_cpu(cpu);
 	pr_err("\t(t=%lu jiffies g=%ld q=%lu ncpus=%d)\n",
 	       jiffies - gps,
-	       (long)rcu_seq_current(&rcu_state.gp_seq), totqlen, rcu_state.n_online_cpus);
+	       (long)rcu_seq_current(&rcu_state.gp_seq), totqlen,
+	       data_race(rcu_state.n_online_cpus)); // Diagnostic read
 
 	rcu_check_gp_kthread_expired_fqs_timer();
 	rcu_check_gp_kthread_starvation();
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 46aaaa9fe339..f8436969e0c8 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -408,7 +408,7 @@ void wakeme_after_rcu(struct rcu_head *head)
 }
 EXPORT_SYMBOL_GPL(wakeme_after_rcu);
 
-void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
+void __wait_rcu_gp(bool checktiny, unsigned int state, int n, call_rcu_func_t *crcu_array,
 		   struct rcu_synchronize *rs_array)
 {
 	int i;
@@ -440,7 +440,7 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array,
 			if (crcu_array[j] == crcu_array[i])
 				break;
 		if (j == i) {
-			wait_for_completion(&rs_array[i].completion);
+			wait_for_completion_state(&rs_array[i].completion, state);
 			destroy_rcu_head_on_stack(&rs_array[i].head);
 		}
 	}
diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig
index 61c541c36596..6cdc5ff919b0 100644
--- a/kernel/trace/Kconfig
+++ b/kernel/trace/Kconfig
@@ -163,7 +163,7 @@ config TRACING
 	select BINARY_PRINTF
 	select EVENT_TRACING
 	select TRACE_CLOCK
-	select TASKS_RCU if PREEMPTION
+	select NEED_TASKS_RCU
 
 config GENERIC_TRACER
 	bool
@@ -204,7 +204,7 @@ config FUNCTION_TRACER
 	select GENERIC_TRACER
 	select CONTEXT_SWITCH_TRACER
 	select GLOB
-	select TASKS_RCU if PREEMPTION
+	select NEED_TASKS_RCU
 	select TASKS_RUDE_RCU
 	help
 	  Enable the kernel to trace every kernel function. This is done
diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c
index da1710499698..6c96b30f3d63 100644
--- a/kernel/trace/ftrace.c
+++ b/kernel/trace/ftrace.c
@@ -3157,8 +3157,7 @@ out:
 	 * synchronize_rcu_tasks() will wait for those tasks to
 	 * execute and either schedule voluntarily or enter user space.
 	 */
-	if (IS_ENABLED(CONFIG_PREEMPTION))
-		synchronize_rcu_tasks();
+	synchronize_rcu_tasks();
 
 	ftrace_trampoline_free(ops);
 }
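[Editor's note: the common thread in the Kconfig and ftrace.c changes above is
that subsystems now select NEED_TASKS_RCU and call synchronize_rcu_tasks()
unconditionally. When TASKS_RCU ends up unset (no preemption of any kind),
rcupdate.h maps synchronize_rcu_tasks() to synchronize_rcu(), which suffices
because tasks cannot then be preempted in the middle of a trampoline. A hedged
sketch of the caller-side pattern -- free_trampoline_safely() is an illustrative
name; ftrace_trampoline_free() is the static helper from the hunk above:

	#include <linux/ftrace.h>
	#include <linux/rcupdate.h>

	static void free_trampoline_safely(struct ftrace_ops *ops)
	{
		/*
		 * With CONFIG_TASKS_RCU this waits for every task to
		 * voluntarily schedule or enter userspace; without it,
		 * synchronize_rcu_tasks() is #defined to synchronize_rcu().
		 */
		synchronize_rcu_tasks();
		ftrace_trampoline_free(ops);
	}
]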