From fe09d2e314e5501e4673018270e6e1ed92e6e446 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 31 Jan 2024 04:15:42 -0800 Subject: [PATCH 01/32] scftorture: Increase memory provided to guest OS The tradition, extending back almost a full year, has been 2GB plus an additional number of GBs equal to the number of CPUs divided by sixteen. This tradition has served scftorture well, even the CONFIG_PREEMPT=y version running KASAN within guest OSes having 40 CPUs. However, this test recently started OOMing on larger systems, and this commit therefore gives this test an additional GB of memory. It is quite possible that further testing on larger systems will show a need to decrease the divisor from 16 to (say) 8, but that is a change to make once it has been demonstrated to be required. Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- tools/testing/selftests/rcutorture/bin/torture.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index bbac5f4b03d0..42f0aee09e51 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -425,7 +425,7 @@ fi if test "$do_scftorture" = "yes" then # Scale memory based on the number of CPUs. - scfmem=$((2+HALF_ALLOTED_CPUS/16)) + scfmem=$((3+HALF_ALLOTED_CPUS/16)) torture_bootargs="scftorture.nthreads=$HALF_ALLOTED_CPUS torture.disable_onoff_at_boot csdlock_debug=1" torture_set "scftorture" tools/testing/selftests/rcutorture/bin/kvm.sh --torture scf --allcpus --duration "$duration_scftorture" --configs "$configs_scftorture" --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory ${scfmem}G --trust-make fi From 9e97ea7796412e23b8ecc946fd0488a2de39ea76 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 9 Feb 2024 03:11:25 -0800 Subject: [PATCH 02/32] rcutorture: Disable tracing to permit Tasks Rude RCU testing Now that the KPROBES, TRACING, BLK_DEV_IO_TRACE, and UPROBE_EVENTS Kconfig options select the TASKS_TRACE_RCU option, the torture.sh tests of enabling exactly one of the RCU Tasks flavors fail. This commit therefore disables these options to allow this testing to succeed. Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- tools/testing/selftests/rcutorture/bin/torture.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index 42f0aee09e51..13875ee7b050 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -391,7 +391,7 @@ __EOF__ forceflavor="`echo $flavor | sed -e 's/^CONFIG/CONFIG_FORCE/'`" deselectedflavors="`grep -v $flavor $T/rcutasksflavors | tr '\012' ' ' | tr -s ' ' | sed -e 's/ *$//'`" echo " --- Running RCU Tasks Trace flavor $flavor `date`" >> $rtfdir/log - tools/testing/selftests/rcutorture/bin/kvm.sh --datestamp "$ds/results-rcutasksflavors/$flavor" --buildonly --configs "TINY01 TREE04" --kconfig "CONFIG_RCU_EXPERT=y CONFIG_RCU_SCALE_TEST=y $forceflavor=y $deselectedflavors" --trust-make > $T/$flavor.out 2>&1 + tools/testing/selftests/rcutorture/bin/kvm.sh --datestamp "$ds/results-rcutasksflavors/$flavor" --buildonly --configs "TINY01 TREE04" --kconfig "CONFIG_RCU_EXPERT=y CONFIG_RCU_SCALE_TEST=y CONFIG_KPROBES=n CONFIG_RCU_TRACE=n CONFIG_TRACING=n CONFIG_BLK_DEV_IO_TRACE=n CONFIG_UPROBE_EVENTS=n $forceflavor=y $deselectedflavors" --trust-make > $T/$flavor.out 2>&1 retcode=$? if test "$retcode" -ne 0 then From 1a140b46da8f3be8e55fbd0950653c9a235b6ce9 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sat, 17 Feb 2024 17:57:35 -0800 Subject: [PATCH 03/32] rcutorture: Enable RCU priority boosting for TREE09 The TREE09 rcutorture scenario exhausts memory from time to time, and this is due to a reader being preempted and blocking grace periods, thus preventing recycling of the memory used in callback-flooding tests. This commit therefore enables RCU priority boosting and sets the boosting delay to 100 milliseconds after grace-period start. Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- tools/testing/selftests/rcutorture/configs/rcu/TREE09 | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tools/testing/selftests/rcutorture/configs/rcu/TREE09 b/tools/testing/selftests/rcutorture/configs/rcu/TREE09 index fc45645bb5f4..9ecd1b4e653d 100644 --- a/tools/testing/selftests/rcutorture/configs/rcu/TREE09 +++ b/tools/testing/selftests/rcutorture/configs/rcu/TREE09 @@ -10,8 +10,9 @@ CONFIG_NO_HZ_FULL=n CONFIG_RCU_TRACE=n CONFIG_RCU_NOCB_CPU=n CONFIG_DEBUG_LOCK_ALLOC=n -CONFIG_RCU_BOOST=n +CONFIG_RCU_BOOST=y +CONFIG_RCU_BOOST_DELAY=100 CONFIG_DEBUG_OBJECTS_RCU_HEAD=n -#CHECK#CONFIG_RCU_EXPERT=n +CONFIG_RCU_EXPERT=y CONFIG_KPROBES=n CONFIG_FTRACE=n From 0a0467af0a4d8b4e3832945d1cc287d1cb52573e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Mon, 19 Feb 2024 17:00:31 -0800 Subject: [PATCH 04/32] rcutorture: Dump # online CPUs on insufficient cb-flood laundering This commit adds the number of online CPUs to the state dump following an unsuccesful callback-flood test. Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 45d6b4c3d199..6611ef3e71c3 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2833,12 +2833,12 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop) && !shutdown_time_arrived()) { WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); - pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld\n", + pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld #online %u\n", __func__, stoppedat - rfp->rcu_fwd_startat, jiffies - stoppedat, n_launders + n_max_cbs - n_launders_cb_snap, n_launders, n_launders_sa, - n_max_gps, n_max_cbs, cver, gps); + n_max_gps, n_max_cbs, cver, gps, num_online_cpus()); atomic_long_add(n_max_cbs, &rcu_fwd_max_cbs); mutex_lock(&rcu_fwd_mutex); // Serialize histograms. rcu_torture_fwd_cb_hist(rfp); From f8039457eedc378fa7a186054ee3032b8a41eaee Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Sun, 18 Feb 2024 09:11:08 -0800 Subject: [PATCH 05/32] rcutorture: Dump GP kthread state on insufficient cb-flood laundering If a callback flood prevents grace period from completing, rcutorture does a WARN_ON(). Avoiding this WARN_ON() currently requires that at least three grace periods elapse during an eight-second callback-flood interval. Unfortunately, the current debug information does not include anything about the grace-period state. This commit therefore adds a call to cur_ops->gp_kthread_dbg(), if this function pointer is non-NULL. Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcutorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 6611ef3e71c3..eff51a26216f 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2832,7 +2832,8 @@ static void rcu_torture_fwd_prog_cr(struct rcu_fwd *rfp) if (!torture_must_stop() && !READ_ONCE(rcu_fwd_emergency_stop) && !shutdown_time_arrived()) { - WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED); + if (WARN_ON(n_max_gps < MIN_FWD_CBS_LAUNDERED) && cur_ops->gp_kthread_dbg) + cur_ops->gp_kthread_dbg(); pr_alert("%s Duration %lu barrier: %lu pending %ld n_launders: %ld n_launders_sa: %ld n_max_gps: %ld n_max_cbs: %ld cver %ld gps %ld #online %u\n", __func__, stoppedat - rfp->rcu_fwd_startat, jiffies - stoppedat, From c507e195016ccca18e375d14cfeb1186dac60b2c Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 6 Mar 2024 17:00:39 -0800 Subject: [PATCH 06/32] rcutorture: ASSERT_EXCLUSIVE_WRITER() for ->rtort_pipe_count updates It turns out that only one CPU at a time will ever invoke rcu_torture_pipe_update_one() on a given rcu_torture structure. This commit therefore adds three ASSERT_EXCLUSIVE_WRITER() calls to enlist KCSAN's aid in checking this. Signed-off-by: Paul E. McKenney Cc: Linus Torvalds Reviewed-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcutorture.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index eff51a26216f..d8c12eba35b7 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -466,6 +466,7 @@ rcu_torture_pipe_update_one(struct rcu_torture *rp) i = RCU_TORTURE_PIPE_LEN; atomic_inc(&rcu_torture_wcount[i]); WRITE_ONCE(rp->rtort_pipe_count, i + 1); + ASSERT_EXCLUSIVE_WRITER(rp->rtort_pipe_count); if (rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { rp->rtort_mbtest = 0; return true; @@ -1399,6 +1400,7 @@ rcu_torture_writer(void *arg) if (rp == NULL) continue; rp->rtort_pipe_count = 0; + ASSERT_EXCLUSIVE_WRITER(rp->rtort_pipe_count); rcu_torture_writer_state = RTWS_DELAY; udelay(torture_random(&rand) & 0x3ff); rcu_torture_writer_state = RTWS_REPLACE; @@ -1414,6 +1416,7 @@ rcu_torture_writer(void *arg) atomic_inc(&rcu_torture_wcount[i]); WRITE_ONCE(old_rp->rtort_pipe_count, old_rp->rtort_pipe_count + 1); + ASSERT_EXCLUSIVE_WRITER(old_rp->rtort_pipe_count); // Make sure readers block polled grace periods. if (cur_ops->get_gp_state && cur_ops->poll_gp_state) { From c342b42fa47f4257fccfeadc8e32c51b1be17a1f Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 23 Feb 2024 15:16:20 -0800 Subject: [PATCH 07/32] rcu-tasks: Make Tasks RCU wait idly for grace-period delays Currently, all waits for grace periods sleep at TASK_UNINTERRUPTIBLE, regardless of RCU flavor. This has worked well, but there have been cases where a longer-than-average Tasks RCU grace period has triggered softlockup splats, many of them, before the Tasks RCU CPU stall warning appears. These softlockup splats unnecessarily consume console bandwidth and complicate diagnosis of the underlying problem. Plus a long but not pathologically long Tasks RCU grace period might trigger a few softlockup splats before completing normally, which generates noise for no good reason. This commit therefore causes Tasks RCU grace periods to sleep at TASK_IDLE priority. If there really is a persistent problem, the eventual Tasks RCU CPU stall warning will flag it, and without the extra noise. Reported-by: Breno Leitao Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- include/linux/rcupdate_wait.h | 18 +++++++++--------- kernel/rcu/tasks.h | 6 +++++- kernel/rcu/update.c | 4 ++-- 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/include/linux/rcupdate_wait.h b/include/linux/rcupdate_wait.h index d07f0848802e..303ab9bee155 100644 --- a/include/linux/rcupdate_wait.h +++ b/include/linux/rcupdate_wait.h @@ -19,18 +19,18 @@ struct rcu_synchronize { }; void wakeme_after_rcu(struct rcu_head *head); -void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, +void __wait_rcu_gp(bool checktiny, unsigned int state, int n, call_rcu_func_t *crcu_array, struct rcu_synchronize *rs_array); -#define _wait_rcu_gp(checktiny, ...) \ -do { \ - call_rcu_func_t __crcu_array[] = { __VA_ARGS__ }; \ - struct rcu_synchronize __rs_array[ARRAY_SIZE(__crcu_array)]; \ - __wait_rcu_gp(checktiny, ARRAY_SIZE(__crcu_array), \ - __crcu_array, __rs_array); \ +#define _wait_rcu_gp(checktiny, state, ...) \ +do { \ + call_rcu_func_t __crcu_array[] = { __VA_ARGS__ }; \ + struct rcu_synchronize __rs_array[ARRAY_SIZE(__crcu_array)]; \ + __wait_rcu_gp(checktiny, state, ARRAY_SIZE(__crcu_array), __crcu_array, __rs_array); \ } while (0) -#define wait_rcu_gp(...) _wait_rcu_gp(false, __VA_ARGS__) +#define wait_rcu_gp(...) _wait_rcu_gp(false, TASK_UNINTERRUPTIBLE, __VA_ARGS__) +#define wait_rcu_gp_state(state, ...) _wait_rcu_gp(false, state, __VA_ARGS__) /** * synchronize_rcu_mult - Wait concurrently for multiple grace periods @@ -54,7 +54,7 @@ do { \ * grace period. */ #define synchronize_rcu_mult(...) \ - _wait_rcu_gp(IS_ENABLED(CONFIG_TINY_RCU), __VA_ARGS__) + _wait_rcu_gp(IS_ENABLED(CONFIG_TINY_RCU), TASK_UNINTERRUPTIBLE, __VA_ARGS__) static inline void cond_resched_rcu(void) { diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 147b5945d67a..82e458ea0728 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -74,6 +74,7 @@ struct rcu_tasks_percpu { * @holdouts_func: This flavor's holdout-list scan function (optional). * @postgp_func: This flavor's post-grace-period function (optional). * @call_func: This flavor's call_rcu()-equivalent function. + * @wait_state: Task state for synchronous grace-period waits (default TASK_UNINTERRUPTIBLE). * @rtpcpu: This flavor's rcu_tasks_percpu structure. * @percpu_enqueue_shift: Shift down CPU ID this much when enqueuing callbacks. * @percpu_enqueue_lim: Number of per-CPU callback queues in use for enqueuing. @@ -107,6 +108,7 @@ struct rcu_tasks { holdouts_func_t holdouts_func; postgp_func_t postgp_func; call_rcu_func_t call_func; + unsigned int wait_state; struct rcu_tasks_percpu __percpu *rtpcpu; int percpu_enqueue_shift; int percpu_enqueue_lim; @@ -134,6 +136,7 @@ static struct rcu_tasks rt_name = \ .tasks_gp_mutex = __MUTEX_INITIALIZER(rt_name.tasks_gp_mutex), \ .gp_func = gp, \ .call_func = call, \ + .wait_state = TASK_UNINTERRUPTIBLE, \ .rtpcpu = &rt_name ## __percpu, \ .lazy_jiffies = DIV_ROUND_UP(HZ, 4), \ .name = n, \ @@ -638,7 +641,7 @@ static void synchronize_rcu_tasks_generic(struct rcu_tasks *rtp) // If the grace-period kthread is running, use it. if (READ_ONCE(rtp->kthread_ptr)) { - wait_rcu_gp(rtp->call_func); + wait_rcu_gp_state(rtp->wait_state, rtp->call_func); return; } rcu_tasks_one_gp(rtp, true); @@ -1160,6 +1163,7 @@ static int __init rcu_spawn_tasks_kthread(void) rcu_tasks.postscan_func = rcu_tasks_postscan; rcu_tasks.holdouts_func = check_all_holdout_tasks; rcu_tasks.postgp_func = rcu_tasks_postgp; + rcu_tasks.wait_state = TASK_IDLE; rcu_spawn_tasks_kthread_generic(&rcu_tasks); return 0; } diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 46aaaa9fe339..f8436969e0c8 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -408,7 +408,7 @@ void wakeme_after_rcu(struct rcu_head *head) } EXPORT_SYMBOL_GPL(wakeme_after_rcu); -void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, +void __wait_rcu_gp(bool checktiny, unsigned int state, int n, call_rcu_func_t *crcu_array, struct rcu_synchronize *rs_array) { int i; @@ -440,7 +440,7 @@ void __wait_rcu_gp(bool checktiny, int n, call_rcu_func_t *crcu_array, if (crcu_array[j] == crcu_array[i]) break; if (j == i) { - wait_for_completion(&rs_array[i].completion); + wait_for_completion_state(&rs_array[i].completion, state); destroy_rcu_head_on_stack(&rs_array[i].head); } } From e7d420afb9d9b7dfc3fcfdcbc9bc60d219055d87 Mon Sep 17 00:00:00 2001 From: Zenghui Yu Date: Fri, 16 Feb 2024 23:44:55 +0800 Subject: [PATCH 08/32] doc: Remove references to arrayRCU.rst arrayRCU.rst has been removed since commit ef2555cf68c3 ("doc: Remove arrayRCU.rst") but is still referenced by whatisRCU.rst. Update it to reflect the current state of the documentation. Signed-off-by: Zenghui Yu Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- Documentation/RCU/whatisRCU.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst index 872ac665223f..94838c65c7d9 100644 --- a/Documentation/RCU/whatisRCU.rst +++ b/Documentation/RCU/whatisRCU.rst @@ -427,7 +427,7 @@ their assorted primitives. This section shows a simple use of the core RCU API to protect a global pointer to a dynamically allocated structure. More-typical -uses of RCU may be found in listRCU.rst, arrayRCU.rst, and NMI-RCU.rst. +uses of RCU may be found in listRCU.rst and NMI-RCU.rst. :: struct foo { @@ -510,8 +510,8 @@ So, to sum up: data item. See checklist.rst for additional rules to follow when using RCU. -And again, more-typical uses of RCU may be found in listRCU.rst, -arrayRCU.rst, and NMI-RCU.rst. +And again, more-typical uses of RCU may be found in listRCU.rst +and NMI-RCU.rst. .. _4_whatisRCU: From 179f4ce102eb62b4b8efbd8371ee7d25c1082467 Mon Sep 17 00:00:00 2001 From: Neeraj Upadhyay Date: Sat, 24 Feb 2024 10:27:30 +0530 Subject: [PATCH 09/32] MAINTAINERS: Update Neeraj's email address Update my email-address in MAINTAINERS and .mailmap entries to my kernel.org account. Signed-off-by: Neeraj Upadhyay Reviewed-by: Joel Fernandes Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- .mailmap | 3 ++- MAINTAINERS | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/.mailmap b/.mailmap index 59c9a841bf71..32e12c26bdda 100644 --- a/.mailmap +++ b/.mailmap @@ -445,7 +445,8 @@ Nadav Amit Nadia Yvette Chambers William Lee Irwin III Naoya Horiguchi Nathan Chancellor -Neeraj Upadhyay +Neeraj Upadhyay +Neeraj Upadhyay Neil Armstrong Nguyen Anh Quynh Nicholas Piggin diff --git a/MAINTAINERS b/MAINTAINERS index 7c121493f43d..0370e571f312 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -18591,7 +18591,7 @@ F: tools/testing/selftests/resctrl/ READ-COPY UPDATE (RCU) M: "Paul E. McKenney" M: Frederic Weisbecker (kernel/rcu/tree_nocb.h) -M: Neeraj Upadhyay (kernel/rcu/tasks.h) +M: Neeraj Upadhyay (kernel/rcu/tasks.h) M: Joel Fernandes M: Josh Triplett M: Boqun Feng From b993115b44d769cb34b394d92658226df81a6c89 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 22 Feb 2024 10:16:17 -0800 Subject: [PATCH 10/32] bpf: Select new NEED_TASKS_RCU Kconfig option Currently, if a Kconfig option depends on TASKS_RCU, it conditionally does "select TASKS_RCU if PREEMPTION". This works, but requires any change in this enablement logic to be replicated across all such "select" clauses. A new NEED_TASKS_RCU Kconfig option has been created to allow this enablement logic to be in one place in kernel/rcu/Kconfig. Therefore, make BPF select the new NEED_TASKS_RCU Kconfig option. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: Song Liu Cc: Yonghong Song Cc: John Fastabend Cc: KP Singh Cc: Stanislav Fomichev Cc: Hao Luo Cc: Jiri Olsa Cc: Cc: Ankur Arora Cc: Thomas Gleixner Cc: Steven Rostedt Acked-by: Mark Rutland Signed-off-by: Uladzislau Rezki (Sony) --- kernel/bpf/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/Kconfig b/kernel/bpf/Kconfig index bc25f5098a25..4100df44c665 100644 --- a/kernel/bpf/Kconfig +++ b/kernel/bpf/Kconfig @@ -28,7 +28,7 @@ config BPF_SYSCALL bool "Enable bpf() system call" select BPF select IRQ_WORK - select TASKS_RCU if PREEMPTION + select NEED_TASKS_RCU select TASKS_TRACE_RCU select BINARY_PRINTF select NET_SOCK_MSG if NET From 900da4d2a57c1a7e63578cb173e16af0ade3cd7b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 22 Feb 2024 10:22:27 -0800 Subject: [PATCH 11/32] arch: Select new NEED_TASKS_RCU Kconfig option Currently, if a Kconfig option depends on TASKS_RCU, it conditionally does "select TASKS_RCU if PREEMPTION". This works, but requires any change in this enablement logic to be replicated across all such "select" clauses. A new NEED_TASKS_RCU Kconfig option has been created to allow this enablement logic to be in one place in kernel/rcu/Kconfig. Therefore, select the new NEED_TASKS_RCU Kconfig option instead of the old TASKS_RCU option. Signed-off-by: Paul E. McKenney Cc: Andrew Morton Cc: Thomas Gleixner Cc: Heiko Carstens Cc: Arnd Bergmann Cc: Douglas Anderson Cc: Ankur Arora Acked-by: Mark Rutland Reviewed-by: Steven Rostedt (Google) Signed-off-by: Uladzislau Rezki (Sony) --- arch/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 9f066785bb71..ae4a4f37bbf0 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -55,7 +55,7 @@ config KPROBES depends on MODULES depends on HAVE_KPROBES select KALLSYMS - select TASKS_RCU if PREEMPTION + select NEED_TASKS_RCU help Kprobes allows you to trap at almost any kernel address and execute a callback function. register_kprobe() establishes @@ -104,7 +104,7 @@ config STATIC_CALL_SELFTEST config OPTPROBES def_bool y depends on KPROBES && HAVE_OPTPROBES - select TASKS_RCU if PREEMPTION + select NEED_TASKS_RCU config KPROBES_ON_FTRACE def_bool y From dfd458a95d78ce31855fe06bbfde4f4fe60c40db Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Fri, 8 Mar 2024 18:34:04 +0100 Subject: [PATCH 12/32] rcu: Add data structures for synchronize_rcu() The synchronize_rcu() call is going to be reworked, thus this patch adds dedicated fields into the rcu_state structure. Reviewed-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/tree.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index df48160b3136..b942b9437438 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -315,6 +315,13 @@ do { \ __set_current_state(TASK_RUNNING); \ } while (0) +#define SR_NORMAL_GP_WAIT_HEAD_MAX 5 + +struct sr_wait_node { + atomic_t inuse; + struct llist_node node; +}; + /* * RCU global state, including node hierarchy. This hierarchy is * represented in "heap" form in a dense array. The root (first level) @@ -400,6 +407,13 @@ struct rcu_state { /* Synchronize offline with */ /* GP pre-initialization. */ int nocb_is_setup; /* nocb is setup from boot */ + + /* synchronize_rcu() part. */ + struct llist_head srs_next; /* request a GP users. */ + struct llist_node *srs_wait_tail; /* wait for GP users. */ + struct llist_node *srs_done_tail; /* ready for GP users. */ + struct sr_wait_node srs_wait_nodes[SR_NORMAL_GP_WAIT_HEAD_MAX]; + struct work_struct srs_cleanup_work; }; /* Values for rcu_state structure's gp_flags field. */ From 02b3c5fcdfe46f9c9dd5d3fc199612b13bf47c06 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 22 Feb 2024 10:25:44 -0800 Subject: [PATCH 13/32] tracing: Select new NEED_TASKS_RCU Kconfig option Currently, if a Kconfig option depends on TASKS_RCU, it conditionally does "select TASKS_RCU if PREEMPTION". This works, but requires any change in this enablement logic to be replicated across all such "select" clauses. A new NEED_TASKS_RCU Kconfig option has been created to allow this enablement logic to be in one place in kernel/rcu/Kconfig. Therefore, select the new NEED_TASKS_RCU Kconfig option instead of the old TASKS_RCU option. Signed-off-by: Paul E. McKenney Cc: Steven Rostedt Cc: Masami Hiramatsu Cc: Mathieu Desnoyers Cc: Cc: Ankur Arora Cc: Thomas Gleixner Acked-by: Mark Rutland Signed-off-by: Uladzislau Rezki (Sony) --- kernel/trace/Kconfig | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/trace/Kconfig b/kernel/trace/Kconfig index 61c541c36596..6cdc5ff919b0 100644 --- a/kernel/trace/Kconfig +++ b/kernel/trace/Kconfig @@ -163,7 +163,7 @@ config TRACING select BINARY_PRINTF select EVENT_TRACING select TRACE_CLOCK - select TASKS_RCU if PREEMPTION + select NEED_TASKS_RCU config GENERIC_TRACER bool @@ -204,7 +204,7 @@ config FUNCTION_TRACER select GENERIC_TRACER select CONTEXT_SWITCH_TRACER select GLOB - select TASKS_RCU if PREEMPTION + select NEED_TASKS_RCU select TASKS_RUDE_RCU help Enable the kernel to trace every kernel function. This is done From 1e52af7f023c44859868306fac1a4b6b556cf47b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 28 Feb 2024 11:17:28 -0800 Subject: [PATCH 14/32] bpf: Choose RCU Tasks based on TASKS_RCU rather than PREEMPTION The advent of CONFIG_PREEMPT_AUTO, AKA lazy preemption, will mean that even kernels built with CONFIG_PREEMPT_NONE or CONFIG_PREEMPT_VOLUNTARY might see the occasional preemption, and that this preemption just might happen within a trampoline. Therefore, update bpf_tramp_image_put() to choose call_rcu_tasks() based on CONFIG_TASKS_RCU instead of CONFIG_PREEMPTION. This change might enable further simplifications, but the goal of this effort is to make the code safe, not necessarily optimal. Signed-off-by: Paul E. McKenney Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: John Fastabend Cc: Andrii Nakryiko Cc: Martin KaFai Lau Cc: Song Liu Cc: Yonghong Song Cc: KP Singh Cc: Stanislav Fomichev Cc: Hao Luo Cc: Jiri Olsa Cc: Ankur Arora Cc: Thomas Gleixner Cc: Signed-off-by: Uladzislau Rezki (Sony) --- kernel/bpf/trampoline.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/bpf/trampoline.c b/kernel/bpf/trampoline.c index db7599c59c78..88673a4267eb 100644 --- a/kernel/bpf/trampoline.c +++ b/kernel/bpf/trampoline.c @@ -333,7 +333,7 @@ static void bpf_tramp_image_put(struct bpf_tramp_image *im) int err = bpf_arch_text_poke(im->ip_after_call, BPF_MOD_JUMP, NULL, im->ip_epilogue); WARN_ON(err); - if (IS_ENABLED(CONFIG_PREEMPTION)) + if (IS_ENABLED(CONFIG_TASKS_RCU)) call_rcu_tasks(&im->rcu, __bpf_tramp_image_put_rcu_tasks); else percpu_ref_kill(&im->pcref); From 64ec8b6ad61997262fb3373970377f5fb709454b Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 28 Feb 2024 11:30:43 -0800 Subject: [PATCH 15/32] ftrace: Choose RCU Tasks based on TASKS_RCU rather than PREEMPTION The advent of CONFIG_PREEMPT_AUTO, AKA lazy preemption, will mean that even kernels built with CONFIG_PREEMPT_NONE or CONFIG_PREEMPT_VOLUNTARY might see the occasional preemption, and that this preemption just might happen within a trampoline. Therefore, update ftrace_shutdown() to invoke synchronize_rcu_tasks() based on CONFIG_TASKS_RCU instead of CONFIG_PREEMPTION. [ paulmck: Apply Steven Rostedt feedback. ] Signed-off-by: Paul E. McKenney Cc: Steven Rostedt Cc: Masami Hiramatsu Cc: Mark Rutland Cc: Mathieu Desnoyers Cc: Ankur Arora Cc: Thomas Gleixner Cc: Signed-off-by: Uladzislau Rezki (Sony) --- kernel/trace/ftrace.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/trace/ftrace.c b/kernel/trace/ftrace.c index da1710499698..6c96b30f3d63 100644 --- a/kernel/trace/ftrace.c +++ b/kernel/trace/ftrace.c @@ -3157,8 +3157,7 @@ out: * synchronize_rcu_tasks() will wait for those tasks to * execute and either schedule voluntarily or enter user space. */ - if (IS_ENABLED(CONFIG_PREEMPTION)) - synchronize_rcu_tasks(); + synchronize_rcu_tasks(); ftrace_trampoline_free(ops); } From 8db610c3bd93e6929348f6e5271f2f905f80d82e Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Fri, 23 Feb 2024 17:07:24 -0800 Subject: [PATCH 16/32] rcu-tasks: Replace exit_tasks_rcu_start() initialization with WARN_ON_ONCE() Because the Tasks RCU ->rtp_exit_list is initialized at rcu_init() time while there is only one CPU running with interrupts disabled, it is not possible for an exiting task to encounter an uninitialized list. This commit therefore replaces the conditional initialization with a WARN_ON_ONCE(). Reported-by: Frederic Weisbecker Closes: https://lore.kernel.org/all/ZdiNXmO3wRvmzPsr@lothringen/ Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/tasks.h | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 82e458ea0728..78d74c81cc24 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1203,8 +1203,7 @@ void exit_tasks_rcu_start(void) rtpcp = this_cpu_ptr(rcu_tasks.rtpcpu); t->rcu_tasks_exit_cpu = smp_processor_id(); raw_spin_lock_irqsave_rcu_node(rtpcp, flags); - if (!rtpcp->rtp_exit_list.next) - INIT_LIST_HEAD(&rtpcp->rtp_exit_list); + WARN_ON_ONCE(!rtpcp->rtp_exit_list.next); list_add(&t->rcu_tasks_exit_list, &rtpcp->rtp_exit_list); raw_spin_unlock_irqrestore_rcu_node(rtpcp, flags); preempt_enable(); From 5f48fa85fdb92569f58374b6e040c956628fdcf5 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Mon, 26 Feb 2024 11:24:39 +0800 Subject: [PATCH 17/32] rcu-tasks: Fix the comments for tasks_rcu_exit_srcu_stall_timer The synchronize_srcu() has been removed by commit("rcu-tasks: Eliminate deadlocks involving do_exit() and RCU tasks") in rcu_tasks_postscan. This commit therefore fixes the tasks_rcu_exit_srcu_stall_timer comment. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/tasks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 78d74c81cc24..d5319bbe8c98 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -150,7 +150,7 @@ static struct rcu_tasks rt_name = \ #ifdef CONFIG_TASKS_RCU -/* Report delay in synchronize_srcu() completion in rcu_tasks_postscan(). */ +/* Report delay of scan exiting tasklist in rcu_tasks_postscan(). */ static void tasks_rcu_exit_srcu_stall(struct timer_list *unused); static DEFINE_TIMER(tasks_rcu_exit_srcu_stall_timer, tasks_rcu_exit_srcu_stall); #endif From cc5645fddb0ce28492b15520306d092730dffa48 Mon Sep 17 00:00:00 2001 From: Nikita Kiryushin Date: Wed, 27 Mar 2024 20:47:47 +0300 Subject: [PATCH 18/32] rcu-tasks: Fix show_rcu_tasks_trace_gp_kthread buffer overflow There is a possibility of buffer overflow in show_rcu_tasks_trace_gp_kthread() if counters, passed to sprintf() are huge. Counter numbers, needed for this are unrealistically high, but buffer overflow is still possible. Use snprintf() with buffer size instead of sprintf(). Found by Linux Verification Center (linuxtesting.org) with SVACE. Fixes: edf3775f0ad6 ("rcu-tasks: Add count for idle tasks on offline CPUs") Signed-off-by: Nikita Kiryushin Reviewed-by: Steven Rostedt (Google) Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/tasks.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index d5319bbe8c98..08a92bffeb84 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1997,7 +1997,7 @@ void show_rcu_tasks_trace_gp_kthread(void) { char buf[64]; - sprintf(buf, "N%lu h:%lu/%lu/%lu", + snprintf(buf, sizeof(buf), "N%lu h:%lu/%lu/%lu", data_race(n_trc_holdouts), data_race(n_heavy_reader_ofl_updates), data_race(n_heavy_reader_updates), From 988f569ae041ccc93a79d98d1b0043dff4d7e9b7 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Fri, 8 Mar 2024 18:34:05 +0100 Subject: [PATCH 19/32] rcu: Reduce synchronize_rcu() latency A call to a synchronize_rcu() can be optimized from a latency point of view. Workloads which depend on this can benefit of it. The delay of wakeme_after_rcu() callback, which unblocks a waiter, depends on several factors: - how fast a process of offloading is started. Combination of: - !CONFIG_RCU_NOCB_CPU/CONFIG_RCU_NOCB_CPU; - !CONFIG_RCU_LAZY/CONFIG_RCU_LAZY; - other. - when started, invoking path is interrupted due to: - time limit; - need_resched(); - if limit is reached. - where in a nocb list it is located; - how fast previous callbacks completed; Example: 1. On our embedded devices i can easily trigger the scenario when it is a last in the list out of ~3600 callbacks: <...>-29 [001] d..1. 21950.145313: rcu_batch_start: rcu_preempt CBs=3613 bl=28 ... <...>-29 [001] ..... 21950.152578: rcu_invoke_callback: rcu_preempt rhp=00000000b2d6dee8 func=__free_vm_area_struct.cfi_jt <...>-29 [001] ..... 21950.152579: rcu_invoke_callback: rcu_preempt rhp=00000000a446f607 func=__free_vm_area_struct.cfi_jt <...>-29 [001] ..... 21950.152580: rcu_invoke_callback: rcu_preempt rhp=00000000a5cab03b func=__free_vm_area_struct.cfi_jt <...>-29 [001] ..... 21950.152581: rcu_invoke_callback: rcu_preempt rhp=0000000013b7e5ee func=__free_vm_area_struct.cfi_jt <...>-29 [001] ..... 21950.152582: rcu_invoke_callback: rcu_preempt rhp=000000000a8ca6f9 func=__free_vm_area_struct.cfi_jt <...>-29 [001] ..... 21950.152583: rcu_invoke_callback: rcu_preempt rhp=000000008f162ca8 func=wakeme_after_rcu.cfi_jt <...>-29 [001] d..1. 21950.152625: rcu_batch_end: rcu_preempt CBs-invoked=3612 idle=.... 2. We use cpuset/cgroup to classify tasks and assign them into different cgroups. For example "backgrond" group which binds tasks only to little CPUs or "foreground" which makes use of all CPUs. Tasks can be migrated between groups by a request if an acceleration is needed. See below an example how "surfaceflinger" task gets migrated. Initially it is located in the "system-background" cgroup which allows to run only on little cores. In order to speed it up it can be temporary moved into "foreground" cgroup which allows to use big/all CPUs: cgroup_attach_task(): -> cgroup_migrate_execute() -> cpuset_can_attach() -> percpu_down_write() -> rcu_sync_enter() -> synchronize_rcu() -> now move tasks to the new cgroup. -> cgroup_migrate_finish() rcuop/1-29 [000] ..... 7030.528570: rcu_invoke_callback: rcu_preempt rhp=00000000461605e0 func=wakeme_after_rcu.cfi_jt PERFD-SERVER-1855 [000] d..1. 7030.530293: cgroup_attach_task: dst_root=3 dst_id=22 dst_level=1 dst_path=/foreground pid=1900 comm=surfaceflinger TimerDispatch-2768 [002] d..5. 7030.537542: sched_migrate_task: comm=surfaceflinger pid=1900 prio=98 orig_cpu=0 dest_cpu=4 "Boosting a task" depends on synchronize_rcu() latency: - first trace shows a completion of synchronize_rcu(); - second shows attaching a task to a new group; - last shows a final step when migration occurs. 3. To address this drawback, maintain a separate track that consists of synchronize_rcu() callers only. After completion of a grace period users are deferred to a dedicated worker to process requests. 4. This patch reduces the latency of synchronize_rcu() approximately by ~30-40% on synthetic tests. The real test case, camera launch time, shows(time is in milliseconds): 1-run 542 vs 489 improvement 9% 2-run 540 vs 466 improvement 13% 3-run 518 vs 468 improvement 9% 4-run 531 vs 457 improvement 13% 5-run 548 vs 475 improvement 13% 6-run 509 vs 484 improvement 4% Synthetic test(no "noise" from other callbacks): Hardware: x86_64 64 CPUs, 64GB of memory Linux-6.6 - 10K tasks(simultaneous); - each task does(1000 loops) synchronize_rcu(); kfree(p); default: CONFIG_RCU_NOCB_CPU: takes 54 seconds to complete all users; patch: CONFIG_RCU_NOCB_CPU: takes 35 seconds to complete all users. Running 60K gives approximately same results on my setup. Please note it is without any interaction with another type of callbacks, otherwise it will impact a lot a default case. 5. By default it is disabled. To enable this perform one of the below sequence: echo 1 > /sys/module/rcutree/parameters/rcu_normal_wake_from_gp or pass a boot parameter "rcutree.rcu_normal_wake_from_gp=1" Reviewed-by: Paul E. McKenney Reviewed-by: Frederic Weisbecker Co-developed-by: Neeraj Upadhyay (AMD) Signed-off-by: Neeraj Upadhyay (AMD) Signed-off-by: Uladzislau Rezki (Sony) --- .../admin-guide/kernel-parameters.txt | 14 + kernel/rcu/tree.c | 331 +++++++++++++++++- kernel/rcu/tree_exp.h | 2 +- 3 files changed, 345 insertions(+), 2 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index bb884c14b2f6..0a3b0fd1910e 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -5091,6 +5091,20 @@ delay, memory pressure or callback list growing too big. + rcutree.rcu_normal_wake_from_gp= [KNL] + Reduces a latency of synchronize_rcu() call. This approach + maintains its own track of synchronize_rcu() callers, so it + does not interact with regular callbacks because it does not + use a call_rcu[_hurry]() path. Please note, this is for a + normal grace period. + + How to enable it: + + echo 1 > /sys/module/rcutree/parameters/rcu_normal_wake_from_gp + or pass a boot parameter "rcutree.rcu_normal_wake_from_gp=1" + + Default is 0. + rcuscale.gp_async= [KNL] Measure performance of asynchronous grace-period primitives such as call_rcu(). diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d9642dd06c25..f65255205e44 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -75,6 +75,7 @@ #define MODULE_PARAM_PREFIX "rcutree." /* Data structures. */ +static void rcu_sr_normal_gp_cleanup_work(struct work_struct *); static DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, rcu_data) = { .gpwrap = true, @@ -93,6 +94,8 @@ static struct rcu_state rcu_state = { .exp_mutex = __MUTEX_INITIALIZER(rcu_state.exp_mutex), .exp_wake_mutex = __MUTEX_INITIALIZER(rcu_state.exp_wake_mutex), .ofl_lock = __ARCH_SPIN_LOCK_UNLOCKED, + .srs_cleanup_work = __WORK_INITIALIZER(rcu_state.srs_cleanup_work, + rcu_sr_normal_gp_cleanup_work), }; /* Dump rcu_node combining tree at boot to verify correct setup. */ @@ -1422,6 +1425,282 @@ static void rcu_poll_gp_seq_end_unlocked(unsigned long *snap) raw_spin_unlock_irqrestore_rcu_node(rnp, flags); } +/* + * There is a single llist, which is used for handling + * synchronize_rcu() users' enqueued rcu_synchronize nodes. + * Within this llist, there are two tail pointers: + * + * wait tail: Tracks the set of nodes, which need to + * wait for the current GP to complete. + * done tail: Tracks the set of nodes, for which grace + * period has elapsed. These nodes processing + * will be done as part of the cleanup work + * execution by a kworker. + * + * At every grace period init, a new wait node is added + * to the llist. This wait node is used as wait tail + * for this new grace period. Given that there are a fixed + * number of wait nodes, if all wait nodes are in use + * (which can happen when kworker callback processing + * is delayed) and additional grace period is requested. + * This means, a system is slow in processing callbacks. + * + * TODO: If a slow processing is detected, a first node + * in the llist should be used as a wait-tail for this + * grace period, therefore users which should wait due + * to a slow process are handled by _this_ grace period + * and not next. + * + * Below is an illustration of how the done and wait + * tail pointers move from one set of rcu_synchronize nodes + * to the other, as grace periods start and finish and + * nodes are processed by kworker. + * + * + * a. Initial llist callbacks list: + * + * +----------+ +--------+ +-------+ + * | | | | | | + * | head |---------> | cb2 |--------->| cb1 | + * | | | | | | + * +----------+ +--------+ +-------+ + * + * + * + * b. New GP1 Start: + * + * WAIT TAIL + * | + * | + * v + * +----------+ +--------+ +--------+ +-------+ + * | | | | | | | | + * | head ------> wait |------> cb2 |------> | cb1 | + * | | | head1 | | | | | + * +----------+ +--------+ +--------+ +-------+ + * + * + * + * c. GP completion: + * + * WAIT_TAIL == DONE_TAIL + * + * DONE TAIL + * | + * | + * v + * +----------+ +--------+ +--------+ +-------+ + * | | | | | | | | + * | head ------> wait |------> cb2 |------> | cb1 | + * | | | head1 | | | | | + * +----------+ +--------+ +--------+ +-------+ + * + * + * + * d. New callbacks and GP2 start: + * + * WAIT TAIL DONE TAIL + * | | + * | | + * v v + * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+ + * | | | | | | | | | | | | | | + * | head ------> wait |--->| cb4 |--->| cb3 |--->|wait |--->| cb2 |--->| cb1 | + * | | | head2| | | | | |head1| | | | | + * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+ + * + * + * + * e. GP2 completion: + * + * WAIT_TAIL == DONE_TAIL + * DONE TAIL + * | + * | + * v + * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+ + * | | | | | | | | | | | | | | + * | head ------> wait |--->| cb4 |--->| cb3 |--->|wait |--->| cb2 |--->| cb1 | + * | | | head2| | | | | |head1| | | | | + * +----------+ +------+ +------+ +------+ +-----+ +-----+ +-----+ + * + * + * While the llist state transitions from d to e, a kworker + * can start executing rcu_sr_normal_gp_cleanup_work() and + * can observe either the old done tail (@c) or the new + * done tail (@e). So, done tail updates and reads need + * to use the rel-acq semantics. If the concurrent kworker + * observes the old done tail, the newly queued work + * execution will process the updated done tail. If the + * concurrent kworker observes the new done tail, then + * the newly queued work will skip processing the done + * tail, as workqueue semantics guarantees that the new + * work is executed only after the previous one completes. + * + * f. kworker callbacks processing complete: + * + * + * DONE TAIL + * | + * | + * v + * +----------+ +--------+ + * | | | | + * | head ------> wait | + * | | | head2 | + * +----------+ +--------+ + * + */ +static bool rcu_sr_is_wait_head(struct llist_node *node) +{ + return &(rcu_state.srs_wait_nodes)[0].node <= node && + node <= &(rcu_state.srs_wait_nodes)[SR_NORMAL_GP_WAIT_HEAD_MAX - 1].node; +} + +static struct llist_node *rcu_sr_get_wait_head(void) +{ + struct sr_wait_node *sr_wn; + int i; + + for (i = 0; i < SR_NORMAL_GP_WAIT_HEAD_MAX; i++) { + sr_wn = &(rcu_state.srs_wait_nodes)[i]; + + if (!atomic_cmpxchg_acquire(&sr_wn->inuse, 0, 1)) + return &sr_wn->node; + } + + return NULL; +} + +static void rcu_sr_put_wait_head(struct llist_node *node) +{ + struct sr_wait_node *sr_wn = container_of(node, struct sr_wait_node, node); + + atomic_set_release(&sr_wn->inuse, 0); +} + +/* Disabled by default. */ +static int rcu_normal_wake_from_gp; +module_param(rcu_normal_wake_from_gp, int, 0644); + +static void rcu_sr_normal_complete(struct llist_node *node) +{ + struct rcu_synchronize *rs = container_of( + (struct rcu_head *) node, struct rcu_synchronize, head); + unsigned long oldstate = (unsigned long) rs->head.func; + + WARN_ONCE(IS_ENABLED(CONFIG_PROVE_RCU) && + !poll_state_synchronize_rcu(oldstate), + "A full grace period is not passed yet: %lu", + rcu_seq_diff(get_state_synchronize_rcu(), oldstate)); + + /* Finally. */ + complete(&rs->completion); +} + +static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work) +{ + struct llist_node *done, *rcu, *next, *head; + + /* + * This work execution can potentially execute + * while a new done tail is being updated by + * grace period kthread in rcu_sr_normal_gp_cleanup(). + * So, read and updates of done tail need to + * follow acq-rel semantics. + * + * Given that wq semantics guarantees that a single work + * cannot execute concurrently by multiple kworkers, + * the done tail list manipulations are protected here. + */ + done = smp_load_acquire(&rcu_state.srs_done_tail); + if (!done) + return; + + WARN_ON_ONCE(!rcu_sr_is_wait_head(done)); + head = done->next; + done->next = NULL; + + /* + * The dummy node, which is pointed to by the + * done tail which is acq-read above is not removed + * here. This allows lockless additions of new + * rcu_synchronize nodes in rcu_sr_normal_add_req(), + * while the cleanup work executes. The dummy + * nodes is removed, in next round of cleanup + * work execution. + */ + llist_for_each_safe(rcu, next, head) { + if (!rcu_sr_is_wait_head(rcu)) { + rcu_sr_normal_complete(rcu); + continue; + } + + rcu_sr_put_wait_head(rcu); + } +} + +/* + * Helper function for rcu_gp_cleanup(). + */ +static void rcu_sr_normal_gp_cleanup(void) +{ + struct llist_node *wait_tail; + + wait_tail = rcu_state.srs_wait_tail; + if (wait_tail == NULL) + return; + + rcu_state.srs_wait_tail = NULL; + ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_wait_tail); + + // concurrent sr_normal_gp_cleanup work might observe this update. + smp_store_release(&rcu_state.srs_done_tail, wait_tail); + ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_done_tail); + + schedule_work(&rcu_state.srs_cleanup_work); +} + +/* + * Helper function for rcu_gp_init(). + */ +static bool rcu_sr_normal_gp_init(void) +{ + struct llist_node *first; + struct llist_node *wait_head; + bool start_new_poll = false; + + first = READ_ONCE(rcu_state.srs_next.first); + if (!first || rcu_sr_is_wait_head(first)) + return start_new_poll; + + wait_head = rcu_sr_get_wait_head(); + if (!wait_head) { + // Kick another GP to retry. + start_new_poll = true; + return start_new_poll; + } + + /* Inject a wait-dummy-node. */ + llist_add(wait_head, &rcu_state.srs_next); + + /* + * A waiting list of rcu_synchronize nodes should be empty on + * this step, since a GP-kthread, rcu_gp_init() -> gp_cleanup(), + * rolls it over. If not, it is a BUG, warn a user. + */ + WARN_ON_ONCE(rcu_state.srs_wait_tail != NULL); + rcu_state.srs_wait_tail = wait_head; + ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_wait_tail); + + return start_new_poll; +} + +static void rcu_sr_normal_add_req(struct rcu_synchronize *rs) +{ + llist_add((struct llist_node *) &rs->head, &rcu_state.srs_next); +} + /* * Initialize a new grace period. Return false if no grace period required. */ @@ -1432,6 +1711,7 @@ static noinline_for_stack bool rcu_gp_init(void) unsigned long mask; struct rcu_data *rdp; struct rcu_node *rnp = rcu_get_root(); + bool start_new_poll; WRITE_ONCE(rcu_state.gp_activity, jiffies); raw_spin_lock_irq_rcu_node(rnp); @@ -1456,10 +1736,24 @@ static noinline_for_stack bool rcu_gp_init(void) /* Record GP times before starting GP, hence rcu_seq_start(). */ rcu_seq_start(&rcu_state.gp_seq); ASSERT_EXCLUSIVE_WRITER(rcu_state.gp_seq); + start_new_poll = rcu_sr_normal_gp_init(); trace_rcu_grace_period(rcu_state.name, rcu_state.gp_seq, TPS("start")); rcu_poll_gp_seq_start(&rcu_state.gp_seq_polled_snap); raw_spin_unlock_irq_rcu_node(rnp); + /* + * The "start_new_poll" is set to true, only when this GP is not able + * to handle anything and there are outstanding users. It happens when + * the rcu_sr_normal_gp_init() function was not able to insert a dummy + * separator to the llist, because there were no left any dummy-nodes. + * + * Number of dummy-nodes is fixed, it could be that we are run out of + * them, if so we start a new pool request to repeat a try. It is rare + * and it means that a system is doing a slow processing of callbacks. + */ + if (start_new_poll) + (void) start_poll_synchronize_rcu(); + /* * Apply per-leaf buffered online and offline operations to * the rcu_node tree. Note that this new grace period need not @@ -1825,6 +2119,9 @@ static noinline void rcu_gp_cleanup(void) } raw_spin_unlock_irq_rcu_node(rnp); + // Make synchronize_rcu() users aware of the end of old grace period. + rcu_sr_normal_gp_cleanup(); + // If strict, make all CPUs aware of the end of the old grace period. if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) on_each_cpu(rcu_strict_gp_boundary, NULL, 0); @@ -3559,6 +3856,38 @@ static int rcu_blocking_is_gp(void) return true; } +/* + * Helper function for the synchronize_rcu() API. + */ +static void synchronize_rcu_normal(void) +{ + struct rcu_synchronize rs; + + if (!READ_ONCE(rcu_normal_wake_from_gp)) { + wait_rcu_gp(call_rcu_hurry); + return; + } + + init_rcu_head_on_stack(&rs.head); + init_completion(&rs.completion); + + /* + * This code might be preempted, therefore take a GP + * snapshot before adding a request. + */ + if (IS_ENABLED(CONFIG_PROVE_RCU)) + rs.head.func = (void *) get_state_synchronize_rcu(); + + rcu_sr_normal_add_req(&rs); + + /* Kick a GP and start waiting. */ + (void) start_poll_synchronize_rcu(); + + /* Now we can wait. */ + wait_for_completion(&rs.completion); + destroy_rcu_head_on_stack(&rs.head); +} + /** * synchronize_rcu - wait until a grace period has elapsed. * @@ -3610,7 +3939,7 @@ void synchronize_rcu(void) if (rcu_gp_is_expedited()) synchronize_rcu_expedited(); else - wait_rcu_gp(call_rcu_hurry); + synchronize_rcu_normal(); return; } diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h index 6b83537480b1..8a1d9c8bd9f7 100644 --- a/kernel/rcu/tree_exp.h +++ b/kernel/rcu/tree_exp.h @@ -930,7 +930,7 @@ void synchronize_rcu_expedited(void) /* If expedited grace periods are prohibited, fall back to normal. */ if (rcu_gp_is_normal()) { - wait_rcu_gp(call_rcu_hurry); + synchronize_rcu_normal(); return; } From 2053937a310a3982de9d33af3db2dbd2b32b66e4 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Fri, 8 Mar 2024 18:34:06 +0100 Subject: [PATCH 20/32] rcu: Add a trace event for synchronize_rcu_normal() Add an rcu_sr_normal() trace event. It takes three arguments first one is the name of RCU flavour, second one is a user id which triggeres synchronize_rcu_normal() and last one is an event. There are two traces in the synchronize_rcu_normal(). On entry, when a new request is registered and on exit point when request is completed. Please note, CONFIG_RCU_TRACE=y is required to activate traces. Reviewed-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- include/trace/events/rcu.h | 27 +++++++++++++++++++++++++++ kernel/rcu/tree.c | 7 ++++++- 2 files changed, 33 insertions(+), 1 deletion(-) diff --git a/include/trace/events/rcu.h b/include/trace/events/rcu.h index 2ef9c719772a..31b3e0d3e65f 100644 --- a/include/trace/events/rcu.h +++ b/include/trace/events/rcu.h @@ -707,6 +707,33 @@ TRACE_EVENT_RCU(rcu_invoke_kfree_bulk_callback, __entry->rcuname, __entry->p, __entry->nr_records) ); +/* + * Tracepoint for a normal synchronize_rcu() states. The first argument + * is the RCU flavor, the second argument is a pointer to rcu_head the + * last one is an event. + */ +TRACE_EVENT_RCU(rcu_sr_normal, + + TP_PROTO(const char *rcuname, struct rcu_head *rhp, const char *srevent), + + TP_ARGS(rcuname, rhp, srevent), + + TP_STRUCT__entry( + __field(const char *, rcuname) + __field(void *, rhp) + __field(const char *, srevent) + ), + + TP_fast_assign( + __entry->rcuname = rcuname; + __entry->rhp = rhp; + __entry->srevent = srevent; + ), + + TP_printk("%s rhp=0x%p event=%s", + __entry->rcuname, __entry->rhp, __entry->srevent) +); + /* * Tracepoint for exiting rcu_do_batch after RCU callbacks have been * invoked. The first argument is the name of the RCU flavor, diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index f65255205e44..2e1c5be6d64b 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -3863,9 +3863,11 @@ static void synchronize_rcu_normal(void) { struct rcu_synchronize rs; + trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("request")); + if (!READ_ONCE(rcu_normal_wake_from_gp)) { wait_rcu_gp(call_rcu_hurry); - return; + goto trace_complete_out; } init_rcu_head_on_stack(&rs.head); @@ -3886,6 +3888,9 @@ static void synchronize_rcu_normal(void) /* Now we can wait. */ wait_for_completion(&rs.completion); destroy_rcu_head_on_stack(&rs.head); + +trace_complete_out: + trace_rcu_sr_normal(rcu_state.name, &rs.head, TPS("complete")); } /** From 462df2f543ae360e79fcaa1b498d2a1a0c2a5b63 Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Fri, 8 Mar 2024 18:34:07 +0100 Subject: [PATCH 21/32] rcu: Support direct wake-up of synchronize_rcu() users This patch introduces a small enhancement which allows to do a direct wake-up of synchronize_rcu() callers. It occurs after a completion of grace period, thus by the gp-kthread. Number of clients is limited by the hard-coded maximum allowed threshold. The remaining part, if still exists is deferred to a main worker. Link: https://lore.kernel.org/lkml/Zd0ZtNu+Rt0qXkfS@lothringen/ Reviewed-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/tree.c | 24 +++++++++++++++++++++++- kernel/rcu/tree.h | 6 ++++++ 2 files changed, 29 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2e1c5be6d64b..2a270abade4d 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1645,7 +1645,8 @@ static void rcu_sr_normal_gp_cleanup_work(struct work_struct *work) */ static void rcu_sr_normal_gp_cleanup(void) { - struct llist_node *wait_tail; + struct llist_node *wait_tail, *next, *rcu; + int done = 0; wait_tail = rcu_state.srs_wait_tail; if (wait_tail == NULL) @@ -1653,11 +1654,32 @@ static void rcu_sr_normal_gp_cleanup(void) rcu_state.srs_wait_tail = NULL; ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_wait_tail); + WARN_ON_ONCE(!rcu_sr_is_wait_head(wait_tail)); + + /* + * Process (a) and (d) cases. See an illustration. + */ + llist_for_each_safe(rcu, next, wait_tail->next) { + if (rcu_sr_is_wait_head(rcu)) + break; + + rcu_sr_normal_complete(rcu); + // It can be last, update a next on this step. + wait_tail->next = next; + + if (++done == SR_MAX_USERS_WAKE_FROM_GP) + break; + } // concurrent sr_normal_gp_cleanup work might observe this update. smp_store_release(&rcu_state.srs_done_tail, wait_tail); ASSERT_EXCLUSIVE_WRITER(rcu_state.srs_done_tail); + /* + * We schedule a work in order to perform a final processing + * of outstanding users(if still left) and releasing wait-heads + * added by rcu_sr_normal_gp_init() call. + */ schedule_work(&rcu_state.srs_cleanup_work); } diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h index b942b9437438..2832787cee1d 100644 --- a/kernel/rcu/tree.h +++ b/kernel/rcu/tree.h @@ -315,6 +315,12 @@ do { \ __set_current_state(TASK_RUNNING); \ } while (0) +/* + * A max threshold for synchronize_rcu() users which are + * awaken directly by the rcu_gp_kthread(). Left part is + * deferred to the main worker. + */ +#define SR_MAX_USERS_WAKE_FROM_GP 5 #define SR_NORMAL_GP_WAIT_HEAD_MAX 5 struct sr_wait_node { From 0fd210baa07a9e3f15df1bc687293eafb119283a Mon Sep 17 00:00:00 2001 From: "Uladzislau Rezki (Sony)" Date: Fri, 8 Mar 2024 18:34:09 +0100 Subject: [PATCH 22/32] rcu: Allocate WQ with WQ_MEM_RECLAIM bit set synchronize_rcu() users have to be processed regardless of memory pressure so our private WQ needs to have at least one execution context what WQ_MEM_RECLAIM flag guarantees. Reviewed-by: Paul E. McKenney Reviewed-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/tree.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index 2a270abade4d..1d5c000e5c7a 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -1582,6 +1582,7 @@ static void rcu_sr_put_wait_head(struct llist_node *node) /* Disabled by default. */ static int rcu_normal_wake_from_gp; module_param(rcu_normal_wake_from_gp, int, 0644); +static struct workqueue_struct *sync_wq; static void rcu_sr_normal_complete(struct llist_node *node) { @@ -1680,7 +1681,7 @@ static void rcu_sr_normal_gp_cleanup(void) * of outstanding users(if still left) and releasing wait-heads * added by rcu_sr_normal_gp_init() call. */ - schedule_work(&rcu_state.srs_cleanup_work); + queue_work(sync_wq, &rcu_state.srs_cleanup_work); } /* @@ -5585,6 +5586,9 @@ void __init rcu_init(void) rcu_gp_wq = alloc_workqueue("rcu_gp", WQ_MEM_RECLAIM, 0); WARN_ON(!rcu_gp_wq); + sync_wq = alloc_workqueue("sync_wq", WQ_MEM_RECLAIM, 0); + WARN_ON(!sync_wq); + /* Fill in default value for rcutree.qovld boot parameter. */ /* -After- the rcu_node ->lock fields are initialized! */ if (qovld < 0) From 8d0f9a6639f5083453602375ce9d3b71ef93ac73 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 6 Mar 2024 19:14:46 -0800 Subject: [PATCH 23/32] rcutorture: Remove extraneous rcu_torture_pipe_update_one() READ_ONCE() The rcu_torture_pipe_update_one() cannot run concurrently with any updates of ->rtort_pipe_count, so this commit removes the extraneous READ_ONCE() from the read from this field. Reported-by: Linus Torvalds Closes: https://lore.kernel.org/lkml/CAHk-=wiX_zF5Mpt8kUm_LFQpYY-mshrXJPOe+wKNwiVhEUcU9g@mail.gmail.com/ Signed-off-by: Paul E. McKenney Reviewed-by: Joel Fernandes (Google) Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index d8c12eba35b7..6b821a7037b0 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -461,7 +461,7 @@ rcu_torture_pipe_update_one(struct rcu_torture *rp) WRITE_ONCE(rp->rtort_chkp, NULL); smp_store_release(&rtrcp->rtc_ready, 1); // Pair with smp_load_acquire(). } - i = READ_ONCE(rp->rtort_pipe_count); + i = rp->rtort_pipe_count; if (i > RCU_TORTURE_PIPE_LEN) i = RCU_TORTURE_PIPE_LEN; atomic_inc(&rcu_torture_wcount[i]); From 8b9b443fa860276822b25057cb3ff3b28734dec0 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Wed, 6 Mar 2024 19:21:47 -0800 Subject: [PATCH 24/32] rcutorture: Fix rcu_torture_one_read() pipe_count overflow comment The "pipe_count > RCU_TORTURE_PIPE_LEN" check has a comment saying "Should not happen, but...". This is only true when testing an RCU whose grace periods are always long enough. This commit therefore fixes this comment. Reported-by: Linus Torvalds Closes: https://lore.kernel.org/lkml/CAHk-=wi7rJ-eGq+xaxVfzFEgbL9tdf6Kc8Z89rCpfcQOKm74Tw@mail.gmail.com/ Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcutorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 6b821a7037b0..0cb5452ecd94 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2000,7 +2000,8 @@ static bool rcu_torture_one_read(struct torture_random_state *trsp, long myid) preempt_disable(); pipe_count = READ_ONCE(p->rtort_pipe_count); if (pipe_count > RCU_TORTURE_PIPE_LEN) { - /* Should not happen, but... */ + // Should not happen in a correct RCU implementation, + // happens quite often for torture_type=busted. pipe_count = RCU_TORTURE_PIPE_LEN; } completed = cur_ops->get_gp_seq(); From a10e3cbf32786cae437e4f370573318721e47c2c Mon Sep 17 00:00:00 2001 From: linke li Date: Wed, 6 Mar 2024 19:51:10 -0800 Subject: [PATCH 25/32] rcutorture: Re-use value stored to ->rtort_pipe_count instead of re-reading Currently, the rcu_torture_pipe_update_one() writes the value (i + 1) to rp->rtort_pipe_count, then immediately re-reads it in order to compare it to RCU_TORTURE_PIPE_LEN. This re-read is pointless because no other update to rp->rtort_pipe_count can occur at this point. This commit therefore instead re-uses the (i + 1) value stored in the comparison instead of re-reading rp->rtort_pipe_count. Signed-off-by: linke li Reviewed-by: Joel Fernandes (Google) Signed-off-by: Paul E. McKenney Cc: Linus Torvalds Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcutorture.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 0cb5452ecd94..dd7d5ba45740 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -467,7 +467,7 @@ rcu_torture_pipe_update_one(struct rcu_torture *rp) atomic_inc(&rcu_torture_wcount[i]); WRITE_ONCE(rp->rtort_pipe_count, i + 1); ASSERT_EXCLUSIVE_WRITER(rp->rtort_pipe_count); - if (rp->rtort_pipe_count >= RCU_TORTURE_PIPE_LEN) { + if (i + 1 >= RCU_TORTURE_PIPE_LEN) { rp->rtort_mbtest = 0; return true; } From e38bf06d509a90c9fc185129ec913beb6de3d428 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Fri, 15 Mar 2024 15:17:10 +0800 Subject: [PATCH 26/32] rcutorture: Use the gp_kthread_dbg operation specified by cur_ops Despite there being a cur_ops->gp_kthread_dbg(), rcu_torture_writer() unconditionally invokes vanilla RCU's show_rcu_gp_kthreads(). This is not at all helpful when some other flavor of RCU is being tested. This commit therefore makes rcu_torture_writer() invoke cur_ops->gp_kthread_dbg() for RCU implementations providing this function. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcutorture.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index dd7d5ba45740..2f43d31fb7a5 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -1589,7 +1589,8 @@ rcu_torture_writer(void *arg) if (list_empty(&rcu_tortures[i].rtort_free) && rcu_access_pointer(rcu_torture_current) != &rcu_tortures[i]) { tracing_off(); - show_rcu_gp_kthreads(); + if (cur_ops->gp_kthread_dbg) + cur_ops->gp_kthread_dbg(); WARN(1, "%s: rtort_pipe_count: %d\n", __func__, rcu_tortures[i].rtort_pipe_count); rcu_ftrace_dump(DUMP_ALL); } From dddcddef1414be3ebc37a40d13fcc0f6a672ba9f Mon Sep 17 00:00:00 2001 From: Zqiang Date: Mon, 18 Mar 2024 17:34:11 +0800 Subject: [PATCH 27/32] rcutorture: Make rcutorture support print rcu-tasks gp state This commit make rcu-tasks related rcutorture test support rcu-tasks gp state printing when the writer stall occurs or the at the end of rcutorture test, and generate rcu_ops->get_gp_data() operation to simplify the acquisition of gp state for different types of rcutorture tests. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcu.h | 20 ++++++++++---------- kernel/rcu/rcutorture.c | 26 ++++++++++++++++++-------- kernel/rcu/srcutree.c | 5 +---- kernel/rcu/tasks.h | 21 +++++++++++++++++++++ kernel/rcu/tree.c | 13 +++---------- 5 files changed, 53 insertions(+), 32 deletions(-) diff --git a/kernel/rcu/rcu.h b/kernel/rcu/rcu.h index 86fce206560e..38238e595a61 100644 --- a/kernel/rcu/rcu.h +++ b/kernel/rcu/rcu.h @@ -522,12 +522,18 @@ static inline void show_rcu_tasks_gp_kthreads(void) {} #ifdef CONFIG_TASKS_RCU struct task_struct *get_rcu_tasks_gp_kthread(void); +void rcu_tasks_get_gp_data(int *flags, unsigned long *gp_seq); #endif // # ifdef CONFIG_TASKS_RCU #ifdef CONFIG_TASKS_RUDE_RCU struct task_struct *get_rcu_tasks_rude_gp_kthread(void); +void rcu_tasks_rude_get_gp_data(int *flags, unsigned long *gp_seq); #endif // # ifdef CONFIG_TASKS_RUDE_RCU +#ifdef CONFIG_TASKS_TRACE_RCU +void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq); +#endif + #ifdef CONFIG_TASKS_RCU_GENERIC void tasks_cblist_init_generic(void); #else /* #ifdef CONFIG_TASKS_RCU_GENERIC */ @@ -557,8 +563,7 @@ static inline void rcu_set_jiffies_lazy_flush(unsigned long j) { } #endif #if defined(CONFIG_TREE_RCU) -void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, - unsigned long *gp_seq); +void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq); void do_trace_rcu_torture_read(const char *rcutorturename, struct rcu_head *rhp, unsigned long secs, @@ -566,8 +571,7 @@ void do_trace_rcu_torture_read(const char *rcutorturename, unsigned long c); void rcu_gp_set_torture_wait(int duration); #else -static inline void rcutorture_get_gp_data(enum rcutorture_type test_type, - int *flags, unsigned long *gp_seq) +static inline void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq) { *flags = 0; *gp_seq = 0; @@ -587,20 +591,16 @@ static inline void rcu_gp_set_torture_wait(int duration) { } #ifdef CONFIG_TINY_SRCU -static inline void srcutorture_get_gp_data(enum rcutorture_type test_type, - struct srcu_struct *sp, int *flags, +static inline void srcutorture_get_gp_data(struct srcu_struct *sp, int *flags, unsigned long *gp_seq) { - if (test_type != SRCU_FLAVOR) - return; *flags = 0; *gp_seq = sp->srcu_idx; } #elif defined(CONFIG_TREE_SRCU) -void srcutorture_get_gp_data(enum rcutorture_type test_type, - struct srcu_struct *sp, int *flags, +void srcutorture_get_gp_data(struct srcu_struct *sp, int *flags, unsigned long *gp_seq); #endif diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 2f43d31fb7a5..85ff8a32f75a 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -381,6 +381,7 @@ struct rcu_torture_ops { void (*gp_kthread_dbg)(void); bool (*check_boost_failed)(unsigned long gp_state, int *cpup); int (*stall_dur)(void); + void (*get_gp_data)(int *flags, unsigned long *gp_seq); long cbflood_max; int irq_capable; int can_boost; @@ -569,6 +570,7 @@ static struct rcu_torture_ops rcu_ops = { .gp_kthread_dbg = show_rcu_gp_kthreads, .check_boost_failed = rcu_check_boost_fail, .stall_dur = rcu_jiffies_till_stall_check, + .get_gp_data = rcutorture_get_gp_data, .irq_capable = 1, .can_boost = IS_ENABLED(CONFIG_RCU_BOOST), .extendables = RCUTORTURE_MAX_EXTEND, @@ -628,6 +630,11 @@ static struct srcu_struct srcu_ctld; static struct srcu_struct *srcu_ctlp = &srcu_ctl; static struct rcu_torture_ops srcud_ops; +static void srcu_get_gp_data(int *flags, unsigned long *gp_seq) +{ + srcutorture_get_gp_data(srcu_ctlp, flags, gp_seq); +} + static int srcu_torture_read_lock(void) { if (cur_ops == &srcud_ops) @@ -736,6 +743,7 @@ static struct rcu_torture_ops srcu_ops = { .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, + .get_gp_data = srcu_get_gp_data, .cbflood_max = 50000, .irq_capable = 1, .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), @@ -774,6 +782,7 @@ static struct rcu_torture_ops srcud_ops = { .call = srcu_torture_call, .cb_barrier = srcu_torture_barrier, .stats = srcu_torture_stats, + .get_gp_data = srcu_get_gp_data, .cbflood_max = 50000, .irq_capable = 1, .no_pi_lock = IS_ENABLED(CONFIG_TINY_SRCU), @@ -882,6 +891,7 @@ static struct rcu_torture_ops tasks_ops = { .call = call_rcu_tasks, .cb_barrier = rcu_barrier_tasks, .gp_kthread_dbg = show_rcu_tasks_classic_gp_kthread, + .get_gp_data = rcu_tasks_get_gp_data, .fqs = NULL, .stats = NULL, .irq_capable = 1, @@ -922,6 +932,7 @@ static struct rcu_torture_ops tasks_rude_ops = { .call = call_rcu_tasks_rude, .cb_barrier = rcu_barrier_tasks_rude, .gp_kthread_dbg = show_rcu_tasks_rude_gp_kthread, + .get_gp_data = rcu_tasks_rude_get_gp_data, .cbflood_max = 50000, .fqs = NULL, .stats = NULL, @@ -974,6 +985,7 @@ static struct rcu_torture_ops tasks_tracing_ops = { .call = call_rcu_tasks_trace, .cb_barrier = rcu_barrier_tasks_trace, .gp_kthread_dbg = show_rcu_tasks_trace_gp_kthread, + .get_gp_data = rcu_tasks_trace_get_gp_data, .cbflood_max = 50000, .fqs = NULL, .stats = NULL, @@ -2264,10 +2276,8 @@ rcu_torture_stats_print(void) int __maybe_unused flags = 0; unsigned long __maybe_unused gp_seq = 0; - rcutorture_get_gp_data(cur_ops->ttype, - &flags, &gp_seq); - srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, - &flags, &gp_seq); + if (cur_ops->get_gp_data) + cur_ops->get_gp_data(&flags, &gp_seq); wtp = READ_ONCE(writer_task); pr_alert("??? Writer stall state %s(%d) g%lu f%#x ->state %#x cpu %d\n", rcu_torture_writer_state_getname(), @@ -3390,8 +3400,8 @@ rcu_torture_cleanup(void) fakewriter_tasks = NULL; } - rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq); - srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq); + if (cur_ops->get_gp_data) + cur_ops->get_gp_data(&flags, &gp_seq); pr_alert("%s: End-test grace-period state: g%ld f%#x total-gps=%ld\n", cur_ops->name, (long)gp_seq, flags, rcutorture_seq_diff(gp_seq, start_gp_seq)); @@ -3762,8 +3772,8 @@ rcu_torture_init(void) nrealreaders = 1; } rcu_torture_print_module_parms(cur_ops, "Start of test"); - rcutorture_get_gp_data(cur_ops->ttype, &flags, &gp_seq); - srcutorture_get_gp_data(cur_ops->ttype, srcu_ctlp, &flags, &gp_seq); + if (cur_ops->get_gp_data) + cur_ops->get_gp_data(&flags, &gp_seq); start_gp_seq = gp_seq; pr_alert("%s: Start-test grace-period state: g%ld f%#x\n", cur_ops->name, (long)gp_seq, flags); diff --git a/kernel/rcu/srcutree.c b/kernel/rcu/srcutree.c index e4d673fc30f4..bc4b58b0204e 100644 --- a/kernel/rcu/srcutree.c +++ b/kernel/rcu/srcutree.c @@ -1826,12 +1826,9 @@ static void process_srcu(struct work_struct *work) srcu_reschedule(ssp, curdelay); } -void srcutorture_get_gp_data(enum rcutorture_type test_type, - struct srcu_struct *ssp, int *flags, +void srcutorture_get_gp_data(struct srcu_struct *ssp, int *flags, unsigned long *gp_seq) { - if (test_type != SRCU_FLAVOR) - return; *flags = 0; *gp_seq = rcu_seq_current(&ssp->srcu_sup->srcu_gp_seq); } diff --git a/kernel/rcu/tasks.h b/kernel/rcu/tasks.h index 147b5945d67a..a1af7dadc0f7 100644 --- a/kernel/rcu/tasks.h +++ b/kernel/rcu/tasks.h @@ -1178,6 +1178,13 @@ struct task_struct *get_rcu_tasks_gp_kthread(void) } EXPORT_SYMBOL_GPL(get_rcu_tasks_gp_kthread); +void rcu_tasks_get_gp_data(int *flags, unsigned long *gp_seq) +{ + *flags = 0; + *gp_seq = rcu_seq_current(&rcu_tasks.tasks_gp_seq); +} +EXPORT_SYMBOL_GPL(rcu_tasks_get_gp_data); + /* * Protect against tasklist scan blind spot while the task is exiting and * may be removed from the tasklist. Do this by adding the task to yet @@ -1358,6 +1365,13 @@ struct task_struct *get_rcu_tasks_rude_gp_kthread(void) } EXPORT_SYMBOL_GPL(get_rcu_tasks_rude_gp_kthread); +void rcu_tasks_rude_get_gp_data(int *flags, unsigned long *gp_seq) +{ + *flags = 0; + *gp_seq = rcu_seq_current(&rcu_tasks_rude.tasks_gp_seq); +} +EXPORT_SYMBOL_GPL(rcu_tasks_rude_get_gp_data); + #endif /* #ifdef CONFIG_TASKS_RUDE_RCU */ //////////////////////////////////////////////////////////////////////// @@ -2010,6 +2024,13 @@ struct task_struct *get_rcu_tasks_trace_gp_kthread(void) } EXPORT_SYMBOL_GPL(get_rcu_tasks_trace_gp_kthread); +void rcu_tasks_trace_get_gp_data(int *flags, unsigned long *gp_seq) +{ + *flags = 0; + *gp_seq = rcu_seq_current(&rcu_tasks_trace.tasks_gp_seq); +} +EXPORT_SYMBOL_GPL(rcu_tasks_trace_get_gp_data); + #else /* #ifdef CONFIG_TASKS_TRACE_RCU */ static void exit_tasks_rcu_finish_trace(struct task_struct *t) { } #endif /* #else #ifdef CONFIG_TASKS_TRACE_RCU */ diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index d9642dd06c25..60e79ed73700 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -508,17 +508,10 @@ static struct rcu_node *rcu_get_root(void) /* * Send along grace-period-related data for rcutorture diagnostics. */ -void rcutorture_get_gp_data(enum rcutorture_type test_type, int *flags, - unsigned long *gp_seq) +void rcutorture_get_gp_data(int *flags, unsigned long *gp_seq) { - switch (test_type) { - case RCU_FLAVOR: - *flags = READ_ONCE(rcu_state.gp_flags); - *gp_seq = rcu_seq_current(&rcu_state.gp_seq); - break; - default: - break; - } + *flags = READ_ONCE(rcu_state.gp_flags); + *gp_seq = rcu_seq_current(&rcu_state.gp_seq); } EXPORT_SYMBOL_GPL(rcutorture_get_gp_data); From 710cf51d3722012988b2b8d35dee3c553dcc5649 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Mon, 18 Mar 2024 17:34:12 +0800 Subject: [PATCH 28/32] rcutorture: Removing redundant function pointer initialization For these rcu_torture_ops structure's objects defined by using static, if the value of the function pointer in its member is not set, the default value will be NULL, this commit therefore remove the pre-existing initialization of function pointers to NULL. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcutorture.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 85ff8a32f75a..3f9c3766f52b 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -566,7 +566,6 @@ static struct rcu_torture_ops rcu_ops = { .call = call_rcu_hurry, .cb_barrier = rcu_barrier, .fqs = rcu_force_quiescent_state, - .stats = NULL, .gp_kthread_dbg = show_rcu_gp_kthreads, .check_boost_failed = rcu_check_boost_fail, .stall_dur = rcu_jiffies_till_stall_check, @@ -614,9 +613,6 @@ static struct rcu_torture_ops rcu_busted_ops = { .sync = synchronize_rcu_busted, .exp_sync = synchronize_rcu_busted, .call = call_rcu_busted, - .cb_barrier = NULL, - .fqs = NULL, - .stats = NULL, .irq_capable = 1, .name = "busted" }; @@ -847,8 +843,6 @@ static struct rcu_torture_ops trivial_ops = { .get_gp_seq = rcu_no_completed, .sync = synchronize_rcu_trivial, .exp_sync = synchronize_rcu_trivial, - .fqs = NULL, - .stats = NULL, .irq_capable = 1, .name = "trivial" }; @@ -892,8 +886,6 @@ static struct rcu_torture_ops tasks_ops = { .cb_barrier = rcu_barrier_tasks, .gp_kthread_dbg = show_rcu_tasks_classic_gp_kthread, .get_gp_data = rcu_tasks_get_gp_data, - .fqs = NULL, - .stats = NULL, .irq_capable = 1, .slow_gps = 1, .name = "tasks" @@ -934,8 +926,6 @@ static struct rcu_torture_ops tasks_rude_ops = { .gp_kthread_dbg = show_rcu_tasks_rude_gp_kthread, .get_gp_data = rcu_tasks_rude_get_gp_data, .cbflood_max = 50000, - .fqs = NULL, - .stats = NULL, .irq_capable = 1, .name = "tasks-rude" }; @@ -987,8 +977,6 @@ static struct rcu_torture_ops tasks_tracing_ops = { .gp_kthread_dbg = show_rcu_tasks_trace_gp_kthread, .get_gp_data = rcu_tasks_trace_get_gp_data, .cbflood_max = 50000, - .fqs = NULL, - .stats = NULL, .irq_capable = 1, .slow_gps = 1, .name = "tasks-tracing" From 431315a563015f259b28e34c5842f6166439e969 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Thu, 21 Mar 2024 16:28:50 +0800 Subject: [PATCH 29/32] rcutorture: Make stall-tasks directly exit when rcutorture tests end When the rcutorture tests start to exit, the rcu_torture_cleanup() is invoked to stop kthreads and release resources, if the stall-task kthreads exist, cpu-stall has started and the rcutorture.stall_cpu is set to a larger value, the rcu_torture_cleanup() will be blocked for a long time and the hung-task may occur, this commit therefore add kthread_should_stop() to the loop of cpu-stall operation, when rcutorture tests ends, no need to wait for cpu-stall to end, exit directly. Use the following command to test: insmod rcutorture.ko torture_type=srcu fwd_progress=0 stat_interval=4 stall_cpu_block=1 stall_cpu=200 stall_cpu_holdoff=10 read_exit_burst=0 object_debug=1 rmmod rcutorture [15361.918610] INFO: task rmmod:878 blocked for more than 122 seconds. [15361.918613] Tainted: G W 6.8.0-rc2-yoctodev-standard+ #25 [15361.918615] "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. [15361.918616] task:rmmod state:D stack:0 pid:878 tgid:878 ppid:773 flags:0x00004002 [15361.918621] Call Trace: [15361.918623] [15361.918626] __schedule+0xc0d/0x28f0 [15361.918631] ? __pfx___schedule+0x10/0x10 [15361.918635] ? rcu_is_watching+0x19/0xb0 [15361.918638] ? schedule+0x1f6/0x290 [15361.918642] ? __pfx_lock_release+0x10/0x10 [15361.918645] ? schedule+0xc9/0x290 [15361.918648] ? schedule+0xc9/0x290 [15361.918653] ? trace_preempt_off+0x54/0x100 [15361.918657] ? schedule+0xc9/0x290 [15361.918661] schedule+0xd0/0x290 [15361.918665] schedule_timeout+0x56d/0x7d0 [15361.918669] ? debug_smp_processor_id+0x1b/0x30 [15361.918672] ? rcu_is_watching+0x19/0xb0 [15361.918676] ? __pfx_schedule_timeout+0x10/0x10 [15361.918679] ? debug_smp_processor_id+0x1b/0x30 [15361.918683] ? rcu_is_watching+0x19/0xb0 [15361.918686] ? wait_for_completion+0x179/0x4c0 [15361.918690] ? __pfx_lock_release+0x10/0x10 [15361.918693] ? __kasan_check_write+0x18/0x20 [15361.918696] ? wait_for_completion+0x9d/0x4c0 [15361.918700] ? _raw_spin_unlock_irq+0x36/0x50 [15361.918703] ? wait_for_completion+0x179/0x4c0 [15361.918707] ? _raw_spin_unlock_irq+0x36/0x50 [15361.918710] ? wait_for_completion+0x179/0x4c0 [15361.918714] ? trace_preempt_on+0x54/0x100 [15361.918718] ? wait_for_completion+0x179/0x4c0 [15361.918723] wait_for_completion+0x181/0x4c0 [15361.918728] ? __pfx_wait_for_completion+0x10/0x10 [15361.918738] kthread_stop+0x152/0x470 [15361.918742] _torture_stop_kthread+0x44/0xc0 [torture 7af7f9cbba28271a10503b653f9e05d518fbc8c3] [15361.918752] rcu_torture_cleanup+0x2ac/0xe90 [rcutorture f2cb1f556ee7956270927183c4c2c7749a336529] [15361.918766] ? __pfx_rcu_torture_cleanup+0x10/0x10 [rcutorture f2cb1f556ee7956270927183c4c2c7749a336529] [15361.918777] ? __kasan_check_write+0x18/0x20 [15361.918781] ? __mutex_unlock_slowpath+0x17c/0x670 [15361.918789] ? __might_fault+0xcd/0x180 [15361.918793] ? find_module_all+0x104/0x1d0 [15361.918799] __x64_sys_delete_module+0x2a4/0x3f0 [15361.918803] ? __pfx___x64_sys_delete_module+0x10/0x10 [15361.918807] ? syscall_exit_to_user_mode+0x149/0x280 Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcutorture.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 3f9c3766f52b..456185d9e6c0 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -2489,8 +2489,8 @@ static int rcu_torture_stall(void *args) preempt_disable(); pr_alert("%s start on CPU %d.\n", __func__, raw_smp_processor_id()); - while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), - stop_at)) + while (ULONG_CMP_LT((unsigned long)ktime_get_seconds(), stop_at) && + !kthread_should_stop()) if (stall_cpu_block) { #ifdef CONFIG_PREEMPTION preempt_schedule(); From 668c0406d887467d53f8fe79261dda1d22d5b671 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Mon, 25 Mar 2024 15:52:19 +0800 Subject: [PATCH 30/32] rcutorture: Fix invalid context warning when enable srcu barrier testing When the torture_type is set srcu or srcud and cb_barrier is non-zero, running the rcutorture test will trigger the following warning: [ 163.910989][ C1] BUG: sleeping function called from invalid context at kernel/locking/spinlock_rt.c:48 [ 163.910994][ C1] in_atomic(): 1, irqs_disabled(): 1, non_block: 0, pid: 0, name: swapper/1 [ 163.910999][ C1] preempt_count: 10001, expected: 0 [ 163.911002][ C1] RCU nest depth: 0, expected: 0 [ 163.911005][ C1] INFO: lockdep is turned off. [ 163.911007][ C1] irq event stamp: 30964 [ 163.911010][ C1] hardirqs last enabled at (30963): [] do_idle+0x362/0x500 [ 163.911018][ C1] hardirqs last disabled at (30964): [] sysvec_call_function_single+0xf/0xd0 [ 163.911025][ C1] softirqs last enabled at (0): [] copy_process+0x16ff/0x6580 [ 163.911033][ C1] softirqs last disabled at (0): [<0000000000000000>] 0x0 [ 163.911038][ C1] Preemption disabled at: [ 163.911039][ C1] [] stack_depot_save_flags+0x24b/0x6c0 [ 163.911063][ C1] CPU: 1 PID: 0 Comm: swapper/1 Tainted: G W 6.8.0-rc4-rt4-yocto-preempt-rt+ #3 1e39aa9a737dd024a3275c4f835a872f673a7d3a [ 163.911071][ C1] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.16.2-0-gea1b7a073390-prebuilt.qemu.org 04/01/2014 [ 163.911075][ C1] Call Trace: [ 163.911078][ C1] [ 163.911080][ C1] dump_stack_lvl+0x88/0xd0 [ 163.911089][ C1] dump_stack+0x10/0x20 [ 163.911095][ C1] __might_resched+0x36f/0x530 [ 163.911105][ C1] rt_spin_lock+0x82/0x1c0 [ 163.911112][ C1] spin_lock_irqsave_ssp_contention+0xb8/0x100 [ 163.911121][ C1] srcu_gp_start_if_needed+0x782/0xf00 [ 163.911128][ C1] ? _raw_spin_unlock_irqrestore+0x46/0x70 [ 163.911136][ C1] ? debug_object_active_state+0x336/0x470 [ 163.911148][ C1] ? __pfx_srcu_gp_start_if_needed+0x10/0x10 [ 163.911156][ C1] ? __pfx_lock_release+0x10/0x10 [ 163.911165][ C1] ? __pfx_rcu_torture_barrier_cbf+0x10/0x10 [ 163.911188][ C1] __call_srcu+0x9f/0xe0 [ 163.911196][ C1] call_srcu+0x13/0x20 [ 163.911201][ C1] srcu_torture_call+0x1b/0x30 [ 163.911224][ C1] rcu_torture_barrier1cb+0x4a/0x60 [ 163.911247][ C1] __flush_smp_call_function_queue+0x267/0xca0 [ 163.911256][ C1] ? __pfx_rcu_torture_barrier1cb+0x10/0x10 [ 163.911281][ C1] generic_smp_call_function_single_interrupt+0x13/0x20 [ 163.911288][ C1] __sysvec_call_function_single+0x7d/0x280 [ 163.911295][ C1] sysvec_call_function_single+0x93/0xd0 [ 163.911302][ C1] [ 163.911304][ C1] [ 163.911308][ C1] asm_sysvec_call_function_single+0x1b/0x20 [ 163.911313][ C1] RIP: 0010:default_idle+0x17/0x20 [ 163.911326][ C1] RSP: 0018:ffff888001997dc8 EFLAGS: 00000246 [ 163.911333][ C1] RAX: 0000000000000000 RBX: dffffc0000000000 RCX: ffffffffae618b51 [ 163.911337][ C1] RDX: 0000000000000000 RSI: ffffffffaea80920 RDI: ffffffffaec2de80 [ 163.911342][ C1] RBP: ffff888001997dc8 R08: 0000000000000001 R09: ffffed100d740cad [ 163.911346][ C1] R10: ffffed100d740cac R11: ffff88806ba06563 R12: 0000000000000001 [ 163.911350][ C1] R13: ffffffffafe460c0 R14: ffffffffafe460c0 R15: 0000000000000000 [ 163.911358][ C1] ? ct_kernel_exit.constprop.3+0x121/0x160 [ 163.911369][ C1] ? lockdep_hardirqs_on+0xc4/0x150 [ 163.911376][ C1] arch_cpu_idle+0x9/0x10 [ 163.911383][ C1] default_idle_call+0x7a/0xb0 [ 163.911390][ C1] do_idle+0x362/0x500 [ 163.911398][ C1] ? __pfx_do_idle+0x10/0x10 [ 163.911404][ C1] ? complete_with_flags+0x8b/0xb0 [ 163.911416][ C1] cpu_startup_entry+0x58/0x70 [ 163.911423][ C1] start_secondary+0x221/0x280 [ 163.911430][ C1] ? __pfx_start_secondary+0x10/0x10 [ 163.911440][ C1] secondary_startup_64_no_verify+0x17f/0x18b [ 163.911455][ C1] This commit therefore use smp_call_on_cpu() instead of smp_call_function_single(), make rcu_torture_barrier1cb() invoked happens on task-context. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcutorture.c | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 456185d9e6c0..8654e99bd4a3 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -3044,11 +3044,12 @@ static void rcu_torture_barrier_cbf(struct rcu_head *rcu) } /* IPI handler to get callback posted on desired CPU, if online. */ -static void rcu_torture_barrier1cb(void *rcu_void) +static int rcu_torture_barrier1cb(void *rcu_void) { struct rcu_head *rhp = rcu_void; cur_ops->call(rhp, rcu_torture_barrier_cbf); + return 0; } /* kthread function to register callbacks used to test RCU barriers. */ @@ -3074,11 +3075,9 @@ static int rcu_torture_barrier_cbs(void *arg) * The above smp_load_acquire() ensures barrier_phase load * is ordered before the following ->call(). */ - if (smp_call_function_single(myid, rcu_torture_barrier1cb, - &rcu, 1)) { - // IPI failed, so use direct call from current CPU. + if (smp_call_on_cpu(myid, rcu_torture_barrier1cb, &rcu, 1)) cur_ops->call(&rcu, rcu_torture_barrier_cbf); - } + if (atomic_dec_and_test(&barrier_cbs_count)) wake_up(&barrier_wq); } while (!torture_must_stop()); From 39988fdc126b5148f102fc2afbc9fd92d8249f53 Mon Sep 17 00:00:00 2001 From: "Paul E. McKenney" Date: Thu, 28 Mar 2024 19:51:34 -0700 Subject: [PATCH 31/32] torture: Scale --do-kvfree test time Currently, the torture.sh --do-kvfree testing is hard-coded to ten minutes, ignoring the --duration argument. This commit therefore scales this test duration the same as for the rcutorture tests. Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- tools/testing/selftests/rcutorture/bin/torture.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/testing/selftests/rcutorture/bin/torture.sh b/tools/testing/selftests/rcutorture/bin/torture.sh index 13875ee7b050..990d24696fd3 100755 --- a/tools/testing/selftests/rcutorture/bin/torture.sh +++ b/tools/testing/selftests/rcutorture/bin/torture.sh @@ -559,7 +559,7 @@ do_kcsan="$do_kcsan_save" if test "$do_kvfree" = "yes" then torture_bootargs="rcuscale.kfree_rcu_test=1 rcuscale.kfree_nthreads=16 rcuscale.holdoff=20 rcuscale.kfree_loops=10000 torture.disable_onoff_at_boot" - torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration 10 --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 2G --trust-make + torture_set "rcuscale-kvfree" tools/testing/selftests/rcutorture/bin/kvm.sh --torture rcuscale --allcpus --duration $duration_rcutorture --kconfig "CONFIG_NR_CPUS=$HALF_ALLOTED_CPUS" --memory 2G --trust-make fi if test "$do_clocksourcewd" = "yes" From 1c67318b3d72e63c51646cf26aa4425ed6f34bc5 Mon Sep 17 00:00:00 2001 From: Zqiang Date: Fri, 29 Mar 2024 12:52:45 +0800 Subject: [PATCH 32/32] rcutorture: Use rcu_gp_slow_register/unregister() only for rcutype test The rcu_gp_slow_register/unregister() is only useful in tests where torture_type=rcu, so this commit therefore generates ->gp_slow_register() and ->gp_slow_unregister() function pointers in the rcu_torture_ops structure, and slows grace periods only when these function pointers exist. Signed-off-by: Zqiang Signed-off-by: Paul E. McKenney Signed-off-by: Uladzislau Rezki (Sony) --- kernel/rcu/rcutorture.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c index 8654e99bd4a3..807fbf6123a7 100644 --- a/kernel/rcu/rcutorture.c +++ b/kernel/rcu/rcutorture.c @@ -382,6 +382,8 @@ struct rcu_torture_ops { bool (*check_boost_failed)(unsigned long gp_state, int *cpup); int (*stall_dur)(void); void (*get_gp_data)(int *flags, unsigned long *gp_seq); + void (*gp_slow_register)(atomic_t *rgssp); + void (*gp_slow_unregister)(atomic_t *rgssp); long cbflood_max; int irq_capable; int can_boost; @@ -570,6 +572,8 @@ static struct rcu_torture_ops rcu_ops = { .check_boost_failed = rcu_check_boost_fail, .stall_dur = rcu_jiffies_till_stall_check, .get_gp_data = rcutorture_get_gp_data, + .gp_slow_register = rcu_gp_slow_register, + .gp_slow_unregister = rcu_gp_slow_unregister, .irq_capable = 1, .can_boost = IS_ENABLED(CONFIG_RCU_BOOST), .extendables = RCUTORTURE_MAX_EXTEND, @@ -3343,12 +3347,12 @@ rcu_torture_cleanup(void) pr_info("%s: Invoking %pS().\n", __func__, cur_ops->cb_barrier); cur_ops->cb_barrier(); } - rcu_gp_slow_unregister(NULL); + if (cur_ops->gp_slow_unregister) + cur_ops->gp_slow_unregister(NULL); return; } if (!cur_ops) { torture_cleanup_end(); - rcu_gp_slow_unregister(NULL); return; } @@ -3447,7 +3451,8 @@ rcu_torture_cleanup(void) else rcu_torture_print_module_parms(cur_ops, "End of test: SUCCESS"); torture_cleanup_end(); - rcu_gp_slow_unregister(&rcu_fwd_cb_nodelay); + if (cur_ops->gp_slow_unregister) + cur_ops->gp_slow_unregister(NULL); } #ifdef CONFIG_DEBUG_OBJECTS_RCU_HEAD @@ -3929,7 +3934,8 @@ rcu_torture_init(void) if (object_debug) rcu_test_debug_objects(); torture_init_end(); - rcu_gp_slow_register(&rcu_fwd_cb_nodelay); + if (cur_ops->gp_slow_register && !WARN_ON_ONCE(!cur_ops->gp_slow_unregister)) + cur_ops->gp_slow_register(&rcu_fwd_cb_nodelay); return 0; unwind: