summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAndrea Righi <arighi@nvidia.com>2025-05-30 17:46:29 +0200
committerTejun Heo <tj@kernel.org>2025-05-30 14:42:30 -1000
commitee9a4e92799d72b1a2237d76065562b1f1cf334f (patch)
tree58cd046b0a87ab7c5a9da9da872ec2af341922e7
parent0f70f5b08a47a3bc1a252e5f451a137cde7c98ce (diff)
sched_ext: idle: Properly handle invalid prev_cpu during idle selection
The default idle selection policy doesn't properly handle the case where @prev_cpu is not part of the task's allowed CPUs. In this situation, it may return an idle CPU that is not usable by the task, breaking the assumption that the returned CPU must always be within the allowed cpumask, causing inefficiencies or even stalls in certain cases. This issue can arise in the following cases: - The task's affinity may have changed by the time the function is invoked, especially now that the idle selection logic can be used from multiple contexts (i.e., BPF test_run call). - The BPF scheduler may provide a @prev_cpu that is not part of the allowed mask, either unintentionally or as a placement hint. In fact @prev_cpu may not necessarily refer to the CPU the task last ran on, but it can also be considered as a target CPU that the scheduler wishes to use for the task. Therefore, enforce the right behavior by always checking whether @prev_cpu is in the allowed mask, when using scx_bpf_select_cpu_and(), and it's also usable by the task (@p->cpus_ptr). If it is not, try to find a valid CPU nearby @prev_cpu, following the usual locality-aware fallback path (SMT, LLC, node, allowed CPUs). This ensures the returned CPU is always allowed, improving robustness to affinity changes and invalid scheduler hints, while preserving locality as much as possible. Fixes: a730e3f7a48bc ("sched_ext: idle: Consolidate default idle CPU selection kfuncs") Signed-off-by: Andrea Righi <arighi@nvidia.com> Signed-off-by: Tejun Heo <tj@kernel.org>
-rw-r--r--kernel/sched/ext_idle.c29
1 files changed, 11 insertions, 18 deletions
diff --git a/kernel/sched/ext_idle.c b/kernel/sched/ext_idle.c
index 66da03cc0b33..9477fbb8d625 100644
--- a/kernel/sched/ext_idle.c
+++ b/kernel/sched/ext_idle.c
@@ -447,11 +447,18 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
const struct cpumask *llc_cpus = NULL, *numa_cpus = NULL;
const struct cpumask *allowed = cpus_allowed ?: p->cpus_ptr;
int node = scx_cpu_node_if_enabled(prev_cpu);
+ bool is_prev_allowed;
s32 cpu;
preempt_disable();
/*
+ * Check whether @prev_cpu is still within the allowed set. If not,
+ * we can still try selecting a nearby CPU.
+ */
+ is_prev_allowed = cpumask_test_cpu(prev_cpu, allowed);
+
+ /*
* Determine the subset of CPUs usable by @p within @cpus_allowed.
*/
if (allowed != p->cpus_ptr) {
@@ -465,21 +472,6 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
cpu = -EBUSY;
goto out_enable;
}
-
- /*
- * If @prev_cpu is not in the allowed CPUs, skip topology
- * optimizations and try to pick any idle CPU usable by the
- * task.
- *
- * If %SCX_OPS_BUILTIN_IDLE_PER_NODE is enabled, prioritize
- * the current node, as it may optimize some waker->wakee
- * workloads.
- */
- if (!cpumask_test_cpu(prev_cpu, allowed)) {
- node = scx_cpu_node_if_enabled(smp_processor_id());
- cpu = scx_pick_idle_cpu(allowed, node, flags);
- goto out_enable;
- }
}
/*
@@ -525,7 +517,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
* then avoid a migration.
*/
cpu = smp_processor_id();
- if (cpus_share_cache(cpu, prev_cpu) &&
+ if (is_prev_allowed && cpus_share_cache(cpu, prev_cpu) &&
scx_idle_test_and_clear_cpu(prev_cpu)) {
cpu = prev_cpu;
goto out_unlock;
@@ -562,7 +554,8 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
/*
* Keep using @prev_cpu if it's part of a fully idle core.
*/
- if (cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) &&
+ if (is_prev_allowed &&
+ cpumask_test_cpu(prev_cpu, idle_cpumask(node)->smt) &&
scx_idle_test_and_clear_cpu(prev_cpu)) {
cpu = prev_cpu;
goto out_unlock;
@@ -611,7 +604,7 @@ s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags,
/*
* Use @prev_cpu if it's idle.
*/
- if (scx_idle_test_and_clear_cpu(prev_cpu)) {
+ if (is_prev_allowed && scx_idle_test_and_clear_cpu(prev_cpu)) {
cpu = prev_cpu;
goto out_unlock;
}