summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--arch/x86/kernel/process.c1
-rw-r--r--drivers/base/power/runtime.c4
-rw-r--r--drivers/cpuidle/Kconfig1
-rw-r--r--drivers/cpuidle/Kconfig.arm10
-rw-r--r--drivers/cpuidle/cpuidle-haltpoll.c2
-rw-r--r--drivers/cpuidle/cpuidle-psci-domain.c7
-rw-r--r--drivers/cpuidle/cpuidle-psci.c3
-rw-r--r--drivers/cpuidle/driver.c4
-rw-r--r--drivers/cpuidle/governors/teo.c102
-rw-r--r--drivers/cpuidle/sysfs.c6
-rw-r--r--drivers/idle/intel_idle.c2
-rw-r--r--include/linux/pm.h4
-rw-r--r--kernel/power/Kconfig1
-rw-r--r--kernel/power/swap.c16
14 files changed, 144 insertions, 19 deletions
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 40d156a31676..00a831d56c50 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -721,6 +721,7 @@ void arch_cpu_idle(void)
{
x86_idle();
}
+EXPORT_SYMBOL_GPL(arch_cpu_idle);
/*
* We use this if we don't have any better idle routine..
diff --git a/drivers/base/power/runtime.c b/drivers/base/power/runtime.c
index 50e726b6c2cf..b29be7d4d7d0 100644
--- a/drivers/base/power/runtime.c
+++ b/drivers/base/power/runtime.c
@@ -1864,6 +1864,10 @@ static bool pm_runtime_need_not_resume(struct device *dev)
* sure the device is put into low power state and it should only be used during
* system-wide PM transitions to sleep states. It assumes that the analogous
* pm_runtime_force_resume() will be used to resume the device.
+ *
+ * Do not use with DPM_FLAG_SMART_SUSPEND as this can lead to an inconsistent
+ * state where this function has called the ->runtime_suspend callback but the
+ * PM core marks the driver as runtime active.
*/
int pm_runtime_force_suspend(struct device *dev)
{
diff --git a/drivers/cpuidle/Kconfig b/drivers/cpuidle/Kconfig
index ff71dd662880..cac5997dca50 100644
--- a/drivers/cpuidle/Kconfig
+++ b/drivers/cpuidle/Kconfig
@@ -74,6 +74,7 @@ endmenu
config HALTPOLL_CPUIDLE
tristate "Halt poll cpuidle driver"
depends on X86 && KVM_GUEST
+ select CPU_IDLE_GOV_HALTPOLL
default y
help
This option enables halt poll cpuidle driver, which allows to poll
diff --git a/drivers/cpuidle/Kconfig.arm b/drivers/cpuidle/Kconfig.arm
index 747aa537389b..a1ee475d180d 100644
--- a/drivers/cpuidle/Kconfig.arm
+++ b/drivers/cpuidle/Kconfig.arm
@@ -24,6 +24,14 @@ config ARM_PSCI_CPUIDLE
It provides an idle driver that is capable of detecting and
managing idle states through the PSCI firmware interface.
+ The driver has limitations when used with PREEMPT_RT:
+ - If the idle states are described with the non-hierarchical layout,
+ all idle states are still available.
+
+ - If the idle states are described with the hierarchical layout,
+ only the idle states defined per CPU are available, but not the ones
+ being shared among a group of CPUs (aka cluster idle states).
+
config ARM_PSCI_CPUIDLE_DOMAIN
bool "PSCI CPU idle Domain"
depends on ARM_PSCI_CPUIDLE
@@ -102,6 +110,7 @@ config ARM_MVEBU_V7_CPUIDLE
config ARM_TEGRA_CPUIDLE
bool "CPU Idle Driver for NVIDIA Tegra SoCs"
depends on (ARCH_TEGRA || COMPILE_TEST) && !ARM64 && MMU
+ depends on ARCH_SUSPEND_POSSIBLE
select ARCH_NEEDS_CPU_IDLE_COUPLED if SMP
select ARM_CPU_SUSPEND
help
@@ -110,6 +119,7 @@ config ARM_TEGRA_CPUIDLE
config ARM_QCOM_SPM_CPUIDLE
bool "CPU Idle Driver for Qualcomm Subsystem Power Manager (SPM)"
depends on (ARCH_QCOM || COMPILE_TEST) && !ARM64 && MMU
+ depends on ARCH_SUSPEND_POSSIBLE
select ARM_CPU_SUSPEND
select CPU_IDLE_MULTIPLE_DRIVERS
select DT_IDLE_STATES
diff --git a/drivers/cpuidle/cpuidle-haltpoll.c b/drivers/cpuidle/cpuidle-haltpoll.c
index 3a39a7f48b77..e66df22f9695 100644
--- a/drivers/cpuidle/cpuidle-haltpoll.c
+++ b/drivers/cpuidle/cpuidle-haltpoll.c
@@ -32,7 +32,7 @@ static int default_enter_idle(struct cpuidle_device *dev,
local_irq_enable();
return index;
}
- default_idle();
+ arch_cpu_idle();
return index;
}
diff --git a/drivers/cpuidle/cpuidle-psci-domain.c b/drivers/cpuidle/cpuidle-psci-domain.c
index c80cf9ddabd8..6ad2954948a5 100644
--- a/drivers/cpuidle/cpuidle-psci-domain.c
+++ b/drivers/cpuidle/cpuidle-psci-domain.c
@@ -64,8 +64,11 @@ static int psci_pd_init(struct device_node *np, bool use_osi)
pd->flags |= GENPD_FLAG_IRQ_SAFE | GENPD_FLAG_CPU_DOMAIN;
- /* Allow power off when OSI has been successfully enabled. */
- if (use_osi)
+ /*
+ * Allow power off when OSI has been successfully enabled.
+ * PREEMPT_RT is not yet ready to enter domain idle states.
+ */
+ if (use_osi && !IS_ENABLED(CONFIG_PREEMPT_RT))
pd->power_off = psci_pd_power_off;
else
pd->flags |= GENPD_FLAG_ALWAYS_ON;
diff --git a/drivers/cpuidle/cpuidle-psci.c b/drivers/cpuidle/cpuidle-psci.c
index 57bc3e3ae391..68467a325dcb 100644
--- a/drivers/cpuidle/cpuidle-psci.c
+++ b/drivers/cpuidle/cpuidle-psci.c
@@ -231,6 +231,9 @@ static int psci_dt_cpu_init_topology(struct cpuidle_driver *drv,
if (!psci_has_osi_support())
return 0;
+ if (IS_ENABLED(CONFIG_PREEMPT_RT))
+ return 0;
+
data->dev = psci_dt_attach_cpu(cpu);
if (IS_ERR_OR_NULL(data->dev))
return PTR_ERR_OR_ZERO(data->dev);
diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c
index f70aa17e2a8e..d9cda7f6ccb9 100644
--- a/drivers/cpuidle/driver.c
+++ b/drivers/cpuidle/driver.c
@@ -183,11 +183,15 @@ static void __cpuidle_driver_init(struct cpuidle_driver *drv)
s->target_residency_ns = s->target_residency * NSEC_PER_USEC;
else if (s->target_residency_ns < 0)
s->target_residency_ns = 0;
+ else
+ s->target_residency = div_u64(s->target_residency_ns, NSEC_PER_USEC);
if (s->exit_latency > 0)
s->exit_latency_ns = s->exit_latency * NSEC_PER_USEC;
else if (s->exit_latency_ns < 0)
s->exit_latency_ns = 0;
+ else
+ s->exit_latency = div_u64(s->exit_latency_ns, NSEC_PER_USEC);
}
}
diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c
index d9262db79cae..987fc5f3997d 100644
--- a/drivers/cpuidle/governors/teo.c
+++ b/drivers/cpuidle/governors/teo.c
@@ -2,8 +2,13 @@
/*
* Timer events oriented CPU idle governor
*
+ * TEO governor:
* Copyright (C) 2018 - 2021 Intel Corporation
* Author: Rafael J. Wysocki <rafael.j.wysocki@intel.com>
+ *
+ * Util-awareness mechanism:
+ * Copyright (C) 2022 Arm Ltd.
+ * Author: Kajetan Puchalski <kajetan.puchalski@arm.com>
*/
/**
@@ -99,15 +104,56 @@
* select the given idle state instead of the candidate one.
*
* 3. By default, select the candidate state.
+ *
+ * Util-awareness mechanism:
+ *
+ * The idea behind the util-awareness extension is that there are two distinct
+ * scenarios for the CPU which should result in two different approaches to idle
+ * state selection - utilized and not utilized.
+ *
+ * In this case, 'utilized' means that the average runqueue util of the CPU is
+ * above a certain threshold.
+ *
+ * When the CPU is utilized while going into idle, more likely than not it will
+ * be woken up to do more work soon and so a shallower idle state should be
+ * selected to minimise latency and maximise performance. When the CPU is not
+ * being utilized, the usual metrics-based approach to selecting the deepest
+ * available idle state should be preferred to take advantage of the power
+ * saving.
+ *
+ * In order to achieve this, the governor uses a utilization threshold.
+ * The threshold is computed per-CPU as a percentage of the CPU's capacity
+ * by bit shifting the capacity value. Based on testing, the shift of 6 (~1.56%)
+ * seems to be getting the best results.
+ *
+ * Before selecting the next idle state, the governor compares the current CPU
+ * util to the precomputed util threshold. If it's below, it defaults to the
+ * TEO metrics mechanism. If it's above, the closest shallower idle state will
+ * be selected instead, as long as is not a polling state.
*/
#include <linux/cpuidle.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
+#include <linux/sched.h>
#include <linux/sched/clock.h>
+#include <linux/sched/topology.h>
#include <linux/tick.h>
/*
+ * The number of bits to shift the CPU's capacity by in order to determine
+ * the utilized threshold.
+ *
+ * 6 was chosen based on testing as the number that achieved the best balance
+ * of power and performance on average.
+ *
+ * The resulting threshold is high enough to not be triggered by background
+ * noise and low enough to react quickly when activity starts to ramp up.
+ */
+#define UTIL_THRESHOLD_SHIFT 6
+
+
+/*
* The PULSE value is added to metrics when they grow and the DECAY_SHIFT value
* is used for decreasing metrics on a regular basis.
*/
@@ -137,9 +183,11 @@ struct teo_bin {
* @time_span_ns: Time between idle state selection and post-wakeup update.
* @sleep_length_ns: Time till the closest timer event (at the selection time).
* @state_bins: Idle state data bins for this CPU.
- * @total: Grand total of the "intercepts" and "hits" mertics for all bins.
+ * @total: Grand total of the "intercepts" and "hits" metrics for all bins.
* @next_recent_idx: Index of the next @recent_idx entry to update.
* @recent_idx: Indices of bins corresponding to recent "intercepts".
+ * @util_threshold: Threshold above which the CPU is considered utilized
+ * @utilized: Whether the last sleep on the CPU happened while utilized
*/
struct teo_cpu {
s64 time_span_ns;
@@ -148,11 +196,30 @@ struct teo_cpu {
unsigned int total;
int next_recent_idx;
int recent_idx[NR_RECENT];
+ unsigned long util_threshold;
+ bool utilized;
};
static DEFINE_PER_CPU(struct teo_cpu, teo_cpus);
/**
+ * teo_cpu_is_utilized - Check if the CPU's util is above the threshold
+ * @cpu: Target CPU
+ * @cpu_data: Governor CPU data for the target CPU
+ */
+#ifdef CONFIG_SMP
+static bool teo_cpu_is_utilized(int cpu, struct teo_cpu *cpu_data)
+{
+ return sched_cpu_util(cpu) > cpu_data->util_threshold;
+}
+#else
+static bool teo_cpu_is_utilized(int cpu, struct teo_cpu *cpu_data)
+{
+ return false;
+}
+#endif
+
+/**
* teo_update - Update CPU metrics after wakeup.
* @drv: cpuidle driver containing state data.
* @dev: Target CPU.
@@ -258,15 +325,17 @@ static s64 teo_middle_of_bin(int idx, struct cpuidle_driver *drv)
* @dev: Target CPU.
* @state_idx: Index of the capping idle state.
* @duration_ns: Idle duration value to match.
+ * @no_poll: Don't consider polling states.
*/
static int teo_find_shallower_state(struct cpuidle_driver *drv,
struct cpuidle_device *dev, int state_idx,
- s64 duration_ns)
+ s64 duration_ns, bool no_poll)
{
int i;
for (i = state_idx - 1; i >= 0; i--) {
- if (dev->states_usage[i].disable)
+ if (dev->states_usage[i].disable ||
+ (no_poll && drv->states[i].flags & CPUIDLE_FLAG_POLLING))
continue;
state_idx = i;
@@ -321,6 +390,22 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
goto end;
}
+ cpu_data->utilized = teo_cpu_is_utilized(dev->cpu, cpu_data);
+ /*
+ * If the CPU is being utilized over the threshold and there are only 2
+ * states to choose from, the metrics need not be considered, so choose
+ * the shallowest non-polling state and exit.
+ */
+ if (drv->state_count < 3 && cpu_data->utilized) {
+ for (i = 0; i < drv->state_count; ++i) {
+ if (!dev->states_usage[i].disable &&
+ !(drv->states[i].flags & CPUIDLE_FLAG_POLLING)) {
+ idx = i;
+ goto end;
+ }
+ }
+ }
+
/*
* Find the deepest idle state whose target residency does not exceed
* the current sleep length and the deepest idle state not deeper than
@@ -452,6 +537,13 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
if (idx > constraint_idx)
idx = constraint_idx;
+ /*
+ * If the CPU is being utilized over the threshold, choose a shallower
+ * non-polling state to improve latency
+ */
+ if (cpu_data->utilized)
+ idx = teo_find_shallower_state(drv, dev, idx, duration_ns, true);
+
end:
/*
* Don't stop the tick if the selected state is a polling one or if the
@@ -469,7 +561,7 @@ end:
*/
if (idx > idx0 &&
drv->states[idx].target_residency_ns > delta_tick)
- idx = teo_find_shallower_state(drv, dev, idx, delta_tick);
+ idx = teo_find_shallower_state(drv, dev, idx, delta_tick, false);
}
return idx;
@@ -508,9 +600,11 @@ static int teo_enable_device(struct cpuidle_driver *drv,
struct cpuidle_device *dev)
{
struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu);
+ unsigned long max_capacity = arch_scale_cpu_capacity(dev->cpu);
int i;
memset(cpu_data, 0, sizeof(*cpu_data));
+ cpu_data->util_threshold = max_capacity >> UTIL_THRESHOLD_SHIFT;
for (i = 0; i < NR_RECENT; i++)
cpu_data->recent_idx[i] = -1;
diff --git a/drivers/cpuidle/sysfs.c b/drivers/cpuidle/sysfs.c
index 2b496a53cbca..48948b171749 100644
--- a/drivers/cpuidle/sysfs.c
+++ b/drivers/cpuidle/sysfs.c
@@ -200,7 +200,7 @@ static void cpuidle_sysfs_release(struct kobject *kobj)
complete(&kdev->kobj_unregister);
}
-static struct kobj_type ktype_cpuidle = {
+static const struct kobj_type ktype_cpuidle = {
.sysfs_ops = &cpuidle_sysfs_ops,
.release = cpuidle_sysfs_release,
};
@@ -447,7 +447,7 @@ static void cpuidle_state_sysfs_release(struct kobject *kobj)
complete(&state_obj->kobj_unregister);
}
-static struct kobj_type ktype_state_cpuidle = {
+static const struct kobj_type ktype_state_cpuidle = {
.sysfs_ops = &cpuidle_state_sysfs_ops,
.default_groups = cpuidle_state_default_groups,
.release = cpuidle_state_sysfs_release,
@@ -594,7 +594,7 @@ static struct attribute *cpuidle_driver_default_attrs[] = {
};
ATTRIBUTE_GROUPS(cpuidle_driver_default);
-static struct kobj_type ktype_driver_cpuidle = {
+static const struct kobj_type ktype_driver_cpuidle = {
.sysfs_ops = &cpuidle_driver_sysfs_ops,
.default_groups = cpuidle_driver_default_groups,
.release = cpuidle_driver_sysfs_release,
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index cfeb24d40d37..bb3d10099ba4 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -1430,6 +1430,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_L, &idle_cpu_adl_l),
X86_MATCH_INTEL_FAM6_MODEL(ALDERLAKE_N, &idle_cpu_adl_n),
X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &idle_cpu_spr),
+ X86_MATCH_INTEL_FAM6_MODEL(EMERALDRAPIDS_X, &idle_cpu_spr),
X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &idle_cpu_knl),
X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &idle_cpu_knl),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &idle_cpu_bxt),
@@ -1862,6 +1863,7 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
skx_idle_state_table_update();
break;
case INTEL_FAM6_SAPPHIRERAPIDS_X:
+ case INTEL_FAM6_EMERALDRAPIDS_X:
spr_idle_state_table_update();
break;
case INTEL_FAM6_ALDERLAKE:
diff --git a/include/linux/pm.h b/include/linux/pm.h
index 93cd34f00822..035d9649eba4 100644
--- a/include/linux/pm.h
+++ b/include/linux/pm.h
@@ -379,9 +379,13 @@ const struct dev_pm_ops name = { \
const struct dev_pm_ops name; \
__EXPORT_SYMBOL(name, sec, ns); \
const struct dev_pm_ops name
+#define EXPORT_PM_FN_GPL(name) EXPORT_SYMBOL_GPL(name)
+#define EXPORT_PM_FN_NS_GPL(name, ns) EXPORT_SYMBOL_NS_GPL(name, ns)
#else
#define _EXPORT_DEV_PM_OPS(name, sec, ns) \
static __maybe_unused const struct dev_pm_ops __static_##name
+#define EXPORT_PM_FN_GPL(name)
+#define EXPORT_PM_FN_NS_GPL(name, ns)
#endif
#define EXPORT_DEV_PM_OPS(name) _EXPORT_DEV_PM_OPS(name, "", "")
diff --git a/kernel/power/Kconfig b/kernel/power/Kconfig
index 60a1d3051cc7..4b31629c5be4 100644
--- a/kernel/power/Kconfig
+++ b/kernel/power/Kconfig
@@ -118,7 +118,6 @@ config PM_SLEEP
def_bool y
depends on SUSPEND || HIBERNATE_CALLBACKS
select PM
- select SRCU
config PM_SLEEP_SMP
def_bool y
diff --git a/kernel/power/swap.c b/kernel/power/swap.c
index 277434b6c0bf..36a1df48280c 100644
--- a/kernel/power/swap.c
+++ b/kernel/power/swap.c
@@ -581,7 +581,7 @@ static int save_image(struct swap_map_handle *handle,
return ret;
}
-/**
+/*
* Structure used for CRC32.
*/
struct crc_data {
@@ -596,7 +596,7 @@ struct crc_data {
unsigned char *unc[LZO_THREADS]; /* uncompressed data */
};
-/**
+/*
* CRC32 update function that runs in its own thread.
*/
static int crc32_threadfn(void *data)
@@ -623,7 +623,7 @@ static int crc32_threadfn(void *data)
}
return 0;
}
-/**
+/*
* Structure used for LZO data compression.
*/
struct cmp_data {
@@ -640,7 +640,7 @@ struct cmp_data {
unsigned char wrk[LZO1X_1_MEM_COMPRESS]; /* compression workspace */
};
-/**
+/*
* Compression function that runs in its own thread.
*/
static int lzo_compress_threadfn(void *data)
@@ -948,9 +948,9 @@ out_finish:
return error;
}
-/**
+/*
* The following functions allow us to read data using a swap map
- * in a file-alike way
+ * in a file-like way.
*/
static void release_swap_reader(struct swap_map_handle *handle)
@@ -1107,7 +1107,7 @@ static int load_image(struct swap_map_handle *handle,
return ret;
}
-/**
+/*
* Structure used for LZO data decompression.
*/
struct dec_data {
@@ -1123,7 +1123,7 @@ struct dec_data {
unsigned char cmp[LZO_CMP_SIZE]; /* compressed buffer */
};
-/**
+/*
* Decompression function that runs in its own thread.
*/
static int lzo_decompress_threadfn(void *data)