-rw-r--r--  drivers/cpuidle/cpuidle-arm.c       |  2
-rw-r--r--  drivers/cpuidle/cpuidle-qcom-spm.c  |  2
-rw-r--r--  drivers/cpuidle/cpuidle.c           |  2
-rw-r--r--  drivers/cpuidle/driver.c            |  4
-rw-r--r--  drivers/cpuidle/governors/menu.c    | 76
-rw-r--r--  drivers/idle/intel_idle.c           | 48
-rw-r--r--  include/linux/energy_model.h        | 29
-rw-r--r--  kernel/power/energy_model.c         | 52
8 files changed, 135 insertions(+), 80 deletions(-)
diff --git a/drivers/cpuidle/cpuidle-arm.c b/drivers/cpuidle/cpuidle-arm.c
index 7cfb980a357d..caba6f4bb1b7 100644
--- a/drivers/cpuidle/cpuidle-arm.c
+++ b/drivers/cpuidle/cpuidle-arm.c
@@ -139,7 +139,7 @@ out_kfree_drv:
*
* Initializes arm cpuidle driver for all CPUs, if any CPU fails
* to register cpuidle driver then rollback to cancel all CPUs
- * registeration.
+ * registration.
*/
static int __init arm_idle_init(void)
{
diff --git a/drivers/cpuidle/cpuidle-qcom-spm.c b/drivers/cpuidle/cpuidle-qcom-spm.c
index 1fc9968eae19..3ab240e0e122 100644
--- a/drivers/cpuidle/cpuidle-qcom-spm.c
+++ b/drivers/cpuidle/cpuidle-qcom-spm.c
@@ -48,7 +48,7 @@ static int qcom_cpu_spc(struct spm_driver_data *drv)
ret = cpu_suspend(0, qcom_pm_collapse);
/*
* ARM common code executes WFI without calling into our driver and
- * if the SPM mode is not reset, then we may accidently power down the
+ * if the SPM mode is not reset, then we may accidentally power down the
* cpu when we intended only to gate the cpu clock.
* Ensure the state is set to standby before returning.
*/
diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index 9e418aec1755..06ace16f9e71 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -406,7 +406,7 @@ void cpuidle_reflect(struct cpuidle_device *dev, int index)
* Min polling interval of 10usec is a guess. It is assuming that
* for most users, the time for a single ping-pong workload like
* perf bench pipe would generally complete within 10usec but
- * this is hardware dependant. Actual time can be estimated with
+ * this is hardware dependent. Actual time can be estimated with
*
* perf bench sched pipe -l 10000
*
diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c
index cf5873cc45dc..9bbfa594c442 100644
--- a/drivers/cpuidle/driver.c
+++ b/drivers/cpuidle/driver.c
@@ -261,7 +261,7 @@ static void __cpuidle_unregister_driver(struct cpuidle_driver *drv)
* @drv: a pointer to a valid struct cpuidle_driver
*
* Register the driver under a lock to prevent concurrent attempts to
- * [un]register the driver from occuring at the same time.
+ * [un]register the driver from occurring at the same time.
*
* Returns 0 on success, a negative error code (returned by
* __cpuidle_register_driver()) otherwise.
@@ -296,7 +296,7 @@ EXPORT_SYMBOL_GPL(cpuidle_register_driver);
* @drv: a pointer to a valid struct cpuidle_driver
*
* Unregisters the cpuidle driver under a lock to prevent concurrent attempts
- * to [un]register the driver from occuring at the same time. @drv has to
+ * to [un]register the driver from occurring at the same time. @drv has to
* match the currently registered driver.
*/
void cpuidle_unregister_driver(struct cpuidle_driver *drv)
diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c
index f3c9d49f0f2a..28363bfa3e4c 100644
--- a/drivers/cpuidle/governors/menu.c
+++ b/drivers/cpuidle/governors/menu.c
@@ -19,7 +19,7 @@
#include "gov.h"
-#define BUCKETS 12
+#define BUCKETS 6
#define INTERVAL_SHIFT 3
#define INTERVALS (1UL << INTERVAL_SHIFT)
#define RESOLUTION 1024
@@ -29,12 +29,11 @@
/*
* Concepts and ideas behind the menu governor
*
- * For the menu governor, there are 3 decision factors for picking a C
+ * For the menu governor, there are 2 decision factors for picking a C
* state:
* 1) Energy break even point
- * 2) Performance impact
- * 3) Latency tolerance (from pmqos infrastructure)
- * These three factors are treated independently.
+ * 2) Latency tolerance (from pmqos infrastructure)
+ * These two factors are treated independently.
*
* Energy break even point
* -----------------------
@@ -75,30 +74,6 @@
* intervals and if the standard deviation of these 8 intervals is below a
* threshold value, we use the average of these intervals as prediction.
*
- * Limiting Performance Impact
- * ---------------------------
- * C states, especially those with large exit latencies, can have a real
- * noticeable impact on workloads, which is not acceptable for most sysadmins,
- * and in addition, less performance has a power price of its own.
- *
- * As a general rule of thumb, menu assumes that the following heuristic
- * holds:
- * The busier the system, the less impact of C states is acceptable
- *
- * This rule-of-thumb is implemented using a performance-multiplier:
- * If the exit latency times the performance multiplier is longer than
- * the predicted duration, the C state is not considered a candidate
- * for selection due to a too high performance impact. So the higher
- * this multiplier is, the longer we need to be idle to pick a deep C
- * state, and thus the less likely a busy CPU will hit such a deep
- * C state.
- *
- * Currently there is only one value determining the factor:
- * 10 points are added for each process that is waiting for IO on this CPU.
- * (This value was experimentally determined.)
- * Utilization is no longer a factor as it was shown that it never contributed
- * significantly to the performance multiplier in the first place.
- *
*/
struct menu_device {
@@ -112,19 +87,10 @@ struct menu_device {
int interval_ptr;
};
-static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters)
+static inline int which_bucket(u64 duration_ns)
{
int bucket = 0;
- /*
- * We keep two groups of stats; one with no
- * IO pending, one without.
- * This allows us to calculate
- * E(duration)|iowait
- */
- if (nr_iowaiters)
- bucket = BUCKETS/2;
-
if (duration_ns < 10ULL * NSEC_PER_USEC)
return bucket;
if (duration_ns < 100ULL * NSEC_PER_USEC)
@@ -138,19 +104,6 @@ static inline int which_bucket(u64 duration_ns, unsigned int nr_iowaiters)
return bucket + 5;
}
-/*
- * Return a multiplier for the exit latency that is intended
- * to take performance requirements into account.
- * The more performance critical we estimate the system
- * to be, the higher this multiplier, and thus the higher
- * the barrier to go to an expensive C state.
- */
-static inline int performance_multiplier(unsigned int nr_iowaiters)
-{
- /* for IO wait tasks (per cpu!) we add 10x each */
- return 1 + 10 * nr_iowaiters;
-}
-
static DEFINE_PER_CPU(struct menu_device, menu_devices);
static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev);
@@ -258,8 +211,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
struct menu_device *data = this_cpu_ptr(&menu_devices);
s64 latency_req = cpuidle_governor_latency_req(dev->cpu);
u64 predicted_ns;
- u64 interactivity_req;
- unsigned int nr_iowaiters;
ktime_t delta, delta_tick;
int i, idx;
@@ -268,8 +219,6 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
data->needs_update = 0;
}
- nr_iowaiters = nr_iowait_cpu(dev->cpu);
-
/* Find the shortest expected idle interval. */
predicted_ns = get_typical_interval(data) * NSEC_PER_USEC;
if (predicted_ns > RESIDENCY_THRESHOLD_NS) {
@@ -283,7 +232,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
}
data->next_timer_ns = delta;
- data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters);
+ data->bucket = which_bucket(data->next_timer_ns);
/* Round up the result for half microseconds. */
timer_us = div_u64((RESOLUTION * DECAY * NSEC_PER_USEC) / 2 +
@@ -301,7 +250,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
*/
data->next_timer_ns = KTIME_MAX;
delta_tick = TICK_NSEC / 2;
- data->bucket = which_bucket(KTIME_MAX, nr_iowaiters);
+ data->bucket = which_bucket(KTIME_MAX);
}
if (unlikely(drv->state_count <= 1 || latency_req == 0) ||
@@ -328,15 +277,8 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev,
*/
if (predicted_ns < TICK_NSEC)
predicted_ns = data->next_timer_ns;
- } else {
- /*
- * Use the performance multiplier and the user-configurable
- * latency_req to determine the maximum exit latency.
- */
- interactivity_req = div64_u64(predicted_ns,
- performance_multiplier(nr_iowaiters));
- if (latency_req > interactivity_req)
- latency_req = interactivity_req;
+ } else if (latency_req > predicted_ns) {
+ latency_req = predicted_ns;
}
/*
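The net effect of the menu.c hunks is easier to see outside diff context. Below is a minimal userspace model, assuming the which_bucket() thresholds elided from the hunk keep mainline's decade spacing (1ms and 10ms branches between the 100us and 100ms cases); it is an illustrative sketch, not the kernel code.

#include <stdint.h>
#include <stdio.h>

#define NSEC_PER_USEC 1000ULL

/* With the iowait offset gone, the bucket index depends only on the
 * expected idle duration; BUCKETS shrinks from 12 to 6. */
static int which_bucket(uint64_t duration_ns)
{
	if (duration_ns < 10 * NSEC_PER_USEC)
		return 0;
	if (duration_ns < 100 * NSEC_PER_USEC)
		return 1;
	if (duration_ns < 1000 * NSEC_PER_USEC)
		return 2;
	if (duration_ns < 10000 * NSEC_PER_USEC)
		return 3;
	if (duration_ns < 100000 * NSEC_PER_USEC)
		return 4;
	return 5;
}

/* The performance multiplier is gone as well: the pmqos latency limit
 * is now simply capped by the predicted idle duration instead of being
 * divided by 1 + 10 * nr_iowaiters. */
static int64_t effective_latency_req(int64_t latency_req,
				     int64_t predicted_ns)
{
	return latency_req > predicted_ns ? predicted_ns : latency_req;
}

int main(void)
{
	printf("bucket(50us) = %d\n", which_bucket(50 * NSEC_PER_USEC));
	printf("latency_req = %lld ns\n",
	       (long long)effective_latency_req(2000000, 500000));
	return 0;
}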
diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c
index 67aebfe0fed6..ac4d8faa3886 100644
--- a/drivers/idle/intel_idle.c
+++ b/drivers/idle/intel_idle.c
@@ -1069,6 +1069,47 @@ static struct cpuidle_state gnr_cstates[] __initdata = {
.enter = NULL }
};
+static struct cpuidle_state gnrd_cstates[] __initdata = {
+ {
+ .name = "C1",
+ .desc = "MWAIT 0x00",
+ .flags = MWAIT2flg(0x00),
+ .exit_latency = 1,
+ .target_residency = 1,
+ .enter = &intel_idle,
+ .enter_s2idle = intel_idle_s2idle, },
+ {
+ .name = "C1E",
+ .desc = "MWAIT 0x01",
+ .flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE,
+ .exit_latency = 4,
+ .target_residency = 4,
+ .enter = &intel_idle,
+ .enter_s2idle = intel_idle_s2idle, },
+ {
+ .name = "C6",
+ .desc = "MWAIT 0x20",
+ .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED |
+ CPUIDLE_FLAG_INIT_XSTATE |
+ CPUIDLE_FLAG_PARTIAL_HINT_MATCH,
+ .exit_latency = 220,
+ .target_residency = 650,
+ .enter = &intel_idle,
+ .enter_s2idle = intel_idle_s2idle, },
+ {
+ .name = "C6P",
+ .desc = "MWAIT 0x21",
+ .flags = MWAIT2flg(0x21) | CPUIDLE_FLAG_TLB_FLUSHED |
+ CPUIDLE_FLAG_INIT_XSTATE |
+ CPUIDLE_FLAG_PARTIAL_HINT_MATCH,
+ .exit_latency = 240,
+ .target_residency = 750,
+ .enter = &intel_idle,
+ .enter_s2idle = intel_idle_s2idle, },
+ {
+ .enter = NULL }
+};
+
static struct cpuidle_state atom_cstates[] __initdata = {
{
.name = "C1E",
@@ -1508,6 +1549,12 @@ static const struct idle_cpu idle_cpu_gnr __initconst = {
.use_acpi = true,
};
+static const struct idle_cpu idle_cpu_gnrd __initconst = {
+ .state_table = gnrd_cstates,
+ .disable_promotion_to_c1e = true,
+ .use_acpi = true,
+};
+
static const struct idle_cpu idle_cpu_avn __initconst = {
.state_table = avn_cstates,
.disable_promotion_to_c1e = true,
@@ -1593,6 +1640,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
X86_MATCH_VFM(INTEL_SAPPHIRERAPIDS_X, &idle_cpu_spr),
X86_MATCH_VFM(INTEL_EMERALDRAPIDS_X, &idle_cpu_spr),
X86_MATCH_VFM(INTEL_GRANITERAPIDS_X, &idle_cpu_gnr),
+ X86_MATCH_VFM(INTEL_GRANITERAPIDS_D, &idle_cpu_gnrd),
X86_MATCH_VFM(INTEL_XEON_PHI_KNL, &idle_cpu_knl),
X86_MATCH_VFM(INTEL_XEON_PHI_KNM, &idle_cpu_knl),
X86_MATCH_VFM(INTEL_ATOM_GOLDMONT, &idle_cpu_bxt),
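For reference, the .flags initializers in the new gnrd_cstates[] table pack the MWAIT hint into the upper byte of the flags word. A small userspace sketch of that encoding, assuming intel_idle's MWAIT2flg()/flg2MWAIT() pair of ((eax & 0xFF) << 24) and its inverse:

#include <stdio.h>

#define MWAIT2flg(eax)   (((eax) & 0xFF) << 24)
#define flg2MWAIT(flags) (((flags) >> 24) & 0xFF)

int main(void)
{
	/* C6P above uses MWAIT hint 0x21. */
	unsigned int flags = MWAIT2flg(0x21);

	printf("flags = 0x%08x, hint = 0x%02x\n", flags, flg2MWAIT(flags));
	return 0;
}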
diff --git a/include/linux/energy_model.h b/include/linux/energy_model.h
index 1ff52020cf75..752e0b297582 100644
--- a/include/linux/energy_model.h
+++ b/include/linux/energy_model.h
@@ -55,6 +55,8 @@ struct em_perf_table {
* struct em_perf_domain - Performance domain
* @em_table: Pointer to the runtime modifiable em_perf_table
* @nr_perf_states: Number of performance states
+ * @min_perf_state: Minimum allowed Performance State index
+ * @max_perf_state: Maximum allowed Performance State index
* @flags: See "em_perf_domain flags"
* @cpus: Cpumask covering the CPUs of the domain. It's here
* for performance reasons to avoid potential cache
@@ -70,6 +72,8 @@ struct em_perf_table {
struct em_perf_domain {
struct em_perf_table __rcu *em_table;
int nr_perf_states;
+ int min_perf_state;
+ int max_perf_state;
unsigned long flags;
unsigned long cpus[];
};
@@ -173,13 +177,14 @@ void em_table_free(struct em_perf_table __rcu *table);
int em_dev_compute_costs(struct device *dev, struct em_perf_state *table,
int nr_states);
int em_dev_update_chip_binning(struct device *dev);
+int em_update_performance_limits(struct em_perf_domain *pd,
+ unsigned long freq_min_khz, unsigned long freq_max_khz);
/**
* em_pd_get_efficient_state() - Get an efficient performance state from the EM
* @table: List of performance states, in ascending order
- * @nr_perf_states: Number of performance states
+ * @pd: performance domain for which this must be done
* @max_util: Max utilization to map with the EM
- * @pd_flags: Performance Domain flags
*
* It is called from the scheduler code quite frequently and as a consequence
* doesn't implement any check.
@@ -188,13 +193,16 @@ int em_dev_update_chip_binning(struct device *dev);
* requirement.
*/
static inline int
-em_pd_get_efficient_state(struct em_perf_state *table, int nr_perf_states,
- unsigned long max_util, unsigned long pd_flags)
+em_pd_get_efficient_state(struct em_perf_state *table,
+ struct em_perf_domain *pd, unsigned long max_util)
{
+ unsigned long pd_flags = pd->flags;
+ int min_ps = pd->min_perf_state;
+ int max_ps = pd->max_perf_state;
struct em_perf_state *ps;
int i;
- for (i = 0; i < nr_perf_states; i++) {
+ for (i = min_ps; i <= max_ps; i++) {
ps = &table[i];
if (ps->performance >= max_util) {
if (pd_flags & EM_PERF_DOMAIN_SKIP_INEFFICIENCIES &&
@@ -204,7 +212,7 @@ em_pd_get_efficient_state(struct em_perf_state *table, int nr_perf_states,
}
}
- return nr_perf_states - 1;
+ return max_ps;
}
/**
@@ -253,8 +261,7 @@ static inline unsigned long em_cpu_energy(struct em_perf_domain *pd,
* requested performance.
*/
em_table = rcu_dereference(pd->em_table);
- i = em_pd_get_efficient_state(em_table->state, pd->nr_perf_states,
- max_util, pd->flags);
+ i = em_pd_get_efficient_state(em_table->state, pd, max_util);
ps = &em_table->state[i];
/*
@@ -391,6 +398,12 @@ static inline int em_dev_update_chip_binning(struct device *dev)
{
return -EINVAL;
}
+static inline
+int em_update_performance_limits(struct em_perf_domain *pd,
+ unsigned long freq_min_khz, unsigned long freq_max_khz)
+{
+ return -EINVAL;
+}
#endif
#endif
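Outside the header, the reworked lookup is easy to model. The sketch below is a simplified standalone version: types are reduced to what the loop needs and the EM_PERF_DOMAIN_SKIP_INEFFICIENCIES handling is omitted; it is not the kernel header itself.

#include <stdio.h>

struct em_perf_state { unsigned long performance; };

struct em_perf_domain {
	int nr_perf_states;
	int min_perf_state;
	int max_perf_state;
};

/* The search now starts at min_perf_state and, when nothing satisfies
 * the request, clamps to max_perf_state instead of nr_perf_states - 1. */
static int em_pd_get_efficient_state(const struct em_perf_state *table,
				     const struct em_perf_domain *pd,
				     unsigned long max_util)
{
	int i;

	for (i = pd->min_perf_state; i <= pd->max_perf_state; i++) {
		if (table[i].performance >= max_util)
			return i;
	}
	return pd->max_perf_state;
}

int main(void)
{
	struct em_perf_state table[] = { {100}, {200}, {300}, {400} };
	struct em_perf_domain pd = { 4, 1, 2 };

	/* A utilization of 350 exceeds state 2 (300), so the result is
	 * clamped to max_perf_state rather than the table's last entry. */
	printf("idx = %d\n", em_pd_get_efficient_state(table, &pd, 350));
	return 0;
}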
diff --git a/kernel/power/energy_model.c b/kernel/power/energy_model.c
index 927cc55ba0b3..d07faf42eace 100644
--- a/kernel/power/energy_model.c
+++ b/kernel/power/energy_model.c
@@ -628,6 +628,8 @@ int em_dev_register_perf_domain(struct device *dev, unsigned int nr_states,
goto unlock;
dev->em_pd->flags |= flags;
+ dev->em_pd->min_perf_state = 0;
+ dev->em_pd->max_perf_state = nr_states - 1;
em_cpufreq_update_efficiencies(dev, dev->em_pd->em_table->state);
@@ -856,3 +858,53 @@ int em_dev_update_chip_binning(struct device *dev)
return em_recalc_and_update(dev, pd, em_table);
}
EXPORT_SYMBOL_GPL(em_dev_update_chip_binning);
+
+
+/**
+ * em_update_performance_limits() - Update Energy Model with performance
+ * limits information.
+ * @pd : Performance Domain with EM that has to be updated.
+ * @freq_min_khz : New minimum allowed frequency for this device.
+ * @freq_max_khz : New maximum allowed frequency for this device.
+ *
+ * This function allows the EM to be updated with information about the
+ * available performance levels. It takes the minimum and maximum frequency
+ * in kHz and translates them internally to performance state indices.
+ * Returns 0 on success or -EINVAL on failure.
+ */
+int em_update_performance_limits(struct em_perf_domain *pd,
+ unsigned long freq_min_khz, unsigned long freq_max_khz)
+{
+ struct em_perf_state *table;
+ int min_ps = -1;
+ int max_ps = -1;
+ int i;
+
+ if (!pd)
+ return -EINVAL;
+
+ rcu_read_lock();
+ table = em_perf_state_from_pd(pd);
+
+ for (i = 0; i < pd->nr_perf_states; i++) {
+ if (freq_min_khz == table[i].frequency)
+ min_ps = i;
+ if (freq_max_khz == table[i].frequency)
+ max_ps = i;
+ }
+ rcu_read_unlock();
+
+ /* Only update when both are found and sane */
+ if (min_ps < 0 || max_ps < 0 || max_ps < min_ps)
+ return -EINVAL;
+
+
+ /* Guard simultaneous updates and make them atomic */
+ mutex_lock(&em_pd_mutex);
+ pd->min_perf_state = min_ps;
+ pd->max_perf_state = max_ps;
+ mutex_unlock(&em_pd_mutex);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(em_update_performance_limits);
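A plausible caller, as a hedged sketch: only em_cpu_get() and em_update_performance_limits() are existing APIs here; the wrapper itself is an illustrative assumption about how a cpufreq driver could forward policy limits into the EM. Note that both frequencies must exactly match entries in the EM table, otherwise -EINVAL is returned and the stored limits stay unchanged.

#include <linux/cpufreq.h>
#include <linux/energy_model.h>

/* Hypothetical helper: propagate new cpufreq policy limits (in kHz,
 * like the EM table frequencies) into the performance domain. */
static int example_apply_policy_limits(struct cpufreq_policy *policy)
{
	struct em_perf_domain *pd = em_cpu_get(policy->cpu);

	if (!pd)
		return -ENODEV;

	return em_update_performance_limits(pd, policy->min, policy->max);
}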