summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--include/linux/sched/mm.h5
-rw-r--r--kernel/exit.c16
-rw-r--r--kernel/kthread.c21
-rw-r--r--kernel/sched/core.c6
-rw-r--r--kernel/sched/cpupri.c48
-rw-r--r--kernel/sched/cpupri.h8
-rw-r--r--kernel/sched/deadline.c51
-rw-r--r--kernel/sched/fair.c54
-rw-r--r--kernel/sched/idle.c5
-rw-r--r--kernel/sched/membarrier.c147
-rw-r--r--kernel/sched/rt.c20
-rw-r--r--kernel/sched/sched.h68
-rw-r--r--kernel/sched/stop_task.c3
-rw-r--r--kernel/sched/topology.c1
14 files changed, 364 insertions, 89 deletions
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h
index d5ece7a9a403..a91fb3ad9ec7 100644
--- a/include/linux/sched/mm.h
+++ b/include/linux/sched/mm.h
@@ -347,6 +347,8 @@ static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
extern void membarrier_exec_mmap(struct mm_struct *mm);
+extern void membarrier_update_current_mm(struct mm_struct *next_mm);
+
#else
#ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS
static inline void membarrier_arch_switch_mm(struct mm_struct *prev,
@@ -361,6 +363,9 @@ static inline void membarrier_exec_mmap(struct mm_struct *mm)
static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm)
{
}
+static inline void membarrier_update_current_mm(struct mm_struct *next_mm)
+{
+}
#endif
#endif /* _LINUX_SCHED_MM_H */
diff --git a/kernel/exit.c b/kernel/exit.c
index 87a2d515de0d..a3dd6b36f99a 100644
--- a/kernel/exit.c
+++ b/kernel/exit.c
@@ -475,10 +475,24 @@ static void exit_mm(void)
BUG_ON(mm != current->active_mm);
/* more a memory barrier than a real lock */
task_lock(current);
+ /*
+ * When a thread stops operating on an address space, the loop
+ * in membarrier_private_expedited() may not observe that
+ * tsk->mm, and the loop in membarrier_global_expedited() may
+ * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED
+ * rq->membarrier_state, so those would not issue an IPI.
+ * Membarrier requires a memory barrier after accessing
+ * user-space memory, before clearing tsk->mm or the
+ * rq->membarrier_state.
+ */
+ smp_mb__after_spinlock();
+ local_irq_disable();
current->mm = NULL;
- mmap_read_unlock(mm);
+ membarrier_update_current_mm(NULL);
enter_lazy_tlb(mm, current);
+ local_irq_enable();
task_unlock(current);
+ mmap_read_unlock(mm);
mm_update_next_owner(mm);
mmput(mm);
if (test_thread_flag(TIF_MEMDIE))
diff --git a/kernel/kthread.c b/kernel/kthread.c
index e29773c82b70..481428fe5f22 100644
--- a/kernel/kthread.c
+++ b/kernel/kthread.c
@@ -1248,6 +1248,7 @@ void kthread_use_mm(struct mm_struct *mm)
tsk->active_mm = mm;
}
tsk->mm = mm;
+ membarrier_update_current_mm(mm);
switch_mm_irqs_off(active_mm, mm, tsk);
local_irq_enable();
task_unlock(tsk);
@@ -1255,8 +1256,19 @@ void kthread_use_mm(struct mm_struct *mm)
finish_arch_post_lock_switch();
#endif
+ /*
+ * When a kthread starts operating on an address space, the loop
+ * in membarrier_{private,global}_expedited() may not observe
+ * that tsk->mm, and not issue an IPI. Membarrier requires a
+ * memory barrier after storing to tsk->mm, before accessing
+ * user-space memory. A full memory barrier for membarrier
+ * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by
+ * mmdrop(), or explicitly with smp_mb().
+ */
if (active_mm != mm)
mmdrop(active_mm);
+ else
+ smp_mb();
to_kthread(tsk)->oldfs = force_uaccess_begin();
}
@@ -1276,9 +1288,18 @@ void kthread_unuse_mm(struct mm_struct *mm)
force_uaccess_end(to_kthread(tsk)->oldfs);
task_lock(tsk);
+ /*
+ * When a kthread stops operating on an address space, the loop
+ * in membarrier_{private,global}_expedited() may not observe
+ * that tsk->mm, and not issue an IPI. Membarrier requires a
+ * memory barrier after accessing user-space memory, before
+ * clearing tsk->mm.
+ */
+ smp_mb__after_spinlock();
sync_mm_rss(mm);
local_irq_disable();
tsk->mm = NULL;
+ membarrier_update_current_mm(NULL);
/* active_mm is still 'mm' */
enter_lazy_tlb(mm, tsk);
local_irq_enable();
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c6409f34fa2d..622f343413a6 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6596,12 +6596,8 @@ static void do_sched_yield(void)
schedstat_inc(rq->yld_count);
current->sched_class->yield_task(rq);
- /*
- * Since we are going to call schedule() anyway, there's
- * no need to preempt or enable interrupts:
- */
preempt_disable();
- rq_unlock(rq, &rf);
+ rq_unlock_irq(rq, &rf);
sched_preempt_enable_no_resched();
schedule();
diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c
index 11c4df2010de..ec9be789c7e2 100644
--- a/kernel/sched/cpupri.c
+++ b/kernel/sched/cpupri.c
@@ -11,7 +11,7 @@
* This code tracks the priority of each CPU so that global migration
* decisions are easy to calculate. Each CPU can be in a state as follows:
*
- * (INVALID), IDLE, NORMAL, RT1, ... RT99
+ * (INVALID), NORMAL, RT1, ... RT99, HIGHER
*
* going from the lowest priority to the highest. CPUs in the INVALID state
* are not eligible for routing. The system maintains this state with
@@ -19,24 +19,48 @@
* in that class). Therefore a typical application without affinity
* restrictions can find a suitable CPU with O(1) complexity (e.g. two bit
* searches). For tasks with affinity restrictions, the algorithm has a
- * worst case complexity of O(min(102, nr_domcpus)), though the scenario that
+ * worst case complexity of O(min(101, nr_domcpus)), though the scenario that
* yields the worst case search is fairly contrived.
*/
#include "sched.h"
-/* Convert between a 140 based task->prio, and our 102 based cpupri */
+/*
+ * p->rt_priority p->prio newpri cpupri
+ *
+ * -1 -1 (CPUPRI_INVALID)
+ *
+ * 99 0 (CPUPRI_NORMAL)
+ *
+ * 1 98 98 1
+ * ...
+ * 49 50 50 49
+ * 50 49 49 50
+ * ...
+ * 99 0 0 99
+ *
+ * 100 100 (CPUPRI_HIGHER)
+ */
static int convert_prio(int prio)
{
int cpupri;
- if (prio == CPUPRI_INVALID)
- cpupri = CPUPRI_INVALID;
- else if (prio == MAX_PRIO)
- cpupri = CPUPRI_IDLE;
- else if (prio >= MAX_RT_PRIO)
- cpupri = CPUPRI_NORMAL;
- else
- cpupri = MAX_RT_PRIO - prio + 1;
+ switch (prio) {
+ case CPUPRI_INVALID:
+ cpupri = CPUPRI_INVALID; /* -1 */
+ break;
+
+ case 0 ... 98:
+ cpupri = MAX_RT_PRIO-1 - prio; /* 1 ... 99 */
+ break;
+
+ case MAX_RT_PRIO-1:
+ cpupri = CPUPRI_NORMAL; /* 0 */
+ break;
+
+ case MAX_RT_PRIO:
+ cpupri = CPUPRI_HIGHER; /* 100 */
+ break;
+ }
return cpupri;
}
@@ -177,7 +201,7 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p,
* cpupri_set - update the CPU priority setting
* @cp: The cpupri context
* @cpu: The target CPU
- * @newpri: The priority (INVALID-RT99) to assign to this CPU
+ * @newpri: The priority (INVALID,NORMAL,RT1-RT99,HIGHER) to assign to this CPU
*
* Note: Assumes cpu_rq(cpu)->lock is locked
*
diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h
index efbb492bb94c..d6cba0020064 100644
--- a/kernel/sched/cpupri.h
+++ b/kernel/sched/cpupri.h
@@ -1,11 +1,11 @@
/* SPDX-License-Identifier: GPL-2.0 */
-#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2)
+#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO+1)
#define CPUPRI_INVALID -1
-#define CPUPRI_IDLE 0
-#define CPUPRI_NORMAL 1
-/* values 2-101 are RT priorities 0-99 */
+#define CPUPRI_NORMAL 0
+/* values 1-99 are for RT1-RT99 priorities */
+#define CPUPRI_HIGHER 100
struct cpupri_vec {
atomic_t count;
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index eed2e449b313..cc1feb79d786 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -97,6 +97,17 @@ static inline unsigned long dl_bw_capacity(int i)
return __dl_bw_capacity(i);
}
}
+
+static inline bool dl_bw_visited(int cpu, u64 gen)
+{
+ struct root_domain *rd = cpu_rq(cpu)->rd;
+
+ if (rd->visit_gen == gen)
+ return true;
+
+ rd->visit_gen = gen;
+ return false;
+}
#else
static inline struct dl_bw *dl_bw_of(int i)
{
@@ -112,6 +123,11 @@ static inline unsigned long dl_bw_capacity(int i)
{
return SCHED_CAPACITY_SCALE;
}
+
+static inline bool dl_bw_visited(int cpu, u64 gen)
+{
+ return false;
+}
#endif
static inline
@@ -1378,6 +1394,8 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
if (dl_rq->earliest_dl.curr == 0 ||
dl_time_before(deadline, dl_rq->earliest_dl.curr)) {
+ if (dl_rq->earliest_dl.curr == 0)
+ cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_HIGHER);
dl_rq->earliest_dl.curr = deadline;
cpudl_set(&rq->rd->cpudl, rq->cpu, deadline);
}
@@ -1395,6 +1413,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline)
dl_rq->earliest_dl.curr = 0;
dl_rq->earliest_dl.next = 0;
cpudl_clear(&rq->rd->cpudl, rq->cpu);
+ cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr);
} else {
struct rb_node *leftmost = dl_rq->root.rb_leftmost;
struct sched_dl_entity *entry;
@@ -2518,8 +2537,8 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p,
}
}
-const struct sched_class dl_sched_class
- __section("__dl_sched_class") = {
+DEFINE_SCHED_CLASS(dl) = {
+
.enqueue_task = enqueue_task_dl,
.dequeue_task = dequeue_task_dl,
.yield_task = yield_task_dl,
@@ -2551,33 +2570,39 @@ const struct sched_class dl_sched_class
.update_curr = update_curr_dl,
};
+/* Used for dl_bw check and update, used under sched_rt_handler()::mutex */
+static u64 dl_generation;
+
int sched_dl_global_validate(void)
{
u64 runtime = global_rt_runtime();
u64 period = global_rt_period();
u64 new_bw = to_ratio(period, runtime);
+ u64 gen = ++dl_generation;
struct dl_bw *dl_b;
- int cpu, ret = 0;
+ int cpu, cpus, ret = 0;
unsigned long flags;
/*
* Here we want to check the bandwidth not being set to some
* value smaller than the currently allocated bandwidth in
* any of the root_domains.
- *
- * FIXME: Cycling on all the CPUs is overdoing, but simpler than
- * cycling on root_domains... Discussion on different/better
- * solutions is welcome!
*/
for_each_possible_cpu(cpu) {
rcu_read_lock_sched();
+
+ if (dl_bw_visited(cpu, gen))
+ goto next;
+
dl_b = dl_bw_of(cpu);
+ cpus = dl_bw_cpus(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
- if (new_bw < dl_b->total_bw)
+ if (new_bw * cpus < dl_b->total_bw)
ret = -EBUSY;
raw_spin_unlock_irqrestore(&dl_b->lock, flags);
+next:
rcu_read_unlock_sched();
if (ret)
@@ -2603,6 +2628,7 @@ static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq)
void sched_dl_do_global(void)
{
u64 new_bw = -1;
+ u64 gen = ++dl_generation;
struct dl_bw *dl_b;
int cpu;
unsigned long flags;
@@ -2613,11 +2639,14 @@ void sched_dl_do_global(void)
if (global_rt_runtime() != RUNTIME_INF)
new_bw = to_ratio(global_rt_period(), global_rt_runtime());
- /*
- * FIXME: As above...
- */
for_each_possible_cpu(cpu) {
rcu_read_lock_sched();
+
+ if (dl_bw_visited(cpu, gen)) {
+ rcu_read_unlock_sched();
+ continue;
+ }
+
dl_b = dl_bw_of(cpu);
raw_spin_lock_irqsave(&dl_b->lock, flags);
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 290f9e38378c..2755a7e0f1ce 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -906,6 +906,15 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se)
if (!schedstat_enabled())
return;
+ /*
+ * When the sched_schedstat changes from 0 to 1, some sched se
+ * maybe already in the runqueue, the se->statistics.wait_start
+ * will be 0.So it will let the delta wrong. We need to avoid this
+ * scenario.
+ */
+ if (unlikely(!schedstat_val(se->statistics.wait_start)))
+ return;
+
delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start);
if (entity_is_task(se)) {
@@ -4779,25 +4788,37 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq)
struct cfs_rq *qcfs_rq = cfs_rq_of(se);
/* throttled entity or throttle-on-deactivate */
if (!se->on_rq)
- break;
+ goto done;
- if (dequeue) {
- dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
- } else {
- update_load_avg(qcfs_rq, se, 0);
- se_update_runnable(se);
- }
+ dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP);
qcfs_rq->h_nr_running -= task_delta;
qcfs_rq->idle_h_nr_running -= idle_task_delta;
- if (qcfs_rq->load.weight)
- dequeue = 0;
+ if (qcfs_rq->load.weight) {
+ /* Avoid re-evaluating load for this entity: */
+ se = parent_entity(se);
+ break;
+ }
}
- if (!se)
- sub_nr_running(rq, task_delta);
+ for_each_sched_entity(se) {
+ struct cfs_rq *qcfs_rq = cfs_rq_of(se);
+ /* throttled entity or throttle-on-deactivate */
+ if (!se->on_rq)
+ goto done;
+
+ update_load_avg(qcfs_rq, se, 0);
+ se_update_runnable(se);
+
+ qcfs_rq->h_nr_running -= task_delta;
+ qcfs_rq->idle_h_nr_running -= idle_task_delta;
+ }
+
+ /* At this point se is NULL and we are at root level*/
+ sub_nr_running(rq, task_delta);
+done:
/*
* Note: distribution will already see us throttled via the
* throttled-list. rq->lock protects completion.
@@ -5804,6 +5825,9 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync)
if (sync && cpu_rq(this_cpu)->nr_running == 1)
return this_cpu;
+ if (available_idle_cpu(prev_cpu))
+ return prev_cpu;
+
return nr_cpumask_bits;
}
@@ -10047,6 +10071,10 @@ static inline int find_new_ilb(void)
for_each_cpu_and(ilb, nohz.idle_cpus_mask,
housekeeping_cpumask(HK_FLAG_MISC)) {
+
+ if (ilb == smp_processor_id())
+ continue;
+
if (idle_cpu(ilb))
return ilb;
}
@@ -11158,8 +11186,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task
/*
* All the scheduling class methods:
*/
-const struct sched_class fair_sched_class
- __section("__fair_sched_class") = {
+DEFINE_SCHED_CLASS(fair) = {
+
.enqueue_task = enqueue_task_fair,
.dequeue_task = dequeue_task_fair,
.yield_task = yield_task_fair,
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 24d0ee26377d..9da69c4e0ee9 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -338,6 +338,7 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns)
WARN_ON_ONCE(!(current->flags & PF_KTHREAD));
WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY));
WARN_ON_ONCE(!duration_ns);
+ WARN_ON_ONCE(current->mm);
rcu_sleep_check();
preempt_disable();
@@ -457,8 +458,8 @@ static void update_curr_idle(struct rq *rq)
/*
* Simple, special scheduling class for the per-CPU idle tasks:
*/
-const struct sched_class idle_sched_class
- __section("__idle_sched_class") = {
+DEFINE_SCHED_CLASS(idle) = {
+
/* no enqueue/yield_task for idle tasks */
/* dequeue is not valid, we print a debug message there: */
diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c
index e23e74d52db5..5a40b3828ff2 100644
--- a/kernel/sched/membarrier.c
+++ b/kernel/sched/membarrier.c
@@ -7,6 +7,134 @@
#include "sched.h"
/*
+ * For documentation purposes, here are some membarrier ordering
+ * scenarios to keep in mind:
+ *
+ * A) Userspace thread execution after IPI vs membarrier's memory
+ * barrier before sending the IPI
+ *
+ * Userspace variables:
+ *
+ * int x = 0, y = 0;
+ *
+ * The memory barrier at the start of membarrier() on CPU0 is necessary in
+ * order to enforce the guarantee that any writes occurring on CPU0 before
+ * the membarrier() is executed will be visible to any code executing on
+ * CPU1 after the IPI-induced memory barrier:
+ *
+ * CPU0 CPU1
+ *
+ * x = 1
+ * membarrier():
+ * a: smp_mb()
+ * b: send IPI IPI-induced mb
+ * c: smp_mb()
+ * r2 = y
+ * y = 1
+ * barrier()
+ * r1 = x
+ *
+ * BUG_ON(r1 == 0 && r2 == 0)
+ *
+ * The write to y and load from x by CPU1 are unordered by the hardware,
+ * so it's possible to have "r1 = x" reordered before "y = 1" at any
+ * point after (b). If the memory barrier at (a) is omitted, then "x = 1"
+ * can be reordered after (a) (although not after (c)), so we get r1 == 0
+ * and r2 == 0. This violates the guarantee that membarrier() is
+ * supposed by provide.
+ *
+ * The timing of the memory barrier at (a) has to ensure that it executes
+ * before the IPI-induced memory barrier on CPU1.
+ *
+ * B) Userspace thread execution before IPI vs membarrier's memory
+ * barrier after completing the IPI
+ *
+ * Userspace variables:
+ *
+ * int x = 0, y = 0;
+ *
+ * The memory barrier at the end of membarrier() on CPU0 is necessary in
+ * order to enforce the guarantee that any writes occurring on CPU1 before
+ * the membarrier() is executed will be visible to any code executing on
+ * CPU0 after the membarrier():
+ *
+ * CPU0 CPU1
+ *
+ * x = 1
+ * barrier()
+ * y = 1
+ * r2 = y
+ * membarrier():
+ * a: smp_mb()
+ * b: send IPI IPI-induced mb
+ * c: smp_mb()
+ * r1 = x
+ * BUG_ON(r1 == 0 && r2 == 1)
+ *
+ * The writes to x and y are unordered by the hardware, so it's possible to
+ * have "r2 = 1" even though the write to x doesn't execute until (b). If
+ * the memory barrier at (c) is omitted then "r1 = x" can be reordered
+ * before (b) (although not before (a)), so we get "r1 = 0". This violates
+ * the guarantee that membarrier() is supposed to provide.
+ *
+ * The timing of the memory barrier at (c) has to ensure that it executes
+ * after the IPI-induced memory barrier on CPU1.
+ *
+ * C) Scheduling userspace thread -> kthread -> userspace thread vs membarrier
+ *
+ * CPU0 CPU1
+ *
+ * membarrier():
+ * a: smp_mb()
+ * d: switch to kthread (includes mb)
+ * b: read rq->curr->mm == NULL
+ * e: switch to user (includes mb)
+ * c: smp_mb()
+ *
+ * Using the scenario from (A), we can show that (a) needs to be paired
+ * with (e). Using the scenario from (B), we can show that (c) needs to
+ * be paired with (d).
+ *
+ * D) exit_mm vs membarrier
+ *
+ * Two thread groups are created, A and B. Thread group B is created by
+ * issuing clone from group A with flag CLONE_VM set, but not CLONE_THREAD.
+ * Let's assume we have a single thread within each thread group (Thread A
+ * and Thread B). Thread A runs on CPU0, Thread B runs on CPU1.
+ *
+ * CPU0 CPU1
+ *
+ * membarrier():
+ * a: smp_mb()
+ * exit_mm():
+ * d: smp_mb()
+ * e: current->mm = NULL
+ * b: read rq->curr->mm == NULL
+ * c: smp_mb()
+ *
+ * Using scenario (B), we can show that (c) needs to be paired with (d).
+ *
+ * E) kthread_{use,unuse}_mm vs membarrier
+ *
+ * CPU0 CPU1
+ *
+ * membarrier():
+ * a: smp_mb()
+ * kthread_unuse_mm()
+ * d: smp_mb()
+ * e: current->mm = NULL
+ * b: read rq->curr->mm == NULL
+ * kthread_use_mm()
+ * f: current->mm = mm
+ * g: smp_mb()
+ * c: smp_mb()
+ *
+ * Using the scenario from (A), we can show that (a) needs to be paired
+ * with (g). Using the scenario from (B), we can show that (c) needs to
+ * be paired with (d).
+ */
+
+/*
* Bitmask made from a "or" of all commands within enum membarrier_cmd,
* except MEMBARRIER_CMD_QUERY.
*/
@@ -76,6 +204,18 @@ void membarrier_exec_mmap(struct mm_struct *mm)
this_cpu_write(runqueues.membarrier_state, 0);
}
+void membarrier_update_current_mm(struct mm_struct *next_mm)
+{
+ struct rq *rq = this_rq();
+ int membarrier_state = 0;
+
+ if (next_mm)
+ membarrier_state = atomic_read(&next_mm->membarrier_state);
+ if (READ_ONCE(rq->membarrier_state) == membarrier_state)
+ return;
+ WRITE_ONCE(rq->membarrier_state, membarrier_state);
+}
+
static int membarrier_global_expedited(void)
{
int cpu;
@@ -114,12 +254,11 @@ static int membarrier_global_expedited(void)
continue;
/*
- * Skip the CPU if it runs a kernel thread. The scheduler
- * leaves the prior task mm in place as an optimization when
- * scheduling a kthread.
+ * Skip the CPU if it runs a kernel thread which is not using
+ * a task mm.
*/
p = rcu_dereference(cpu_rq(cpu)->curr);
- if (p->flags & PF_KTHREAD)
+ if (!p->mm)
continue;
__cpumask_set_cpu(cpu, tmpmask);
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index c592e47cafed..c961a3fc0166 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -89,8 +89,8 @@ void init_rt_rq(struct rt_rq *rt_rq)
__set_bit(MAX_RT_PRIO, array->bitmap);
#if defined CONFIG_SMP
- rt_rq->highest_prio.curr = MAX_RT_PRIO;
- rt_rq->highest_prio.next = MAX_RT_PRIO;
+ rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
+ rt_rq->highest_prio.next = MAX_RT_PRIO-1;
rt_rq->rt_nr_migratory = 0;
rt_rq->overloaded = 0;
plist_head_init(&rt_rq->pushable_tasks);
@@ -161,7 +161,7 @@ void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq,
{
struct rq *rq = cpu_rq(cpu);
- rt_rq->highest_prio.curr = MAX_RT_PRIO;
+ rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
rt_rq->rt_nr_boosted = 0;
rt_rq->rq = rq;
rt_rq->tg = tg;
@@ -393,8 +393,9 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p)
p = plist_first_entry(&rq->rt.pushable_tasks,
struct task_struct, pushable_tasks);
rq->rt.highest_prio.next = p->prio;
- } else
- rq->rt.highest_prio.next = MAX_RT_PRIO;
+ } else {
+ rq->rt.highest_prio.next = MAX_RT_PRIO-1;
+ }
}
#else
@@ -1147,8 +1148,9 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio)
sched_find_first_bit(array->bitmap);
}
- } else
- rt_rq->highest_prio.curr = MAX_RT_PRIO;
+ } else {
+ rt_rq->highest_prio.curr = MAX_RT_PRIO-1;
+ }
dec_rt_prio_smp(rt_rq, prio, prev_prio);
}
@@ -2467,8 +2469,8 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
return 0;
}
-const struct sched_class rt_sched_class
- __section("__rt_sched_class") = {
+DEFINE_SCHED_CLASS(rt) = {
+
.enqueue_task = enqueue_task_rt,
.dequeue_task = dequeue_task_rt,
.yield_task = yield_task_rt,
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index 56992aaca48e..e897d779839f 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -67,7 +67,6 @@
#include <linux/tsacct_kern.h>
#include <asm/tlb.h>
-#include <asm-generic/vmlinux.lds.h>
#ifdef CONFIG_PARAVIRT
# include <asm/paravirt.h>
@@ -257,30 +256,6 @@ struct rt_bandwidth {
void __dl_clear_params(struct task_struct *p);
-/*
- * To keep the bandwidth of -deadline tasks and groups under control
- * we need some place where:
- * - store the maximum -deadline bandwidth of the system (the group);
- * - cache the fraction of that bandwidth that is currently allocated.
- *
- * This is all done in the data structure below. It is similar to the
- * one used for RT-throttling (rt_bandwidth), with the main difference
- * that, since here we are only interested in admission control, we
- * do not decrease any runtime while the group "executes", neither we
- * need a timer to replenish it.
- *
- * With respect to SMP, the bandwidth is given on a per-CPU basis,
- * meaning that:
- * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU;
- * - dl_total_bw array contains, in the i-eth element, the currently
- * allocated bandwidth on the i-eth CPU.
- * Moreover, groups consume bandwidth on each CPU, while tasks only
- * consume bandwidth on the CPU they're running on.
- * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw
- * that will be shown the next time the proc or cgroup controls will
- * be red. It on its turn can be changed by writing on its own
- * control.
- */
struct dl_bandwidth {
raw_spinlock_t dl_runtime_lock;
u64 dl_runtime;
@@ -292,6 +267,24 @@ static inline int dl_bandwidth_enabled(void)
return sysctl_sched_rt_runtime >= 0;
}
+/*
+ * To keep the bandwidth of -deadline tasks under control
+ * we need some place where:
+ * - store the maximum -deadline bandwidth of each cpu;
+ * - cache the fraction of bandwidth that is currently allocated in
+ * each root domain;
+ *
+ * This is all done in the data structure below. It is similar to the
+ * one used for RT-throttling (rt_bandwidth), with the main difference
+ * that, since here we are only interested in admission control, we
+ * do not decrease any runtime while the group "executes", neither we
+ * need a timer to replenish it.
+ *
+ * With respect to SMP, bandwidth is given on a per root domain basis,
+ * meaning that:
+ * - bw (< 100%) is the deadline bandwidth of each CPU;
+ * - total_bw is the currently allocated bandwidth in each root domain;
+ */
struct dl_bw {
raw_spinlock_t lock;
u64 bw;
@@ -801,6 +794,15 @@ struct root_domain {
struct dl_bw dl_bw;
struct cpudl cpudl;
+ /*
+ * Indicate whether a root_domain's dl_bw has been checked or
+ * updated. It's monotonously increasing value.
+ *
+ * Also, some corner cases, like 'wrap around' is dangerous, but given
+ * that u64 is 'big enough'. So that shouldn't be a concern.
+ */
+ u64 visit_gen;
+
#ifdef HAVE_RT_PUSH_IPI
/*
* For IPI pull requests, loop across the rto_mask.
@@ -1864,7 +1866,7 @@ struct sched_class {
#ifdef CONFIG_FAIR_GROUP_SCHED
void (*task_change_group)(struct task_struct *p, int type);
#endif
-} __aligned(STRUCT_ALIGNMENT); /* STRUCT_ALIGN(), vmlinux.lds.h */
+};
static inline void put_prev_task(struct rq *rq, struct task_struct *prev)
{
@@ -1878,6 +1880,20 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next)
next->sched_class->set_next_task(rq, next, false);
}
+
+/*
+ * Helper to define a sched_class instance; each one is placed in a separate
+ * section which is ordered by the linker script:
+ *
+ * include/asm-generic/vmlinux.lds.h
+ *
+ * Also enforce alignment on the instance, not the type, to guarantee layout.
+ */
+#define DEFINE_SCHED_CLASS(name) \
+const struct sched_class name##_sched_class \
+ __aligned(__alignof__(struct sched_class)) \
+ __section("__" #name "_sched_class")
+
/* Defined in include/asm-generic/vmlinux.lds.h */
extern struct sched_class __begin_sched_classes[];
extern struct sched_class __end_sched_classes[];
diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c
index ceb5b6b12561..91bb10cc070e 100644
--- a/kernel/sched/stop_task.c
+++ b/kernel/sched/stop_task.c
@@ -109,8 +109,7 @@ static void update_curr_stop(struct rq *rq)
/*
* Simple, special scheduling class for the per-CPU stop tasks:
*/
-const struct sched_class stop_sched_class
- __section("__stop_sched_class") = {
+DEFINE_SCHED_CLASS(stop) = {
.enqueue_task = enqueue_task_stop,
.dequeue_task = dequeue_task_stop,
diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c
index dd7770226086..90f3e5558fa2 100644
--- a/kernel/sched/topology.c
+++ b/kernel/sched/topology.c
@@ -516,6 +516,7 @@ static int init_rootdomain(struct root_domain *rd)
init_irq_work(&rd->rto_push_work, rto_push_irq_work_func);
#endif
+ rd->visit_gen = 0;
init_dl_bw(&rd->dl_bw);
if (cpudl_init(&rd->cpudl) != 0)
goto free_rto_mask;