diff options
-rw-r--r-- | include/linux/sched/mm.h | 5 | ||||
-rw-r--r-- | kernel/exit.c | 16 | ||||
-rw-r--r-- | kernel/kthread.c | 21 | ||||
-rw-r--r-- | kernel/sched/core.c | 6 | ||||
-rw-r--r-- | kernel/sched/cpupri.c | 48 | ||||
-rw-r--r-- | kernel/sched/cpupri.h | 8 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 51 | ||||
-rw-r--r-- | kernel/sched/fair.c | 54 | ||||
-rw-r--r-- | kernel/sched/idle.c | 5 | ||||
-rw-r--r-- | kernel/sched/membarrier.c | 147 | ||||
-rw-r--r-- | kernel/sched/rt.c | 20 | ||||
-rw-r--r-- | kernel/sched/sched.h | 68 | ||||
-rw-r--r-- | kernel/sched/stop_task.c | 3 | ||||
-rw-r--r-- | kernel/sched/topology.c | 1 |
14 files changed, 364 insertions, 89 deletions
diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index d5ece7a9a403..a91fb3ad9ec7 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -347,6 +347,8 @@ static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) extern void membarrier_exec_mmap(struct mm_struct *mm); +extern void membarrier_update_current_mm(struct mm_struct *next_mm); + #else #ifdef CONFIG_ARCH_HAS_MEMBARRIER_CALLBACKS static inline void membarrier_arch_switch_mm(struct mm_struct *prev, @@ -361,6 +363,9 @@ static inline void membarrier_exec_mmap(struct mm_struct *mm) static inline void membarrier_mm_sync_core_before_usermode(struct mm_struct *mm) { } +static inline void membarrier_update_current_mm(struct mm_struct *next_mm) +{ +} #endif #endif /* _LINUX_SCHED_MM_H */ diff --git a/kernel/exit.c b/kernel/exit.c index 87a2d515de0d..a3dd6b36f99a 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -475,10 +475,24 @@ static void exit_mm(void) BUG_ON(mm != current->active_mm); /* more a memory barrier than a real lock */ task_lock(current); + /* + * When a thread stops operating on an address space, the loop + * in membarrier_private_expedited() may not observe that + * tsk->mm, and the loop in membarrier_global_expedited() may + * not observe a MEMBARRIER_STATE_GLOBAL_EXPEDITED + * rq->membarrier_state, so those would not issue an IPI. + * Membarrier requires a memory barrier after accessing + * user-space memory, before clearing tsk->mm or the + * rq->membarrier_state. + */ + smp_mb__after_spinlock(); + local_irq_disable(); current->mm = NULL; - mmap_read_unlock(mm); + membarrier_update_current_mm(NULL); enter_lazy_tlb(mm, current); + local_irq_enable(); task_unlock(current); + mmap_read_unlock(mm); mm_update_next_owner(mm); mmput(mm); if (test_thread_flag(TIF_MEMDIE)) diff --git a/kernel/kthread.c b/kernel/kthread.c index e29773c82b70..481428fe5f22 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -1248,6 +1248,7 @@ void kthread_use_mm(struct mm_struct *mm) tsk->active_mm = mm; } tsk->mm = mm; + membarrier_update_current_mm(mm); switch_mm_irqs_off(active_mm, mm, tsk); local_irq_enable(); task_unlock(tsk); @@ -1255,8 +1256,19 @@ void kthread_use_mm(struct mm_struct *mm) finish_arch_post_lock_switch(); #endif + /* + * When a kthread starts operating on an address space, the loop + * in membarrier_{private,global}_expedited() may not observe + * that tsk->mm, and not issue an IPI. Membarrier requires a + * memory barrier after storing to tsk->mm, before accessing + * user-space memory. A full memory barrier for membarrier + * {PRIVATE,GLOBAL}_EXPEDITED is implicitly provided by + * mmdrop(), or explicitly with smp_mb(). + */ if (active_mm != mm) mmdrop(active_mm); + else + smp_mb(); to_kthread(tsk)->oldfs = force_uaccess_begin(); } @@ -1276,9 +1288,18 @@ void kthread_unuse_mm(struct mm_struct *mm) force_uaccess_end(to_kthread(tsk)->oldfs); task_lock(tsk); + /* + * When a kthread stops operating on an address space, the loop + * in membarrier_{private,global}_expedited() may not observe + * that tsk->mm, and not issue an IPI. Membarrier requires a + * memory barrier after accessing user-space memory, before + * clearing tsk->mm. + */ + smp_mb__after_spinlock(); sync_mm_rss(mm); local_irq_disable(); tsk->mm = NULL; + membarrier_update_current_mm(NULL); /* active_mm is still 'mm' */ enter_lazy_tlb(mm, tsk); local_irq_enable(); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c6409f34fa2d..622f343413a6 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6596,12 +6596,8 @@ static void do_sched_yield(void) schedstat_inc(rq->yld_count); current->sched_class->yield_task(rq); - /* - * Since we are going to call schedule() anyway, there's - * no need to preempt or enable interrupts: - */ preempt_disable(); - rq_unlock(rq, &rf); + rq_unlock_irq(rq, &rf); sched_preempt_enable_no_resched(); schedule(); diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 11c4df2010de..ec9be789c7e2 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -11,7 +11,7 @@ * This code tracks the priority of each CPU so that global migration * decisions are easy to calculate. Each CPU can be in a state as follows: * - * (INVALID), IDLE, NORMAL, RT1, ... RT99 + * (INVALID), NORMAL, RT1, ... RT99, HIGHER * * going from the lowest priority to the highest. CPUs in the INVALID state * are not eligible for routing. The system maintains this state with @@ -19,24 +19,48 @@ * in that class). Therefore a typical application without affinity * restrictions can find a suitable CPU with O(1) complexity (e.g. two bit * searches). For tasks with affinity restrictions, the algorithm has a - * worst case complexity of O(min(102, nr_domcpus)), though the scenario that + * worst case complexity of O(min(101, nr_domcpus)), though the scenario that * yields the worst case search is fairly contrived. */ #include "sched.h" -/* Convert between a 140 based task->prio, and our 102 based cpupri */ +/* + * p->rt_priority p->prio newpri cpupri + * + * -1 -1 (CPUPRI_INVALID) + * + * 99 0 (CPUPRI_NORMAL) + * + * 1 98 98 1 + * ... + * 49 50 50 49 + * 50 49 49 50 + * ... + * 99 0 0 99 + * + * 100 100 (CPUPRI_HIGHER) + */ static int convert_prio(int prio) { int cpupri; - if (prio == CPUPRI_INVALID) - cpupri = CPUPRI_INVALID; - else if (prio == MAX_PRIO) - cpupri = CPUPRI_IDLE; - else if (prio >= MAX_RT_PRIO) - cpupri = CPUPRI_NORMAL; - else - cpupri = MAX_RT_PRIO - prio + 1; + switch (prio) { + case CPUPRI_INVALID: + cpupri = CPUPRI_INVALID; /* -1 */ + break; + + case 0 ... 98: + cpupri = MAX_RT_PRIO-1 - prio; /* 1 ... 99 */ + break; + + case MAX_RT_PRIO-1: + cpupri = CPUPRI_NORMAL; /* 0 */ + break; + + case MAX_RT_PRIO: + cpupri = CPUPRI_HIGHER; /* 100 */ + break; + } return cpupri; } @@ -177,7 +201,7 @@ int cpupri_find_fitness(struct cpupri *cp, struct task_struct *p, * cpupri_set - update the CPU priority setting * @cp: The cpupri context * @cpu: The target CPU - * @newpri: The priority (INVALID-RT99) to assign to this CPU + * @newpri: The priority (INVALID,NORMAL,RT1-RT99,HIGHER) to assign to this CPU * * Note: Assumes cpu_rq(cpu)->lock is locked * diff --git a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h index efbb492bb94c..d6cba0020064 100644 --- a/kernel/sched/cpupri.h +++ b/kernel/sched/cpupri.h @@ -1,11 +1,11 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO + 2) +#define CPUPRI_NR_PRIORITIES (MAX_RT_PRIO+1) #define CPUPRI_INVALID -1 -#define CPUPRI_IDLE 0 -#define CPUPRI_NORMAL 1 -/* values 2-101 are RT priorities 0-99 */ +#define CPUPRI_NORMAL 0 +/* values 1-99 are for RT1-RT99 priorities */ +#define CPUPRI_HIGHER 100 struct cpupri_vec { atomic_t count; diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index eed2e449b313..cc1feb79d786 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -97,6 +97,17 @@ static inline unsigned long dl_bw_capacity(int i) return __dl_bw_capacity(i); } } + +static inline bool dl_bw_visited(int cpu, u64 gen) +{ + struct root_domain *rd = cpu_rq(cpu)->rd; + + if (rd->visit_gen == gen) + return true; + + rd->visit_gen = gen; + return false; +} #else static inline struct dl_bw *dl_bw_of(int i) { @@ -112,6 +123,11 @@ static inline unsigned long dl_bw_capacity(int i) { return SCHED_CAPACITY_SCALE; } + +static inline bool dl_bw_visited(int cpu, u64 gen) +{ + return false; +} #endif static inline @@ -1378,6 +1394,8 @@ static void inc_dl_deadline(struct dl_rq *dl_rq, u64 deadline) if (dl_rq->earliest_dl.curr == 0 || dl_time_before(deadline, dl_rq->earliest_dl.curr)) { + if (dl_rq->earliest_dl.curr == 0) + cpupri_set(&rq->rd->cpupri, rq->cpu, CPUPRI_HIGHER); dl_rq->earliest_dl.curr = deadline; cpudl_set(&rq->rd->cpudl, rq->cpu, deadline); } @@ -1395,6 +1413,7 @@ static void dec_dl_deadline(struct dl_rq *dl_rq, u64 deadline) dl_rq->earliest_dl.curr = 0; dl_rq->earliest_dl.next = 0; cpudl_clear(&rq->rd->cpudl, rq->cpu); + cpupri_set(&rq->rd->cpupri, rq->cpu, rq->rt.highest_prio.curr); } else { struct rb_node *leftmost = dl_rq->root.rb_leftmost; struct sched_dl_entity *entry; @@ -2518,8 +2537,8 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, } } -const struct sched_class dl_sched_class - __section("__dl_sched_class") = { +DEFINE_SCHED_CLASS(dl) = { + .enqueue_task = enqueue_task_dl, .dequeue_task = dequeue_task_dl, .yield_task = yield_task_dl, @@ -2551,33 +2570,39 @@ const struct sched_class dl_sched_class .update_curr = update_curr_dl, }; +/* Used for dl_bw check and update, used under sched_rt_handler()::mutex */ +static u64 dl_generation; + int sched_dl_global_validate(void) { u64 runtime = global_rt_runtime(); u64 period = global_rt_period(); u64 new_bw = to_ratio(period, runtime); + u64 gen = ++dl_generation; struct dl_bw *dl_b; - int cpu, ret = 0; + int cpu, cpus, ret = 0; unsigned long flags; /* * Here we want to check the bandwidth not being set to some * value smaller than the currently allocated bandwidth in * any of the root_domains. - * - * FIXME: Cycling on all the CPUs is overdoing, but simpler than - * cycling on root_domains... Discussion on different/better - * solutions is welcome! */ for_each_possible_cpu(cpu) { rcu_read_lock_sched(); + + if (dl_bw_visited(cpu, gen)) + goto next; + dl_b = dl_bw_of(cpu); + cpus = dl_bw_cpus(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); - if (new_bw < dl_b->total_bw) + if (new_bw * cpus < dl_b->total_bw) ret = -EBUSY; raw_spin_unlock_irqrestore(&dl_b->lock, flags); +next: rcu_read_unlock_sched(); if (ret) @@ -2603,6 +2628,7 @@ static void init_dl_rq_bw_ratio(struct dl_rq *dl_rq) void sched_dl_do_global(void) { u64 new_bw = -1; + u64 gen = ++dl_generation; struct dl_bw *dl_b; int cpu; unsigned long flags; @@ -2613,11 +2639,14 @@ void sched_dl_do_global(void) if (global_rt_runtime() != RUNTIME_INF) new_bw = to_ratio(global_rt_period(), global_rt_runtime()); - /* - * FIXME: As above... - */ for_each_possible_cpu(cpu) { rcu_read_lock_sched(); + + if (dl_bw_visited(cpu, gen)) { + rcu_read_unlock_sched(); + continue; + } + dl_b = dl_bw_of(cpu); raw_spin_lock_irqsave(&dl_b->lock, flags); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 290f9e38378c..2755a7e0f1ce 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -906,6 +906,15 @@ update_stats_wait_end(struct cfs_rq *cfs_rq, struct sched_entity *se) if (!schedstat_enabled()) return; + /* + * When the sched_schedstat changes from 0 to 1, some sched se + * maybe already in the runqueue, the se->statistics.wait_start + * will be 0.So it will let the delta wrong. We need to avoid this + * scenario. + */ + if (unlikely(!schedstat_val(se->statistics.wait_start))) + return; + delta = rq_clock(rq_of(cfs_rq)) - schedstat_val(se->statistics.wait_start); if (entity_is_task(se)) { @@ -4779,25 +4788,37 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) struct cfs_rq *qcfs_rq = cfs_rq_of(se); /* throttled entity or throttle-on-deactivate */ if (!se->on_rq) - break; + goto done; - if (dequeue) { - dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); - } else { - update_load_avg(qcfs_rq, se, 0); - se_update_runnable(se); - } + dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); qcfs_rq->h_nr_running -= task_delta; qcfs_rq->idle_h_nr_running -= idle_task_delta; - if (qcfs_rq->load.weight) - dequeue = 0; + if (qcfs_rq->load.weight) { + /* Avoid re-evaluating load for this entity: */ + se = parent_entity(se); + break; + } } - if (!se) - sub_nr_running(rq, task_delta); + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + /* throttled entity or throttle-on-deactivate */ + if (!se->on_rq) + goto done; + + update_load_avg(qcfs_rq, se, 0); + se_update_runnable(se); + + qcfs_rq->h_nr_running -= task_delta; + qcfs_rq->idle_h_nr_running -= idle_task_delta; + } + + /* At this point se is NULL and we are at root level*/ + sub_nr_running(rq, task_delta); +done: /* * Note: distribution will already see us throttled via the * throttled-list. rq->lock protects completion. @@ -5804,6 +5825,9 @@ wake_affine_idle(int this_cpu, int prev_cpu, int sync) if (sync && cpu_rq(this_cpu)->nr_running == 1) return this_cpu; + if (available_idle_cpu(prev_cpu)) + return prev_cpu; + return nr_cpumask_bits; } @@ -10047,6 +10071,10 @@ static inline int find_new_ilb(void) for_each_cpu_and(ilb, nohz.idle_cpus_mask, housekeeping_cpumask(HK_FLAG_MISC)) { + + if (ilb == smp_processor_id()) + continue; + if (idle_cpu(ilb)) return ilb; } @@ -11158,8 +11186,8 @@ static unsigned int get_rr_interval_fair(struct rq *rq, struct task_struct *task /* * All the scheduling class methods: */ -const struct sched_class fair_sched_class - __section("__fair_sched_class") = { +DEFINE_SCHED_CLASS(fair) = { + .enqueue_task = enqueue_task_fair, .dequeue_task = dequeue_task_fair, .yield_task = yield_task_fair, diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 24d0ee26377d..9da69c4e0ee9 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -338,6 +338,7 @@ void play_idle_precise(u64 duration_ns, u64 latency_ns) WARN_ON_ONCE(!(current->flags & PF_KTHREAD)); WARN_ON_ONCE(!(current->flags & PF_NO_SETAFFINITY)); WARN_ON_ONCE(!duration_ns); + WARN_ON_ONCE(current->mm); rcu_sleep_check(); preempt_disable(); @@ -457,8 +458,8 @@ static void update_curr_idle(struct rq *rq) /* * Simple, special scheduling class for the per-CPU idle tasks: */ -const struct sched_class idle_sched_class - __section("__idle_sched_class") = { +DEFINE_SCHED_CLASS(idle) = { + /* no enqueue/yield_task for idle tasks */ /* dequeue is not valid, we print a debug message there: */ diff --git a/kernel/sched/membarrier.c b/kernel/sched/membarrier.c index e23e74d52db5..5a40b3828ff2 100644 --- a/kernel/sched/membarrier.c +++ b/kernel/sched/membarrier.c @@ -7,6 +7,134 @@ #include "sched.h" /* + * For documentation purposes, here are some membarrier ordering + * scenarios to keep in mind: + * + * A) Userspace thread execution after IPI vs membarrier's memory + * barrier before sending the IPI + * + * Userspace variables: + * + * int x = 0, y = 0; + * + * The memory barrier at the start of membarrier() on CPU0 is necessary in + * order to enforce the guarantee that any writes occurring on CPU0 before + * the membarrier() is executed will be visible to any code executing on + * CPU1 after the IPI-induced memory barrier: + * + * CPU0 CPU1 + * + * x = 1 + * membarrier(): + * a: smp_mb() + * b: send IPI IPI-induced mb + * c: smp_mb() + * r2 = y + * y = 1 + * barrier() + * r1 = x + * + * BUG_ON(r1 == 0 && r2 == 0) + * + * The write to y and load from x by CPU1 are unordered by the hardware, + * so it's possible to have "r1 = x" reordered before "y = 1" at any + * point after (b). If the memory barrier at (a) is omitted, then "x = 1" + * can be reordered after (a) (although not after (c)), so we get r1 == 0 + * and r2 == 0. This violates the guarantee that membarrier() is + * supposed by provide. + * + * The timing of the memory barrier at (a) has to ensure that it executes + * before the IPI-induced memory barrier on CPU1. + * + * B) Userspace thread execution before IPI vs membarrier's memory + * barrier after completing the IPI + * + * Userspace variables: + * + * int x = 0, y = 0; + * + * The memory barrier at the end of membarrier() on CPU0 is necessary in + * order to enforce the guarantee that any writes occurring on CPU1 before + * the membarrier() is executed will be visible to any code executing on + * CPU0 after the membarrier(): + * + * CPU0 CPU1 + * + * x = 1 + * barrier() + * y = 1 + * r2 = y + * membarrier(): + * a: smp_mb() + * b: send IPI IPI-induced mb + * c: smp_mb() + * r1 = x + * BUG_ON(r1 == 0 && r2 == 1) + * + * The writes to x and y are unordered by the hardware, so it's possible to + * have "r2 = 1" even though the write to x doesn't execute until (b). If + * the memory barrier at (c) is omitted then "r1 = x" can be reordered + * before (b) (although not before (a)), so we get "r1 = 0". This violates + * the guarantee that membarrier() is supposed to provide. + * + * The timing of the memory barrier at (c) has to ensure that it executes + * after the IPI-induced memory barrier on CPU1. + * + * C) Scheduling userspace thread -> kthread -> userspace thread vs membarrier + * + * CPU0 CPU1 + * + * membarrier(): + * a: smp_mb() + * d: switch to kthread (includes mb) + * b: read rq->curr->mm == NULL + * e: switch to user (includes mb) + * c: smp_mb() + * + * Using the scenario from (A), we can show that (a) needs to be paired + * with (e). Using the scenario from (B), we can show that (c) needs to + * be paired with (d). + * + * D) exit_mm vs membarrier + * + * Two thread groups are created, A and B. Thread group B is created by + * issuing clone from group A with flag CLONE_VM set, but not CLONE_THREAD. + * Let's assume we have a single thread within each thread group (Thread A + * and Thread B). Thread A runs on CPU0, Thread B runs on CPU1. + * + * CPU0 CPU1 + * + * membarrier(): + * a: smp_mb() + * exit_mm(): + * d: smp_mb() + * e: current->mm = NULL + * b: read rq->curr->mm == NULL + * c: smp_mb() + * + * Using scenario (B), we can show that (c) needs to be paired with (d). + * + * E) kthread_{use,unuse}_mm vs membarrier + * + * CPU0 CPU1 + * + * membarrier(): + * a: smp_mb() + * kthread_unuse_mm() + * d: smp_mb() + * e: current->mm = NULL + * b: read rq->curr->mm == NULL + * kthread_use_mm() + * f: current->mm = mm + * g: smp_mb() + * c: smp_mb() + * + * Using the scenario from (A), we can show that (a) needs to be paired + * with (g). Using the scenario from (B), we can show that (c) needs to + * be paired with (d). + */ + +/* * Bitmask made from a "or" of all commands within enum membarrier_cmd, * except MEMBARRIER_CMD_QUERY. */ @@ -76,6 +204,18 @@ void membarrier_exec_mmap(struct mm_struct *mm) this_cpu_write(runqueues.membarrier_state, 0); } +void membarrier_update_current_mm(struct mm_struct *next_mm) +{ + struct rq *rq = this_rq(); + int membarrier_state = 0; + + if (next_mm) + membarrier_state = atomic_read(&next_mm->membarrier_state); + if (READ_ONCE(rq->membarrier_state) == membarrier_state) + return; + WRITE_ONCE(rq->membarrier_state, membarrier_state); +} + static int membarrier_global_expedited(void) { int cpu; @@ -114,12 +254,11 @@ static int membarrier_global_expedited(void) continue; /* - * Skip the CPU if it runs a kernel thread. The scheduler - * leaves the prior task mm in place as an optimization when - * scheduling a kthread. + * Skip the CPU if it runs a kernel thread which is not using + * a task mm. */ p = rcu_dereference(cpu_rq(cpu)->curr); - if (p->flags & PF_KTHREAD) + if (!p->mm) continue; __cpumask_set_cpu(cpu, tmpmask); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index c592e47cafed..c961a3fc0166 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -89,8 +89,8 @@ void init_rt_rq(struct rt_rq *rt_rq) __set_bit(MAX_RT_PRIO, array->bitmap); #if defined CONFIG_SMP - rt_rq->highest_prio.curr = MAX_RT_PRIO; - rt_rq->highest_prio.next = MAX_RT_PRIO; + rt_rq->highest_prio.curr = MAX_RT_PRIO-1; + rt_rq->highest_prio.next = MAX_RT_PRIO-1; rt_rq->rt_nr_migratory = 0; rt_rq->overloaded = 0; plist_head_init(&rt_rq->pushable_tasks); @@ -161,7 +161,7 @@ void init_tg_rt_entry(struct task_group *tg, struct rt_rq *rt_rq, { struct rq *rq = cpu_rq(cpu); - rt_rq->highest_prio.curr = MAX_RT_PRIO; + rt_rq->highest_prio.curr = MAX_RT_PRIO-1; rt_rq->rt_nr_boosted = 0; rt_rq->rq = rq; rt_rq->tg = tg; @@ -393,8 +393,9 @@ static void dequeue_pushable_task(struct rq *rq, struct task_struct *p) p = plist_first_entry(&rq->rt.pushable_tasks, struct task_struct, pushable_tasks); rq->rt.highest_prio.next = p->prio; - } else - rq->rt.highest_prio.next = MAX_RT_PRIO; + } else { + rq->rt.highest_prio.next = MAX_RT_PRIO-1; + } } #else @@ -1147,8 +1148,9 @@ dec_rt_prio(struct rt_rq *rt_rq, int prio) sched_find_first_bit(array->bitmap); } - } else - rt_rq->highest_prio.curr = MAX_RT_PRIO; + } else { + rt_rq->highest_prio.curr = MAX_RT_PRIO-1; + } dec_rt_prio_smp(rt_rq, prio, prev_prio); } @@ -2467,8 +2469,8 @@ static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task) return 0; } -const struct sched_class rt_sched_class - __section("__rt_sched_class") = { +DEFINE_SCHED_CLASS(rt) = { + .enqueue_task = enqueue_task_rt, .dequeue_task = dequeue_task_rt, .yield_task = yield_task_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 56992aaca48e..e897d779839f 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -67,7 +67,6 @@ #include <linux/tsacct_kern.h> #include <asm/tlb.h> -#include <asm-generic/vmlinux.lds.h> #ifdef CONFIG_PARAVIRT # include <asm/paravirt.h> @@ -257,30 +256,6 @@ struct rt_bandwidth { void __dl_clear_params(struct task_struct *p); -/* - * To keep the bandwidth of -deadline tasks and groups under control - * we need some place where: - * - store the maximum -deadline bandwidth of the system (the group); - * - cache the fraction of that bandwidth that is currently allocated. - * - * This is all done in the data structure below. It is similar to the - * one used for RT-throttling (rt_bandwidth), with the main difference - * that, since here we are only interested in admission control, we - * do not decrease any runtime while the group "executes", neither we - * need a timer to replenish it. - * - * With respect to SMP, the bandwidth is given on a per-CPU basis, - * meaning that: - * - dl_bw (< 100%) is the bandwidth of the system (group) on each CPU; - * - dl_total_bw array contains, in the i-eth element, the currently - * allocated bandwidth on the i-eth CPU. - * Moreover, groups consume bandwidth on each CPU, while tasks only - * consume bandwidth on the CPU they're running on. - * Finally, dl_total_bw_cpu is used to cache the index of dl_total_bw - * that will be shown the next time the proc or cgroup controls will - * be red. It on its turn can be changed by writing on its own - * control. - */ struct dl_bandwidth { raw_spinlock_t dl_runtime_lock; u64 dl_runtime; @@ -292,6 +267,24 @@ static inline int dl_bandwidth_enabled(void) return sysctl_sched_rt_runtime >= 0; } +/* + * To keep the bandwidth of -deadline tasks under control + * we need some place where: + * - store the maximum -deadline bandwidth of each cpu; + * - cache the fraction of bandwidth that is currently allocated in + * each root domain; + * + * This is all done in the data structure below. It is similar to the + * one used for RT-throttling (rt_bandwidth), with the main difference + * that, since here we are only interested in admission control, we + * do not decrease any runtime while the group "executes", neither we + * need a timer to replenish it. + * + * With respect to SMP, bandwidth is given on a per root domain basis, + * meaning that: + * - bw (< 100%) is the deadline bandwidth of each CPU; + * - total_bw is the currently allocated bandwidth in each root domain; + */ struct dl_bw { raw_spinlock_t lock; u64 bw; @@ -801,6 +794,15 @@ struct root_domain { struct dl_bw dl_bw; struct cpudl cpudl; + /* + * Indicate whether a root_domain's dl_bw has been checked or + * updated. It's monotonously increasing value. + * + * Also, some corner cases, like 'wrap around' is dangerous, but given + * that u64 is 'big enough'. So that shouldn't be a concern. + */ + u64 visit_gen; + #ifdef HAVE_RT_PUSH_IPI /* * For IPI pull requests, loop across the rto_mask. @@ -1864,7 +1866,7 @@ struct sched_class { #ifdef CONFIG_FAIR_GROUP_SCHED void (*task_change_group)(struct task_struct *p, int type); #endif -} __aligned(STRUCT_ALIGNMENT); /* STRUCT_ALIGN(), vmlinux.lds.h */ +}; static inline void put_prev_task(struct rq *rq, struct task_struct *prev) { @@ -1878,6 +1880,20 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next) next->sched_class->set_next_task(rq, next, false); } + +/* + * Helper to define a sched_class instance; each one is placed in a separate + * section which is ordered by the linker script: + * + * include/asm-generic/vmlinux.lds.h + * + * Also enforce alignment on the instance, not the type, to guarantee layout. + */ +#define DEFINE_SCHED_CLASS(name) \ +const struct sched_class name##_sched_class \ + __aligned(__alignof__(struct sched_class)) \ + __section("__" #name "_sched_class") + /* Defined in include/asm-generic/vmlinux.lds.h */ extern struct sched_class __begin_sched_classes[]; extern struct sched_class __end_sched_classes[]; diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c index ceb5b6b12561..91bb10cc070e 100644 --- a/kernel/sched/stop_task.c +++ b/kernel/sched/stop_task.c @@ -109,8 +109,7 @@ static void update_curr_stop(struct rq *rq) /* * Simple, special scheduling class for the per-CPU stop tasks: */ -const struct sched_class stop_sched_class - __section("__stop_sched_class") = { +DEFINE_SCHED_CLASS(stop) = { .enqueue_task = enqueue_task_stop, .dequeue_task = dequeue_task_stop, diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index dd7770226086..90f3e5558fa2 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -516,6 +516,7 @@ static int init_rootdomain(struct root_domain *rd) init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); #endif + rd->visit_gen = 0; init_dl_bw(&rd->dl_bw); if (cpudl_init(&rd->cpudl) != 0) goto free_rto_mask; |