diff options
Diffstat (limited to 'kernel')
-rw-r--r-- | kernel/Makefile | 3 | ||||
-rw-r--r-- | kernel/bpf/core.c | 100 | ||||
-rw-r--r-- | kernel/bpf/sockmap.c | 18 | ||||
-rw-r--r-- | kernel/bpf/verifier.c | 145 | ||||
-rw-r--r-- | kernel/iomem.c | 167 | ||||
-rw-r--r-- | kernel/kthread.c | 6 | ||||
-rw-r--r-- | kernel/locking/rwsem-xadd.c | 19 | ||||
-rw-r--r-- | kernel/locking/rwsem.c | 2 | ||||
-rw-r--r-- | kernel/locking/rwsem.h | 30 | ||||
-rw-r--r-- | kernel/memremap.c | 210 | ||||
-rw-r--r-- | kernel/resource.c | 1 | ||||
-rw-r--r-- | kernel/sched/deadline.c | 4 | ||||
-rw-r--r-- | kernel/sched/rt.c | 2 | ||||
-rw-r--r-- | kernel/sched/sched.h | 5 | ||||
-rw-r--r-- | kernel/sched/topology.c | 2 | ||||
-rw-r--r-- | kernel/seccomp.c | 21 | ||||
-rw-r--r-- | kernel/sys.c | 28 | ||||
-rw-r--r-- | kernel/time/tick-broadcast.c | 8 |
18 files changed, 492 insertions, 279 deletions
diff --git a/kernel/Makefile b/kernel/Makefile index f85ae5dfa474..9b9241361311 100644 --- a/kernel/Makefile +++ b/kernel/Makefile @@ -112,7 +112,8 @@ obj-$(CONFIG_JUMP_LABEL) += jump_label.o obj-$(CONFIG_CONTEXT_TRACKING) += context_tracking.o obj-$(CONFIG_TORTURE_TEST) += torture.o -obj-$(CONFIG_HAS_IOMEM) += memremap.o +obj-$(CONFIG_HAS_IOMEM) += iomem.o +obj-$(CONFIG_ZONE_DEVICE) += memremap.o $(obj)/configs.o: $(obj)/config_data.h diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c index ba03ec39efb3..6ef6746a7871 100644 --- a/kernel/bpf/core.c +++ b/kernel/bpf/core.c @@ -218,47 +218,84 @@ int bpf_prog_calc_tag(struct bpf_prog *fp) return 0; } -static void bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta) +static int bpf_adj_delta_to_imm(struct bpf_insn *insn, u32 pos, u32 delta, + u32 curr, const bool probe_pass) { + const s64 imm_min = S32_MIN, imm_max = S32_MAX; + s64 imm = insn->imm; + + if (curr < pos && curr + imm + 1 > pos) + imm += delta; + else if (curr > pos + delta && curr + imm + 1 <= pos + delta) + imm -= delta; + if (imm < imm_min || imm > imm_max) + return -ERANGE; + if (!probe_pass) + insn->imm = imm; + return 0; +} + +static int bpf_adj_delta_to_off(struct bpf_insn *insn, u32 pos, u32 delta, + u32 curr, const bool probe_pass) +{ + const s32 off_min = S16_MIN, off_max = S16_MAX; + s32 off = insn->off; + + if (curr < pos && curr + off + 1 > pos) + off += delta; + else if (curr > pos + delta && curr + off + 1 <= pos + delta) + off -= delta; + if (off < off_min || off > off_max) + return -ERANGE; + if (!probe_pass) + insn->off = off; + return 0; +} + +static int bpf_adj_branches(struct bpf_prog *prog, u32 pos, u32 delta, + const bool probe_pass) +{ + u32 i, insn_cnt = prog->len + (probe_pass ? delta : 0); struct bpf_insn *insn = prog->insnsi; - u32 i, insn_cnt = prog->len; - bool pseudo_call; - u8 code; - int off; + int ret = 0; for (i = 0; i < insn_cnt; i++, insn++) { + u8 code; + + /* In the probing pass we still operate on the original, + * unpatched image in order to check overflows before we + * do any other adjustments. Therefore skip the patchlet. + */ + if (probe_pass && i == pos) { + i += delta + 1; + insn++; + } code = insn->code; - if (BPF_CLASS(code) != BPF_JMP) - continue; - if (BPF_OP(code) == BPF_EXIT) + if (BPF_CLASS(code) != BPF_JMP || + BPF_OP(code) == BPF_EXIT) continue; + /* Adjust offset of jmps if we cross patch boundaries. */ if (BPF_OP(code) == BPF_CALL) { - if (insn->src_reg == BPF_PSEUDO_CALL) - pseudo_call = true; - else + if (insn->src_reg != BPF_PSEUDO_CALL) continue; + ret = bpf_adj_delta_to_imm(insn, pos, delta, i, + probe_pass); } else { - pseudo_call = false; + ret = bpf_adj_delta_to_off(insn, pos, delta, i, + probe_pass); } - off = pseudo_call ? insn->imm : insn->off; - - /* Adjust offset of jmps if we cross boundaries. */ - if (i < pos && i + off + 1 > pos) - off += delta; - else if (i > pos + delta && i + off + 1 <= pos + delta) - off -= delta; - - if (pseudo_call) - insn->imm = off; - else - insn->off = off; + if (ret) + break; } + + return ret; } struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, const struct bpf_insn *patch, u32 len) { u32 insn_adj_cnt, insn_rest, insn_delta = len - 1; + const u32 cnt_max = S16_MAX; struct bpf_prog *prog_adj; /* Since our patchlet doesn't expand the image, we're done. */ @@ -269,6 +306,15 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, insn_adj_cnt = prog->len + insn_delta; + /* Reject anything that would potentially let the insn->off + * target overflow when we have excessive program expansions. + * We need to probe here before we do any reallocation where + * we afterwards may not fail anymore. + */ + if (insn_adj_cnt > cnt_max && + bpf_adj_branches(prog, off, insn_delta, true)) + return NULL; + /* Several new instructions need to be inserted. Make room * for them. Likely, there's no need for a new allocation as * last page could have large enough tailroom. @@ -294,7 +340,11 @@ struct bpf_prog *bpf_patch_insn_single(struct bpf_prog *prog, u32 off, sizeof(*patch) * insn_rest); memcpy(prog_adj->insnsi + off, patch, sizeof(*patch) * len); - bpf_adj_branches(prog_adj, off, insn_delta); + /* We are guaranteed to not fail at this point, otherwise + * the ship has sailed to reverse to the original state. An + * overflow cannot happen at this point. + */ + BUG_ON(bpf_adj_branches(prog_adj, off, insn_delta, false)); return prog_adj; } diff --git a/kernel/bpf/sockmap.c b/kernel/bpf/sockmap.c index 098eca568c2b..95a84b2f10ce 100644 --- a/kernel/bpf/sockmap.c +++ b/kernel/bpf/sockmap.c @@ -1703,11 +1703,11 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, * we increment the refcnt. If this is the case abort with an * error. */ - verdict = bpf_prog_inc_not_zero(stab->bpf_verdict); + verdict = bpf_prog_inc_not_zero(verdict); if (IS_ERR(verdict)) return PTR_ERR(verdict); - parse = bpf_prog_inc_not_zero(stab->bpf_parse); + parse = bpf_prog_inc_not_zero(parse); if (IS_ERR(parse)) { bpf_prog_put(verdict); return PTR_ERR(parse); @@ -1715,12 +1715,12 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, } if (tx_msg) { - tx_msg = bpf_prog_inc_not_zero(stab->bpf_tx_msg); + tx_msg = bpf_prog_inc_not_zero(tx_msg); if (IS_ERR(tx_msg)) { - if (verdict) - bpf_prog_put(verdict); - if (parse) + if (parse && verdict) { bpf_prog_put(parse); + bpf_prog_put(verdict); + } return PTR_ERR(tx_msg); } } @@ -1805,10 +1805,10 @@ static int sock_map_ctx_update_elem(struct bpf_sock_ops_kern *skops, out_free: smap_release_sock(psock, sock); out_progs: - if (verdict) - bpf_prog_put(verdict); - if (parse) + if (parse && verdict) { bpf_prog_put(parse); + bpf_prog_put(verdict); + } if (tx_msg) bpf_prog_put(tx_msg); write_unlock_bh(&sock->sk_callback_lock); diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c index 5dd1dcb902bf..1904e814f282 100644 --- a/kernel/bpf/verifier.c +++ b/kernel/bpf/verifier.c @@ -156,7 +156,29 @@ struct bpf_verifier_stack_elem { #define BPF_COMPLEXITY_LIMIT_INSNS 131072 #define BPF_COMPLEXITY_LIMIT_STACK 1024 -#define BPF_MAP_PTR_POISON ((void *)0xeB9F + POISON_POINTER_DELTA) +#define BPF_MAP_PTR_UNPRIV 1UL +#define BPF_MAP_PTR_POISON ((void *)((0xeB9FUL << 1) + \ + POISON_POINTER_DELTA)) +#define BPF_MAP_PTR(X) ((struct bpf_map *)((X) & ~BPF_MAP_PTR_UNPRIV)) + +static bool bpf_map_ptr_poisoned(const struct bpf_insn_aux_data *aux) +{ + return BPF_MAP_PTR(aux->map_state) == BPF_MAP_PTR_POISON; +} + +static bool bpf_map_ptr_unpriv(const struct bpf_insn_aux_data *aux) +{ + return aux->map_state & BPF_MAP_PTR_UNPRIV; +} + +static void bpf_map_ptr_store(struct bpf_insn_aux_data *aux, + const struct bpf_map *map, bool unpriv) +{ + BUILD_BUG_ON((unsigned long)BPF_MAP_PTR_POISON & BPF_MAP_PTR_UNPRIV); + unpriv |= bpf_map_ptr_unpriv(aux); + aux->map_state = (unsigned long)map | + (unpriv ? BPF_MAP_PTR_UNPRIV : 0UL); +} struct bpf_call_arg_meta { struct bpf_map *map_ptr; @@ -978,7 +1000,7 @@ static bool register_is_null(struct bpf_reg_state *reg) */ static int check_stack_write(struct bpf_verifier_env *env, struct bpf_func_state *state, /* func where register points to */ - int off, int size, int value_regno) + int off, int size, int value_regno, int insn_idx) { struct bpf_func_state *cur; /* state of the current function */ int i, slot = -off - 1, spi = slot / BPF_REG_SIZE, err; @@ -1017,8 +1039,33 @@ static int check_stack_write(struct bpf_verifier_env *env, state->stack[spi].spilled_ptr = cur->regs[value_regno]; state->stack[spi].spilled_ptr.live |= REG_LIVE_WRITTEN; - for (i = 0; i < BPF_REG_SIZE; i++) + for (i = 0; i < BPF_REG_SIZE; i++) { + if (state->stack[spi].slot_type[i] == STACK_MISC && + !env->allow_ptr_leaks) { + int *poff = &env->insn_aux_data[insn_idx].sanitize_stack_off; + int soff = (-spi - 1) * BPF_REG_SIZE; + + /* detected reuse of integer stack slot with a pointer + * which means either llvm is reusing stack slot or + * an attacker is trying to exploit CVE-2018-3639 + * (speculative store bypass) + * Have to sanitize that slot with preemptive + * store of zero. + */ + if (*poff && *poff != soff) { + /* disallow programs where single insn stores + * into two different stack slots, since verifier + * cannot sanitize them + */ + verbose(env, + "insn %d cannot access two stack slots fp%d and fp%d", + insn_idx, *poff, soff); + return -EINVAL; + } + *poff = soff; + } state->stack[spi].slot_type[i] = STACK_SPILL; + } } else { u8 type = STACK_MISC; @@ -1694,7 +1741,7 @@ static int check_mem_access(struct bpf_verifier_env *env, int insn_idx, u32 regn if (t == BPF_WRITE) err = check_stack_write(env, state, off, size, - value_regno); + value_regno, insn_idx); else err = check_stack_read(env, state, off, size, value_regno); @@ -2333,6 +2380,29 @@ static int prepare_func_exit(struct bpf_verifier_env *env, int *insn_idx) return 0; } +static int +record_func_map(struct bpf_verifier_env *env, struct bpf_call_arg_meta *meta, + int func_id, int insn_idx) +{ + struct bpf_insn_aux_data *aux = &env->insn_aux_data[insn_idx]; + + if (func_id != BPF_FUNC_tail_call && + func_id != BPF_FUNC_map_lookup_elem) + return 0; + if (meta->map_ptr == NULL) { + verbose(env, "kernel subsystem misconfigured verifier\n"); + return -EINVAL; + } + + if (!BPF_MAP_PTR(aux->map_state)) + bpf_map_ptr_store(aux, meta->map_ptr, + meta->map_ptr->unpriv_array); + else if (BPF_MAP_PTR(aux->map_state) != meta->map_ptr) + bpf_map_ptr_store(aux, BPF_MAP_PTR_POISON, + meta->map_ptr->unpriv_array); + return 0; +} + static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn_idx) { const struct bpf_func_proto *fn = NULL; @@ -2387,13 +2457,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn err = check_func_arg(env, BPF_REG_2, fn->arg2_type, &meta); if (err) return err; - if (func_id == BPF_FUNC_tail_call) { - if (meta.map_ptr == NULL) { - verbose(env, "verifier bug\n"); - return -EINVAL; - } - env->insn_aux_data[insn_idx].map_ptr = meta.map_ptr; - } err = check_func_arg(env, BPF_REG_3, fn->arg3_type, &meta); if (err) return err; @@ -2404,6 +2467,10 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn if (err) return err; + err = record_func_map(env, &meta, func_id, insn_idx); + if (err) + return err; + /* Mark slots with STACK_MISC in case of raw mode, stack offset * is inferred from register state. */ @@ -2428,8 +2495,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } else if (fn->ret_type == RET_VOID) { regs[BPF_REG_0].type = NOT_INIT; } else if (fn->ret_type == RET_PTR_TO_MAP_VALUE_OR_NULL) { - struct bpf_insn_aux_data *insn_aux; - regs[BPF_REG_0].type = PTR_TO_MAP_VALUE_OR_NULL; /* There is no offset yet applied, variable or fixed */ mark_reg_known_zero(env, regs, BPF_REG_0); @@ -2445,11 +2510,6 @@ static int check_helper_call(struct bpf_verifier_env *env, int func_id, int insn } regs[BPF_REG_0].map_ptr = meta.map_ptr; regs[BPF_REG_0].id = ++env->id_gen; - insn_aux = &env->insn_aux_data[insn_idx]; - if (!insn_aux->map_ptr) - insn_aux->map_ptr = meta.map_ptr; - else if (insn_aux->map_ptr != meta.map_ptr) - insn_aux->map_ptr = BPF_MAP_PTR_POISON; } else { verbose(env, "unknown return type %d of func %s#%d\n", fn->ret_type, func_id_name(func_id), func_id); @@ -5169,6 +5229,34 @@ static int convert_ctx_accesses(struct bpf_verifier_env *env) else continue; + if (type == BPF_WRITE && + env->insn_aux_data[i + delta].sanitize_stack_off) { + struct bpf_insn patch[] = { + /* Sanitize suspicious stack slot with zero. + * There are no memory dependencies for this store, + * since it's only using frame pointer and immediate + * constant of zero + */ + BPF_ST_MEM(BPF_DW, BPF_REG_FP, + env->insn_aux_data[i + delta].sanitize_stack_off, + 0), + /* the original STX instruction will immediately + * overwrite the same stack slot with appropriate value + */ + *insn, + }; + + cnt = ARRAY_SIZE(patch); + new_prog = bpf_patch_insn_data(env, i + delta, patch, cnt); + if (!new_prog) + return -ENOMEM; + + delta += cnt - 1; + env->prog = new_prog; + insn = new_prog->insnsi + i + delta; + continue; + } + if (env->insn_aux_data[i + delta].ptr_type != PTR_TO_CTX) continue; @@ -5417,6 +5505,7 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) struct bpf_insn *insn = prog->insnsi; const struct bpf_func_proto *fn; const int insn_cnt = prog->len; + struct bpf_insn_aux_data *aux; struct bpf_insn insn_buf[16]; struct bpf_prog *new_prog; struct bpf_map *map_ptr; @@ -5491,19 +5580,22 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) insn->imm = 0; insn->code = BPF_JMP | BPF_TAIL_CALL; + aux = &env->insn_aux_data[i + delta]; + if (!bpf_map_ptr_unpriv(aux)) + continue; + /* instead of changing every JIT dealing with tail_call * emit two extra insns: * if (index >= max_entries) goto out; * index &= array->index_mask; * to avoid out-of-bounds cpu speculation */ - map_ptr = env->insn_aux_data[i + delta].map_ptr; - if (map_ptr == BPF_MAP_PTR_POISON) { + if (bpf_map_ptr_poisoned(aux)) { verbose(env, "tail_call abusing map_ptr\n"); return -EINVAL; } - if (!map_ptr->unpriv_array) - continue; + + map_ptr = BPF_MAP_PTR(aux->map_state); insn_buf[0] = BPF_JMP_IMM(BPF_JGE, BPF_REG_3, map_ptr->max_entries, 2); insn_buf[1] = BPF_ALU32_IMM(BPF_AND, BPF_REG_3, @@ -5527,9 +5619,12 @@ static int fixup_bpf_calls(struct bpf_verifier_env *env) */ if (prog->jit_requested && BITS_PER_LONG == 64 && insn->imm == BPF_FUNC_map_lookup_elem) { - map_ptr = env->insn_aux_data[i + delta].map_ptr; - if (map_ptr == BPF_MAP_PTR_POISON || - !map_ptr->ops->map_gen_lookup) + aux = &env->insn_aux_data[i + delta]; + if (bpf_map_ptr_poisoned(aux)) + goto patch_call_imm; + + map_ptr = BPF_MAP_PTR(aux->map_state); + if (!map_ptr->ops->map_gen_lookup) goto patch_call_imm; cnt = map_ptr->ops->map_gen_lookup(map_ptr, insn_buf); diff --git a/kernel/iomem.c b/kernel/iomem.c new file mode 100644 index 000000000000..f7525e14ebc6 --- /dev/null +++ b/kernel/iomem.c @@ -0,0 +1,167 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#include <linux/device.h> +#include <linux/types.h> +#include <linux/io.h> +#include <linux/mm.h> + +#ifndef ioremap_cache +/* temporary while we convert existing ioremap_cache users to memremap */ +__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) +{ + return ioremap(offset, size); +} +#endif + +#ifndef arch_memremap_wb +static void *arch_memremap_wb(resource_size_t offset, unsigned long size) +{ + return (__force void *)ioremap_cache(offset, size); +} +#endif + +#ifndef arch_memremap_can_ram_remap +static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, + unsigned long flags) +{ + return true; +} +#endif + +static void *try_ram_remap(resource_size_t offset, size_t size, + unsigned long flags) +{ + unsigned long pfn = PHYS_PFN(offset); + + /* In the simple case just return the existing linear address */ + if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) && + arch_memremap_can_ram_remap(offset, size, flags)) + return __va(offset); + + return NULL; /* fallback to arch_memremap_wb */ +} + +/** + * memremap() - remap an iomem_resource as cacheable memory + * @offset: iomem resource start address + * @size: size of remap + * @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC, + * MEMREMAP_ENC, MEMREMAP_DEC + * + * memremap() is "ioremap" for cases where it is known that the resource + * being mapped does not have i/o side effects and the __iomem + * annotation is not applicable. In the case of multiple flags, the different + * mapping types will be attempted in the order listed below until one of + * them succeeds. + * + * MEMREMAP_WB - matches the default mapping for System RAM on + * the architecture. This is usually a read-allocate write-back cache. + * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM + * memremap() will bypass establishing a new mapping and instead return + * a pointer into the direct map. + * + * MEMREMAP_WT - establish a mapping whereby writes either bypass the + * cache or are written through to memory and never exist in a + * cache-dirty state with respect to program visibility. Attempts to + * map System RAM with this mapping type will fail. + * + * MEMREMAP_WC - establish a writecombine mapping, whereby writes may + * be coalesced together (e.g. in the CPU's write buffers), but is otherwise + * uncached. Attempts to map System RAM with this mapping type will fail. + */ +void *memremap(resource_size_t offset, size_t size, unsigned long flags) +{ + int is_ram = region_intersects(offset, size, + IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); + void *addr = NULL; + + if (!flags) + return NULL; + + if (is_ram == REGION_MIXED) { + WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n", + &offset, (unsigned long) size); + return NULL; + } + + /* Try all mapping types requested until one returns non-NULL */ + if (flags & MEMREMAP_WB) { + /* + * MEMREMAP_WB is special in that it can be satisifed + * from the direct map. Some archs depend on the + * capability of memremap() to autodetect cases where + * the requested range is potentially in System RAM. + */ + if (is_ram == REGION_INTERSECTS) + addr = try_ram_remap(offset, size, flags); + if (!addr) + addr = arch_memremap_wb(offset, size); + } + + /* + * If we don't have a mapping yet and other request flags are + * present then we will be attempting to establish a new virtual + * address mapping. Enforce that this mapping is not aliasing + * System RAM. + */ + if (!addr && is_ram == REGION_INTERSECTS && flags != MEMREMAP_WB) { + WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n", + &offset, (unsigned long) size); + return NULL; + } + + if (!addr && (flags & MEMREMAP_WT)) + addr = ioremap_wt(offset, size); + + if (!addr && (flags & MEMREMAP_WC)) + addr = ioremap_wc(offset, size); + + return addr; +} +EXPORT_SYMBOL(memremap); + +void memunmap(void *addr) +{ + if (is_vmalloc_addr(addr)) + iounmap((void __iomem *) addr); +} +EXPORT_SYMBOL(memunmap); + +static void devm_memremap_release(struct device *dev, void *res) +{ + memunmap(*(void **)res); +} + +static int devm_memremap_match(struct device *dev, void *res, void *match_data) +{ + return *(void **)res == match_data; +} + +void *devm_memremap(struct device *dev, resource_size_t offset, + size_t size, unsigned long flags) +{ + void **ptr, *addr; + + ptr = devres_alloc_node(devm_memremap_release, sizeof(*ptr), GFP_KERNEL, + dev_to_node(dev)); + if (!ptr) + return ERR_PTR(-ENOMEM); + + addr = memremap(offset, size, flags); + if (addr) { + *ptr = addr; + devres_add(dev, ptr); + } else { + devres_free(ptr); + return ERR_PTR(-ENXIO); + } + + return addr; +} +EXPORT_SYMBOL(devm_memremap); + +void devm_memunmap(struct device *dev, void *addr) +{ + WARN_ON(devres_release(dev, devm_memremap_release, + devm_memremap_match, addr)); +} +EXPORT_SYMBOL(devm_memunmap); diff --git a/kernel/kthread.c b/kernel/kthread.c index 2017a39ab490..481951bf091d 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -193,7 +193,7 @@ EXPORT_SYMBOL_GPL(kthread_parkme); void kthread_park_complete(struct task_struct *k) { - complete(&to_kthread(k)->parked); + complete_all(&to_kthread(k)->parked); } static int kthread(void *_create) @@ -459,6 +459,7 @@ void kthread_unpark(struct task_struct *k) if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags)) __kthread_bind(k, kthread->cpu, TASK_PARKED); + reinit_completion(&kthread->parked); clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags); wake_up_state(k, TASK_PARKED); } @@ -483,9 +484,6 @@ int kthread_park(struct task_struct *k) if (WARN_ON(k->flags & PF_EXITING)) return -ENOSYS; - if (WARN_ON_ONCE(test_bit(KTHREAD_SHOULD_PARK, &kthread->flags))) - return -EBUSY; - set_bit(KTHREAD_SHOULD_PARK, &kthread->flags); if (k != current) { wake_up_process(k); diff --git a/kernel/locking/rwsem-xadd.c b/kernel/locking/rwsem-xadd.c index e795908f3607..a90336779375 100644 --- a/kernel/locking/rwsem-xadd.c +++ b/kernel/locking/rwsem-xadd.c @@ -352,16 +352,15 @@ static inline bool rwsem_can_spin_on_owner(struct rw_semaphore *sem) struct task_struct *owner; bool ret = true; + BUILD_BUG_ON(!rwsem_has_anonymous_owner(RWSEM_OWNER_UNKNOWN)); + if (need_resched()) return false; rcu_read_lock(); owner = READ_ONCE(sem->owner); - if (!rwsem_owner_is_writer(owner)) { - /* - * Don't spin if the rwsem is readers owned. - */ - ret = !rwsem_owner_is_reader(owner); + if (!owner || !is_rwsem_owner_spinnable(owner)) { + ret = !owner; /* !owner is spinnable */ goto done; } @@ -382,11 +381,11 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) { struct task_struct *owner = READ_ONCE(sem->owner); - if (!rwsem_owner_is_writer(owner)) - goto out; + if (!is_rwsem_owner_spinnable(owner)) + return false; rcu_read_lock(); - while (sem->owner == owner) { + while (owner && (READ_ONCE(sem->owner) == owner)) { /* * Ensure we emit the owner->on_cpu, dereference _after_ * checking sem->owner still matches owner, if that fails, @@ -408,12 +407,12 @@ static noinline bool rwsem_spin_on_owner(struct rw_semaphore *sem) cpu_relax(); } rcu_read_unlock(); -out: + /* * If there is a new owner or the owner is not set, we continue * spinning. */ - return !rwsem_owner_is_reader(READ_ONCE(sem->owner)); + return is_rwsem_owner_spinnable(READ_ONCE(sem->owner)); } static bool rwsem_optimistic_spin(struct rw_semaphore *sem) diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index 30465a2f2b6c..bc1e507be9ff 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -221,5 +221,3 @@ void up_read_non_owner(struct rw_semaphore *sem) EXPORT_SYMBOL(up_read_non_owner); #endif - - diff --git a/kernel/locking/rwsem.h b/kernel/locking/rwsem.h index a17cba8d94bb..b9d0e72aa80f 100644 --- a/kernel/locking/rwsem.h +++ b/kernel/locking/rwsem.h @@ -1,20 +1,24 @@ /* SPDX-License-Identifier: GPL-2.0 */ /* * The owner field of the rw_semaphore structure will be set to - * RWSEM_READ_OWNED when a reader grabs the lock. A writer will clear + * RWSEM_READER_OWNED when a reader grabs the lock. A writer will clear * the owner field when it unlocks. A reader, on the other hand, will * not touch the owner field when it unlocks. * - * In essence, the owner field now has the following 3 states: + * In essence, the owner field now has the following 4 states: * 1) 0 * - lock is free or the owner hasn't set the field yet * 2) RWSEM_READER_OWNED * - lock is currently or previously owned by readers (lock is free * or not set by owner yet) - * 3) Other non-zero value - * - a writer owns the lock + * 3) RWSEM_ANONYMOUSLY_OWNED bit set with some other bits set as well + * - lock is owned by an anonymous writer, so spinning on the lock + * owner should be disabled. + * 4) Other non-zero value + * - a writer owns the lock and other writers can spin on the lock owner. */ -#define RWSEM_READER_OWNED ((struct task_struct *)1UL) +#define RWSEM_ANONYMOUSLY_OWNED (1UL << 0) +#define RWSEM_READER_OWNED ((struct task_struct *)RWSEM_ANONYMOUSLY_OWNED) #ifdef CONFIG_DEBUG_RWSEMS # define DEBUG_RWSEMS_WARN_ON(c) DEBUG_LOCKS_WARN_ON(c) @@ -51,14 +55,22 @@ static inline void rwsem_set_reader_owned(struct rw_semaphore *sem) WRITE_ONCE(sem->owner, RWSEM_READER_OWNED); } -static inline bool rwsem_owner_is_writer(struct task_struct *owner) +/* + * Return true if the a rwsem waiter can spin on the rwsem's owner + * and steal the lock, i.e. the lock is not anonymously owned. + * N.B. !owner is considered spinnable. + */ +static inline bool is_rwsem_owner_spinnable(struct task_struct *owner) { - return owner && owner != RWSEM_READER_OWNED; + return !((unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED); } -static inline bool rwsem_owner_is_reader(struct task_struct *owner) +/* + * Return true if rwsem is owned by an anonymous writer or readers. + */ +static inline bool rwsem_has_anonymous_owner(struct task_struct *owner) { - return owner == RWSEM_READER_OWNED; + return (unsigned long)owner & RWSEM_ANONYMOUSLY_OWNED; } #else static inline void rwsem_set_owner(struct rw_semaphore *sem) diff --git a/kernel/memremap.c b/kernel/memremap.c index 895e6b76b25e..5857267a4af5 100644 --- a/kernel/memremap.c +++ b/kernel/memremap.c @@ -1,15 +1,5 @@ -/* - * Copyright(c) 2015 Intel Corporation. All rights reserved. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of version 2 of the GNU General Public License as - * published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License for more details. - */ +/* SPDX-License-Identifier: GPL-2.0 */ +/* Copyright(c) 2015 Intel Corporation. All rights reserved. */ #include <linux/radix-tree.h> #include <linux/device.h> #include <linux/types.h> @@ -19,170 +9,8 @@ #include <linux/memory_hotplug.h> #include <linux/swap.h> #include <linux/swapops.h> +#include <linux/wait_bit.h> -#ifndef ioremap_cache -/* temporary while we convert existing ioremap_cache users to memremap */ -__weak void __iomem *ioremap_cache(resource_size_t offset, unsigned long size) -{ - return ioremap(offset, size); -} -#endif - -#ifndef arch_memremap_wb -static void *arch_memremap_wb(resource_size_t offset, unsigned long size) -{ - return (__force void *)ioremap_cache(offset, size); -} -#endif - -#ifndef arch_memremap_can_ram_remap -static bool arch_memremap_can_ram_remap(resource_size_t offset, size_t size, - unsigned long flags) -{ - return true; -} -#endif - -static void *try_ram_remap(resource_size_t offset, size_t size, - unsigned long flags) -{ - unsigned long pfn = PHYS_PFN(offset); - - /* In the simple case just return the existing linear address */ - if (pfn_valid(pfn) && !PageHighMem(pfn_to_page(pfn)) && - arch_memremap_can_ram_remap(offset, size, flags)) - return __va(offset); - - return NULL; /* fallback to arch_memremap_wb */ -} - -/** - * memremap() - remap an iomem_resource as cacheable memory - * @offset: iomem resource start address - * @size: size of remap - * @flags: any of MEMREMAP_WB, MEMREMAP_WT, MEMREMAP_WC, - * MEMREMAP_ENC, MEMREMAP_DEC - * - * memremap() is "ioremap" for cases where it is known that the resource - * being mapped does not have i/o side effects and the __iomem - * annotation is not applicable. In the case of multiple flags, the different - * mapping types will be attempted in the order listed below until one of - * them succeeds. - * - * MEMREMAP_WB - matches the default mapping for System RAM on - * the architecture. This is usually a read-allocate write-back cache. - * Morever, if MEMREMAP_WB is specified and the requested remap region is RAM - * memremap() will bypass establishing a new mapping and instead return - * a pointer into the direct map. - * - * MEMREMAP_WT - establish a mapping whereby writes either bypass the - * cache or are written through to memory and never exist in a - * cache-dirty state with respect to program visibility. Attempts to - * map System RAM with this mapping type will fail. - * - * MEMREMAP_WC - establish a writecombine mapping, whereby writes may - * be coalesced together (e.g. in the CPU's write buffers), but is otherwise - * uncached. Attempts to map System RAM with this mapping type will fail. - */ -void *memremap(resource_size_t offset, size_t size, unsigned long flags) -{ - int is_ram = region_intersects(offset, size, - IORESOURCE_SYSTEM_RAM, IORES_DESC_NONE); - void *addr = NULL; - - if (!flags) - return NULL; - - if (is_ram == REGION_MIXED) { - WARN_ONCE(1, "memremap attempted on mixed range %pa size: %#lx\n", - &offset, (unsigned long) size); - return NULL; - } - - /* Try all mapping types requested until one returns non-NULL */ - if (flags & MEMREMAP_WB) { - /* - * MEMREMAP_WB is special in that it can be satisifed - * from the direct map. Some archs depend on the - * capability of memremap() to autodetect cases where - * the requested range is potentially in System RAM. - */ - if (is_ram == REGION_INTERSECTS) - addr = try_ram_remap(offset, size, flags); - if (!addr) - addr = arch_memremap_wb(offset, size); - } - - /* - * If we don't have a mapping yet and other request flags are - * present then we will be attempting to establish a new virtual - * address mapping. Enforce that this mapping is not aliasing - * System RAM. - */ - if (!addr && is_ram == REGION_INTERSECTS && flags != MEMREMAP_WB) { - WARN_ONCE(1, "memremap attempted on ram %pa size: %#lx\n", - &offset, (unsigned long) size); - return NULL; - } - - if (!addr && (flags & MEMREMAP_WT)) - addr = ioremap_wt(offset, size); - - if (!addr && (flags & MEMREMAP_WC)) - addr = ioremap_wc(offset, size); - - return addr; -} -EXPORT_SYMBOL(memremap); - -void memunmap(void *addr) -{ - if (is_vmalloc_addr(addr)) - iounmap((void __iomem *) addr); -} -EXPORT_SYMBOL(memunmap); - -static void devm_memremap_release(struct device *dev, void *res) -{ - memunmap(*(void **)res); -} - -static int devm_memremap_match(struct device *dev, void *res, void *match_data) -{ - return *(void **)res == match_data; -} - -void *devm_memremap(struct device *dev, resource_size_t offset, - size_t size, unsigned long flags) -{ - void **ptr, *addr; - - ptr = devres_alloc_node(devm_memremap_release, sizeof(*ptr), GFP_KERNEL, - dev_to_node(dev)); - if (!ptr) - return ERR_PTR(-ENOMEM); - - addr = memremap(offset, size, flags); - if (addr) { - *ptr = addr; - devres_add(dev, ptr); - } else { - devres_free(ptr); - return ERR_PTR(-ENXIO); - } - - return addr; -} -EXPORT_SYMBOL(devm_memremap); - -void devm_memunmap(struct device *dev, void *addr) -{ - WARN_ON(devres_release(dev, devm_memremap_release, - devm_memremap_match, addr)); -} -EXPORT_SYMBOL(devm_memunmap); - -#ifdef CONFIG_ZONE_DEVICE static DEFINE_MUTEX(pgmap_lock); static RADIX_TREE(pgmap_radix, GFP_KERNEL); #define SECTION_MASK ~((1UL << PA_SECTION_SHIFT) - 1) @@ -473,10 +301,32 @@ struct dev_pagemap *get_dev_pagemap(unsigned long pfn, return pgmap; } -#endif /* CONFIG_ZONE_DEVICE */ +EXPORT_SYMBOL_GPL(get_dev_pagemap); + +#ifdef CONFIG_DEV_PAGEMAP_OPS +DEFINE_STATIC_KEY_FALSE(devmap_managed_key); +EXPORT_SYMBOL_GPL(devmap_managed_key); +static atomic_t devmap_enable; + +/* + * Toggle the static key for ->page_free() callbacks when dev_pagemap + * pages go idle. + */ +void dev_pagemap_get_ops(void) +{ + if (atomic_inc_return(&devmap_enable) == 1) + static_branch_enable(&devmap_managed_key); +} +EXPORT_SYMBOL_GPL(dev_pagemap_get_ops); + +void dev_pagemap_put_ops(void) +{ + if (atomic_dec_and_test(&devmap_enable)) + static_branch_disable(&devmap_managed_key); +} +EXPORT_SYMBOL_GPL(dev_pagemap_put_ops); -#if IS_ENABLED(CONFIG_DEVICE_PRIVATE) || IS_ENABLED(CONFIG_DEVICE_PUBLIC) -void put_zone_device_private_or_public_page(struct page *page) +void __put_devmap_managed_page(struct page *page) { int count = page_ref_dec_return(page); @@ -496,5 +346,5 @@ void put_zone_device_private_or_public_page(struct page *page) } else if (!count) __put_page(page); } -EXPORT_SYMBOL(put_zone_device_private_or_public_page); -#endif /* CONFIG_DEVICE_PRIVATE || CONFIG_DEVICE_PUBLIC */ +EXPORT_SYMBOL_GPL(__put_devmap_managed_page); +#endif /* CONFIG_DEV_PAGEMAP_OPS */ diff --git a/kernel/resource.c b/kernel/resource.c index 2af6c03858b9..b85f59e8a4b8 100644 --- a/kernel/resource.c +++ b/kernel/resource.c @@ -448,6 +448,7 @@ int walk_iomem_res_desc(unsigned long desc, unsigned long flags, u64 start, return __walk_iomem_res_desc(&res, desc, false, arg, func); } +EXPORT_SYMBOL_GPL(walk_iomem_res_desc); /* * This function calls the @func callback against all memory ranges of type diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index e7b3008b85bb..1356afd1eeb6 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1117,7 +1117,7 @@ extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); * should be larger than 2^(64 - 20 - 8), which is more than 64 seconds. * So, overflow is not an issue here. */ -u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) +static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) { u64 u_inact = rq->dl.this_bw - rq->dl.running_bw; /* Utot - Uact */ u64 u_act; @@ -2731,8 +2731,6 @@ bool dl_cpu_busy(unsigned int cpu) #endif #ifdef CONFIG_SCHED_DEBUG -extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); - void print_dl_stats(struct seq_file *m, int cpu) { print_dl_rq(m, cpu, &cpu_rq(cpu)->dl); diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index 7aef6b4e885a..ef3c4e6f5345 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -2701,8 +2701,6 @@ int sched_rr_handler(struct ctl_table *table, int write, } #ifdef CONFIG_SCHED_DEBUG -extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); - void print_rt_stats(struct seq_file *m, int cpu) { rt_rq_iter_t iter; diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index 15750c222ca2..1f0a4bc6a39d 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -2025,8 +2025,9 @@ extern bool sched_debug_enabled; extern void print_cfs_stats(struct seq_file *m, int cpu); extern void print_rt_stats(struct seq_file *m, int cpu); extern void print_dl_stats(struct seq_file *m, int cpu); -extern void -print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); +extern void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq); +extern void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq); +extern void print_dl_rq(struct seq_file *m, int cpu, struct dl_rq *dl_rq); #ifdef CONFIG_NUMA_BALANCING extern void show_numa_stats(struct task_struct *p, struct seq_file *m); diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 64cc564f5255..61a1125c1ae4 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -1708,7 +1708,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att rcu_read_unlock(); if (rq && sched_debug_enabled) { - pr_info("span: %*pbl (max cpu_capacity = %lu)\n", + pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n", cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity); } diff --git a/kernel/seccomp.c b/kernel/seccomp.c index dc77548167ef..e691d9a6c58d 100644 --- a/kernel/seccomp.c +++ b/kernel/seccomp.c @@ -19,6 +19,8 @@ #include <linux/compat.h> #include <linux/coredump.h> #include <linux/kmemleak.h> +#include <linux/nospec.h> +#include <linux/prctl.h> #include <linux/sched.h> #include <linux/sched/task_stack.h> #include <linux/seccomp.h> @@ -227,8 +229,11 @@ static inline bool seccomp_may_assign_mode(unsigned long seccomp_mode) return true; } +void __weak arch_seccomp_spec_mitigate(struct task_struct *task) { } + static inline void seccomp_assign_mode(struct task_struct *task, - unsigned long seccomp_mode) + unsigned long seccomp_mode, + unsigned long flags) { assert_spin_locked(&task->sighand->siglock); @@ -238,6 +243,9 @@ static inline void seccomp_assign_mode(struct task_struct *task, * filter) is set. */ smp_mb__before_atomic(); + /* Assume default seccomp processes want spec flaw mitigation. */ + if ((flags & SECCOMP_FILTER_FLAG_SPEC_ALLOW) == 0) + arch_seccomp_spec_mitigate(task); set_tsk_thread_flag(task, TIF_SECCOMP); } @@ -305,7 +313,7 @@ static inline pid_t seccomp_can_sync_threads(void) * without dropping the locks. * */ -static inline void seccomp_sync_threads(void) +static inline void seccomp_sync_threads(unsigned long flags) { struct task_struct *thread, *caller; @@ -346,7 +354,8 @@ static inline void seccomp_sync_threads(void) * allow one thread to transition the other. */ if (thread->seccomp.mode == SECCOMP_MODE_DISABLED) - seccomp_assign_mode(thread, SECCOMP_MODE_FILTER); + seccomp_assign_mode(thread, SECCOMP_MODE_FILTER, + flags); } } @@ -469,7 +478,7 @@ static long seccomp_attach_filter(unsigned int flags, /* Now that the new filter is in place, synchronize to all threads. */ if (flags & SECCOMP_FILTER_FLAG_TSYNC) - seccomp_sync_threads(); + seccomp_sync_threads(flags); return 0; } @@ -818,7 +827,7 @@ static long seccomp_set_mode_strict(void) #ifdef TIF_NOTSC disable_TSC(); #endif - seccomp_assign_mode(current, seccomp_mode); + seccomp_assign_mode(current, seccomp_mode, 0); ret = 0; out: @@ -876,7 +885,7 @@ static long seccomp_set_mode_filter(unsigned int flags, /* Do not free the successfully attached filter. */ prepared = NULL; - seccomp_assign_mode(current, seccomp_mode); + seccomp_assign_mode(current, seccomp_mode, flags); out: spin_unlock_irq(¤t->sighand->siglock); if (flags & SECCOMP_FILTER_FLAG_TSYNC) diff --git a/kernel/sys.c b/kernel/sys.c index ad692183dfe9..d1b2b8d934bb 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -61,6 +61,8 @@ #include <linux/uidgid.h> #include <linux/cred.h> +#include <linux/nospec.h> + #include <linux/kmsg_dump.h> /* Move somewhere else to avoid recompiling? */ #include <generated/utsrelease.h> @@ -69,6 +71,9 @@ #include <asm/io.h> #include <asm/unistd.h> +/* Hardening for Spectre-v1 */ +#include <linux/nospec.h> + #include "uid16.h" #ifndef SET_UNALIGN_CTL @@ -1451,6 +1456,7 @@ SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, if (resource >= RLIM_NLIMITS) return -EINVAL; + resource = array_index_nospec(resource, RLIM_NLIMITS); task_lock(current->group_leader); x = current->signal->rlim[resource]; task_unlock(current->group_leader); @@ -1470,6 +1476,7 @@ COMPAT_SYSCALL_DEFINE2(old_getrlimit, unsigned int, resource, if (resource >= RLIM_NLIMITS) return -EINVAL; + resource = array_index_nospec(resource, RLIM_NLIMITS); task_lock(current->group_leader); r = current->signal->rlim[resource]; task_unlock(current->group_leader); @@ -2242,6 +2249,17 @@ static int propagate_has_child_subreaper(struct task_struct *p, void *data) return 1; } +int __weak arch_prctl_spec_ctrl_get(struct task_struct *t, unsigned long which) +{ + return -EINVAL; +} + +int __weak arch_prctl_spec_ctrl_set(struct task_struct *t, unsigned long which, + unsigned long ctrl) +{ + return -EINVAL; +} + SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, unsigned long, arg4, unsigned long, arg5) { @@ -2450,6 +2468,16 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_SVE_GET_VL: error = SVE_GET_VL(); break; + case PR_GET_SPECULATION_CTRL: + if (arg3 || arg4 || arg5) + return -EINVAL; + error = arch_prctl_spec_ctrl_get(me, arg2); + break; + case PR_SET_SPECULATION_CTRL: + if (arg4 || arg5) + return -EINVAL; + error = arch_prctl_spec_ctrl_set(me, arg2, arg3); + break; default: error = -EINVAL; break; diff --git a/kernel/time/tick-broadcast.c b/kernel/time/tick-broadcast.c index b398c2ea69b2..aa2094d5dd27 100644 --- a/kernel/time/tick-broadcast.c +++ b/kernel/time/tick-broadcast.c @@ -612,6 +612,14 @@ static void tick_handle_oneshot_broadcast(struct clock_event_device *dev) now = ktime_get(); /* Find all expired events */ for_each_cpu(cpu, tick_broadcast_oneshot_mask) { + /* + * Required for !SMP because for_each_cpu() reports + * unconditionally CPU0 as set on UP kernels. + */ + if (!IS_ENABLED(CONFIG_SMP) && + cpumask_empty(tick_broadcast_oneshot_mask)) + break; + td = &per_cpu(tick_cpu_device, cpu); if (td->evtdev->next_event <= now) { cpumask_set_cpu(cpu, tmpmask); |