54 files changed, 399 insertions, 273 deletions
diff --git a/Documentation/static-keys.txt b/Documentation/static-keys.txt index f4cb0b2d5cd7..477927becacb 100644 --- a/Documentation/static-keys.txt +++ b/Documentation/static-keys.txt @@ -15,8 +15,8 @@ The updated API replacements are: DEFINE_STATIC_KEY_TRUE(key); DEFINE_STATIC_KEY_FALSE(key); -static_key_likely() -statick_key_unlikely() +static_branch_likely() +static_branch_unlikely() 0) Abstract diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 7d95663c0160..07d1811aa03f 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -32,6 +32,7 @@ config ARM64 select GENERIC_CLOCKEVENTS_BROADCAST select GENERIC_CPU_AUTOPROBE select GENERIC_EARLY_IOREMAP + select GENERIC_IDLE_POLL_SETUP select GENERIC_IRQ_PROBE select GENERIC_IRQ_SHOW select GENERIC_IRQ_SHOW_LEVEL @@ -331,6 +332,22 @@ config ARM64_ERRATUM_845719 If unsure, say Y. +config ARM64_ERRATUM_843419 + bool "Cortex-A53: 843419: A load or store might access an incorrect address" + depends on MODULES + default y + help + This option builds kernel modules using the large memory model in + order to avoid the use of the ADRP instruction, which can cause + a subsequent memory access to use an incorrect address on Cortex-A53 + parts up to r0p4. + + Note that the kernel itself must be linked with a version of ld + which fixes potentially affected ADRP instructions through the + use of veneers. + + If unsure, say Y. + endmenu diff --git a/arch/arm64/Makefile b/arch/arm64/Makefile index 15ff5b4156fd..f9914d7c1bb0 100644 --- a/arch/arm64/Makefile +++ b/arch/arm64/Makefile @@ -41,6 +41,10 @@ endif CHECKFLAGS += -D__aarch64__ +ifeq ($(CONFIG_ARM64_ERRATUM_843419), y) +CFLAGS_MODULE += -mcmodel=large +endif + # Default value head-y := arch/arm64/kernel/head.o diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 6900b2d95371..b0329be95cb1 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -26,13 +26,9 @@ * Software defined PTE bits definition. */ #define PTE_VALID (_AT(pteval_t, 1) << 0) +#define PTE_WRITE (PTE_DBM) /* same as DBM (51) */ #define PTE_DIRTY (_AT(pteval_t, 1) << 55) #define PTE_SPECIAL (_AT(pteval_t, 1) << 56) -#ifdef CONFIG_ARM64_HW_AFDBM -#define PTE_WRITE (PTE_DBM) /* same as DBM */ -#else -#define PTE_WRITE (_AT(pteval_t, 1) << 57) -#endif #define PTE_PROT_NONE (_AT(pteval_t, 1) << 58) /* only when !PTE_VALID */ /* @@ -146,7 +142,7 @@ extern struct page *empty_zero_page; #define pte_exec(pte) (!(pte_val(pte) & PTE_UXN)) #ifdef CONFIG_ARM64_HW_AFDBM -#define pte_hw_dirty(pte) (!(pte_val(pte) & PTE_RDONLY)) +#define pte_hw_dirty(pte) (pte_write(pte) && !(pte_val(pte) & PTE_RDONLY)) #else #define pte_hw_dirty(pte) (0) #endif @@ -238,7 +234,7 @@ extern void __sync_icache_dcache(pte_t pteval, unsigned long addr); * When hardware DBM is not present, the sofware PTE_DIRTY bit is updated via * the page fault mechanism. 
Checking the dirty status of a pte becomes: * - * PTE_DIRTY || !PTE_RDONLY + * PTE_DIRTY || (PTE_WRITE && !PTE_RDONLY) */ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) @@ -503,7 +499,7 @@ static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) PTE_PROT_NONE | PTE_WRITE | PTE_TYPE_MASK; /* preserve the hardware dirty information */ if (pte_hw_dirty(pte)) - newprot |= PTE_DIRTY; + pte = pte_mkdirty(pte); pte_val(pte) = (pte_val(pte) & ~mask) | (pgprot_val(newprot) & mask); return pte; } diff --git a/arch/arm64/kernel/debug-monitors.c b/arch/arm64/kernel/debug-monitors.c index 9b3b62ac9c24..cebf78661a55 100644 --- a/arch/arm64/kernel/debug-monitors.c +++ b/arch/arm64/kernel/debug-monitors.c @@ -134,7 +134,7 @@ static int os_lock_notify(struct notifier_block *self, unsigned long action, void *data) { int cpu = (unsigned long)data; - if (action == CPU_ONLINE) + if ((action & ~CPU_TASKS_FROZEN) == CPU_ONLINE) smp_call_function_single(cpu, clear_os_lock, NULL, 1); return NOTIFY_OK; } diff --git a/arch/arm64/kernel/head.S b/arch/arm64/kernel/head.S index a055be6125cf..90d09eddd5b2 100644 --- a/arch/arm64/kernel/head.S +++ b/arch/arm64/kernel/head.S @@ -523,6 +523,11 @@ CPU_LE( movk x0, #0x30d0, lsl #16 ) // Clear EE and E0E on LE systems msr hstr_el2, xzr // Disable CP15 traps to EL2 #endif + /* EL2 debug */ + mrs x0, pmcr_el0 // Disable debug access traps + ubfx x0, x0, #11, #5 // to EL2 and allow access to + msr mdcr_el2, x0 // all PMU counters from EL1 + /* Stage-2 translation */ msr vttbr_el2, xzr diff --git a/arch/arm64/kernel/hw_breakpoint.c b/arch/arm64/kernel/hw_breakpoint.c index c97040ecf838..bba85c8f8037 100644 --- a/arch/arm64/kernel/hw_breakpoint.c +++ b/arch/arm64/kernel/hw_breakpoint.c @@ -872,7 +872,7 @@ static int hw_breakpoint_reset_notify(struct notifier_block *self, void *hcpu) { int cpu = (long)hcpu; - if (action == CPU_ONLINE) + if ((action & ~CPU_TASKS_FROZEN) == CPU_ONLINE) smp_call_function_single(cpu, hw_breakpoint_reset, NULL, 1); return NOTIFY_OK; } diff --git a/arch/arm64/kernel/module.c b/arch/arm64/kernel/module.c index 67bf4107f6ef..876eb8df50bf 100644 --- a/arch/arm64/kernel/module.c +++ b/arch/arm64/kernel/module.c @@ -332,12 +332,14 @@ int apply_relocate_add(Elf64_Shdr *sechdrs, ovf = reloc_insn_imm(RELOC_OP_PREL, loc, val, 0, 21, AARCH64_INSN_IMM_ADR); break; +#ifndef CONFIG_ARM64_ERRATUM_843419 case R_AARCH64_ADR_PREL_PG_HI21_NC: overflow_check = false; case R_AARCH64_ADR_PREL_PG_HI21: ovf = reloc_insn_imm(RELOC_OP_PAGE, loc, val, 12, 21, AARCH64_INSN_IMM_ADR); break; +#endif case R_AARCH64_ADD_ABS_LO12_NC: case R_AARCH64_LDST8_ABS_LO12_NC: overflow_check = false; diff --git a/arch/arm64/kernel/signal32.c b/arch/arm64/kernel/signal32.c index 948f0ad2de23..71ef6dc89ae5 100644 --- a/arch/arm64/kernel/signal32.c +++ b/arch/arm64/kernel/signal32.c @@ -212,14 +212,32 @@ int copy_siginfo_from_user32(siginfo_t *to, compat_siginfo_t __user *from) /* * VFP save/restore code. + * + * We have to be careful with endianness, since the fpsimd context-switch + * code operates on 128-bit (Q) register values whereas the compat ABI + * uses an array of 64-bit (D) registers. Consequently, we need to swap + * the two halves of each Q register when running on a big-endian CPU. 
*/ +union __fpsimd_vreg { + __uint128_t raw; + struct { +#ifdef __AARCH64EB__ + u64 hi; + u64 lo; +#else + u64 lo; + u64 hi; +#endif + }; +}; + static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame) { struct fpsimd_state *fpsimd = ¤t->thread.fpsimd_state; compat_ulong_t magic = VFP_MAGIC; compat_ulong_t size = VFP_STORAGE_SIZE; compat_ulong_t fpscr, fpexc; - int err = 0; + int i, err = 0; /* * Save the hardware registers to the fpsimd_state structure. @@ -235,10 +253,15 @@ static int compat_preserve_vfp_context(struct compat_vfp_sigframe __user *frame) /* * Now copy the FP registers. Since the registers are packed, * we can copy the prefix we want (V0-V15) as it is. - * FIXME: Won't work if big endian. */ - err |= __copy_to_user(&frame->ufp.fpregs, fpsimd->vregs, - sizeof(frame->ufp.fpregs)); + for (i = 0; i < ARRAY_SIZE(frame->ufp.fpregs); i += 2) { + union __fpsimd_vreg vreg = { + .raw = fpsimd->vregs[i >> 1], + }; + + __put_user_error(vreg.lo, &frame->ufp.fpregs[i], err); + __put_user_error(vreg.hi, &frame->ufp.fpregs[i + 1], err); + } /* Create an AArch32 fpscr from the fpsr and the fpcr. */ fpscr = (fpsimd->fpsr & VFP_FPSCR_STAT_MASK) | @@ -263,7 +286,7 @@ static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame) compat_ulong_t magic = VFP_MAGIC; compat_ulong_t size = VFP_STORAGE_SIZE; compat_ulong_t fpscr; - int err = 0; + int i, err = 0; __get_user_error(magic, &frame->magic, err); __get_user_error(size, &frame->size, err); @@ -273,12 +296,14 @@ static int compat_restore_vfp_context(struct compat_vfp_sigframe __user *frame) if (magic != VFP_MAGIC || size != VFP_STORAGE_SIZE) return -EINVAL; - /* - * Copy the FP registers into the start of the fpsimd_state. - * FIXME: Won't work if big endian. - */ - err |= __copy_from_user(fpsimd.vregs, frame->ufp.fpregs, - sizeof(frame->ufp.fpregs)); + /* Copy the FP registers into the start of the fpsimd_state. */ + for (i = 0; i < ARRAY_SIZE(frame->ufp.fpregs); i += 2) { + union __fpsimd_vreg vreg; + + __get_user_error(vreg.lo, &frame->ufp.fpregs[i], err); + __get_user_error(vreg.hi, &frame->ufp.fpregs[i + 1], err); + fpsimd.vregs[i >> 1] = vreg.raw; + } /* Extract the fpsr and the fpcr from the fpscr */ __get_user_error(fpscr, &frame->ufp.fpscr, err); diff --git a/arch/arm64/mm/dma-mapping.c b/arch/arm64/mm/dma-mapping.c index 0bcc4bc94b4a..99224dcebdc5 100644 --- a/arch/arm64/mm/dma-mapping.c +++ b/arch/arm64/mm/dma-mapping.c @@ -100,7 +100,7 @@ static void *__dma_alloc_coherent(struct device *dev, size_t size, if (IS_ENABLED(CONFIG_ZONE_DMA) && dev->coherent_dma_mask <= DMA_BIT_MASK(32)) flags |= GFP_DMA; - if (IS_ENABLED(CONFIG_DMA_CMA) && (flags & __GFP_WAIT)) { + if (dev_get_cma_area(dev) && (flags & __GFP_WAIT)) { struct page *page; void *addr; diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 7aef2d52daa0..328c8352480c 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1006,7 +1006,7 @@ config X86_THERMAL_VECTOR depends on X86_MCE_INTEL config X86_LEGACY_VM86 - bool "Legacy VM86 support (obsolete)" + bool "Legacy VM86 support" default n depends on X86_32 ---help--- @@ -1018,19 +1018,20 @@ config X86_LEGACY_VM86 available to accelerate real mode DOS programs. However, any recent version of DOSEMU, X, or vbetool should be fully functional even without kernel VM86 support, as they will all - fall back to (pretty well performing) software emulation. + fall back to software emulation. 
Nevertheless, if you are using + a 16-bit DOS program where 16-bit performance matters, vm86 + mode might be faster than emulation and you might want to + enable this option. - Anything that works on a 64-bit kernel is unlikely to need - this option, as 64-bit kernels don't, and can't, support V8086 - mode. This option is also unrelated to 16-bit protected mode - and is not needed to run most 16-bit programs under Wine. + Note that any app that works on a 64-bit kernel is unlikely to + need this option, as 64-bit kernels don't, and can't, support + V8086 mode. This option is also unrelated to 16-bit protected + mode and is not needed to run most 16-bit programs under Wine. - Enabling this option adds considerable attack surface to the - kernel and slows down system calls and exception handling. + Enabling this option increases the complexity of the kernel + and slows down exception handling a tiny bit. - Unless you use very old userspace or need the last drop of - performance in your real mode DOS games and can't use KVM, - say N here. + If unsure, say N here. config VM86 bool diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 477fc28050e4..e6cf2ad350d1 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -241,6 +241,7 @@ #define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ #define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ #define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ +#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ /* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */ #define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index ce029e4fa7c6..31247b5bff7c 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -97,7 +97,6 @@ struct pv_lazy_ops { struct pv_time_ops { unsigned long long (*sched_clock)(void); unsigned long long (*steal_clock)(int cpu); - unsigned long (*get_tsc_khz)(void); }; struct pv_cpu_ops { diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index 9d51fae1cba3..eaba08076030 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -39,18 +39,27 @@ static inline void queued_spin_unlock(struct qspinlock *lock) } #endif -#define virt_queued_spin_lock virt_queued_spin_lock - -static inline bool virt_queued_spin_lock(struct qspinlock *lock) +#ifdef CONFIG_PARAVIRT +#define virt_spin_lock virt_spin_lock +static inline bool virt_spin_lock(struct qspinlock *lock) { if (!static_cpu_has(X86_FEATURE_HYPERVISOR)) return false; - while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0) - cpu_relax(); + /* + * On hypervisors without PARAVIRT_SPINLOCKS support we fall + * back to a Test-and-Set spinlock, because fair locks have + * horrible lock 'holder' preemption issues. 
+ */ + + do { + while (atomic_read(&lock->val) != 0) + cpu_relax(); + } while (atomic_cmpxchg(&lock->val, 0, _Q_LOCKED_VAL) != 0); return true; } +#endif /* CONFIG_PARAVIRT */ #include <asm-generic/qspinlock.h> diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index c42827eb86cf..25f909362b7a 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -338,10 +338,15 @@ done: static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr) { + unsigned long flags; + if (instr[0] != 0x90) return; + local_irq_save(flags); add_nops(instr + (a->instrlen - a->padlen), a->padlen); + sync_core(); + local_irq_restore(flags); DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ", instr, a->instrlen - a->padlen, a->padlen); diff --git a/arch/x86/kernel/apic/apic.c b/arch/x86/kernel/apic/apic.c index 3ca3e46aa405..24e94ce454e2 100644 --- a/arch/x86/kernel/apic/apic.c +++ b/arch/x86/kernel/apic/apic.c @@ -336,6 +336,13 @@ static void __setup_APIC_LVTT(unsigned int clocks, int oneshot, int irqen) apic_write(APIC_LVTT, lvtt_value); if (lvtt_value & APIC_LVT_TIMER_TSCDEADLINE) { + /* + * See Intel SDM: TSC-Deadline Mode chapter. In xAPIC mode, + * writing to the APIC LVTT and TSC_DEADLINE MSR isn't serialized. + * According to Intel, MFENCE can do the serialization here. + */ + asm volatile("mfence" : : : "memory"); + printk_once(KERN_DEBUG "TSC deadline timer enabled\n"); return; } diff --git a/arch/x86/kernel/apic/io_apic.c b/arch/x86/kernel/apic/io_apic.c index 38a76f826530..5c60bb162622 100644 --- a/arch/x86/kernel/apic/io_apic.c +++ b/arch/x86/kernel/apic/io_apic.c @@ -2522,6 +2522,7 @@ void __init setup_ioapic_dest(void) int pin, ioapic, irq, irq_entry; const struct cpumask *mask; struct irq_data *idata; + struct irq_chip *chip; if (skip_ioapic_setup == 1) return; @@ -2545,9 +2546,9 @@ void __init setup_ioapic_dest(void) else mask = apic->target_cpus(); - irq_set_affinity(irq, mask); + chip = irq_data_get_irq_chip(idata); + chip->irq_set_affinity(idata, mask, false); } - } #endif diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 07ce52c22ec8..de22ea7ff82f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1110,10 +1110,10 @@ void print_cpu_info(struct cpuinfo_x86 *c) else printk(KERN_CONT "%d86", c->x86); - printk(KERN_CONT " (fam: %02x, model: %02x", c->x86, c->x86_model); + printk(KERN_CONT " (family: 0x%x, model: 0x%x", c->x86, c->x86_model); if (c->x86_mask || c->cpuid_level >= 0) - printk(KERN_CONT ", stepping: %02x)\n", c->x86_mask); + printk(KERN_CONT ", stepping: 0x%x)\n", c->x86_mask); else printk(KERN_CONT ")\n"); diff --git a/arch/x86/kernel/cpu/perf_event_intel.c b/arch/x86/kernel/cpu/perf_event_intel.c index cd9b6d0b10bf..3fefebfbdf4b 100644 --- a/arch/x86/kernel/cpu/perf_event_intel.c +++ b/arch/x86/kernel/cpu/perf_event_intel.c @@ -2316,9 +2316,12 @@ static struct event_constraint * intel_get_event_constraints(struct cpu_hw_events *cpuc, int idx, struct perf_event *event) { - struct event_constraint *c1 = cpuc->event_constraint[idx]; + struct event_constraint *c1 = NULL; struct event_constraint *c2; + if (idx >= 0) /* fake does < 0 */ + c1 = cpuc->event_constraint[idx]; + /* * first time only * - static constraint: no change across incremental scheduling calls diff --git a/arch/x86/kernel/cpu/perf_event_intel_bts.c b/arch/x86/kernel/cpu/perf_event_intel_bts.c index 54690e885759..d1c0f254afbe 100644 --- a/arch/x86/kernel/cpu/perf_event_intel_bts.c +++ 
b/arch/x86/kernel/cpu/perf_event_intel_bts.c @@ -222,6 +222,7 @@ static void __bts_event_start(struct perf_event *event) if (!buf || bts_buffer_is_full(buf, bts)) return; + event->hw.itrace_started = 1; event->hw.state = 0; if (!buf->snapshot) diff --git a/arch/x86/kernel/ldt.c b/arch/x86/kernel/ldt.c index 2bcc0525f1c1..6acc9dd91f36 100644 --- a/arch/x86/kernel/ldt.c +++ b/arch/x86/kernel/ldt.c @@ -58,7 +58,7 @@ static struct ldt_struct *alloc_ldt_struct(int size) if (alloc_size > PAGE_SIZE) new_ldt->entries = vzalloc(alloc_size); else - new_ldt->entries = kzalloc(PAGE_SIZE, GFP_KERNEL); + new_ldt->entries = (void *)get_zeroed_page(GFP_KERNEL); if (!new_ldt->entries) { kfree(new_ldt); @@ -95,7 +95,7 @@ static void free_ldt_struct(struct ldt_struct *ldt) if (ldt->size * LDT_ENTRY_SIZE > PAGE_SIZE) vfree(ldt->entries); else - kfree(ldt->entries); + free_page((unsigned long)ldt->entries); kfree(ldt); } diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index c8d52cb4cb6e..c3f7602cd038 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -21,6 +21,7 @@ #include <asm/hypervisor.h> #include <asm/nmi.h> #include <asm/x86_init.h> +#include <asm/geode.h> unsigned int __read_mostly cpu_khz; /* TSC clocks / usec, not used here */ EXPORT_SYMBOL(cpu_khz); @@ -1013,15 +1014,17 @@ EXPORT_SYMBOL_GPL(mark_tsc_unstable); static void __init check_system_tsc_reliable(void) { -#ifdef CONFIG_MGEODE_LX - /* RTSC counts during suspend */ +#if defined(CONFIG_MGEODEGX1) || defined(CONFIG_MGEODE_LX) || defined(CONFIG_X86_GENERIC) + if (is_geode_lx()) { + /* RTSC counts during suspend */ #define RTSC_SUSP 0x100 - unsigned long res_low, res_high; + unsigned long res_low, res_high; - rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); - /* Geode_LX - the OLPC CPU has a very reliable TSC */ - if (res_low & RTSC_SUSP) - tsc_clocksource_reliable = 1; + rdmsr_safe(MSR_GEODE_BUSCONT_CONF0, &res_low, &res_high); + /* Geode_LX - the OLPC CPU has a very reliable TSC */ + if (res_low & RTSC_SUSP) + tsc_clocksource_reliable = 1; + } #endif if (boot_cpu_has(X86_FEATURE_TSC_RELIABLE)) tsc_clocksource_reliable = 1; diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index abd8b856bd2b..524619351961 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -45,6 +45,7 @@ #include <linux/audit.h> #include <linux/stddef.h> #include <linux/slab.h> +#include <linux/security.h> #include <asm/uaccess.h> #include <asm/io.h> @@ -232,6 +233,32 @@ static long do_sys_vm86(struct vm86plus_struct __user *user_vm86, bool plus) struct pt_regs *regs = current_pt_regs(); unsigned long err = 0; + err = security_mmap_addr(0); + if (err) { + /* + * vm86 cannot virtualize the address space, so vm86 users + * need to manage the low 1MB themselves using mmap. Given + * that BIOS places important data in the first page, vm86 + * is essentially useless if mmap_min_addr != 0. DOSEMU, + * for example, won't even bother trying to use vm86 if it + * can't map a page at virtual address 0. + * + * To reduce the available kernel attack surface, simply + * disallow vm86(old) for users who cannot mmap at va 0. + * + * The implementation of security_mmap_addr will allow + * suitably privileged users to map va 0 even if + * vm.mmap_min_addr is set above 0, and we want this + * behavior for vm86 as well, as it ensures that legacy + * tools like vbetool will not fail just because of + * vm.mmap_min_addr. + */ + pr_info_once("Denied a call to vm86(old) from %s[%d] (uid: %d). 
Set the vm.mmap_min_addr sysctl to 0 and/or adjust LSM mmap_min_addr policy to enable vm86 if you are using a vm86-based DOS emulator.\n", + current->comm, task_pid_nr(current), + from_kuid_munged(&init_user_ns, current_uid())); + return -EPERM; + } + if (!vm86) { if (!(vm86 = kzalloc(sizeof(*vm86), GFP_KERNEL))) return -ENOMEM; diff --git a/arch/x86/mm/srat.c b/arch/x86/mm/srat.c index 66338a60aa6e..c2aea63bee20 100644 --- a/arch/x86/mm/srat.c +++ b/arch/x86/mm/srat.c @@ -192,10 +192,11 @@ acpi_numa_memory_affinity_init(struct acpi_srat_mem_affinity *ma) node_set(node, numa_nodes_parsed); - pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s\n", + pr_info("SRAT: Node %u PXM %u [mem %#010Lx-%#010Lx]%s%s\n", node, pxm, (unsigned long long) start, (unsigned long long) end - 1, - hotpluggable ? " hotplug" : ""); + hotpluggable ? " hotplug" : "", + ma->flags & ACPI_SRAT_MEM_NON_VOLATILE ? " non-volatile" : ""); /* Mark hotplug range in memblock. */ if (hotpluggable && memblock_mark_hotplug(start, ma->length)) diff --git a/drivers/power/twl4030_charger.c b/drivers/power/twl4030_charger.c index f4f2c1f76c32..74f2d3ff1d7c 100644 --- a/drivers/power/twl4030_charger.c +++ b/drivers/power/twl4030_charger.c @@ -91,7 +91,7 @@ #define TWL4030_MSTATEC_COMPLETE1 0x0b #define TWL4030_MSTATEC_COMPLETE4 0x0e -#if IS_ENABLED(CONFIG_TWL4030_MADC) +#if IS_REACHABLE(CONFIG_TWL4030_MADC) /* * If AC (Accessory Charger) voltage exceeds 4.5V (MADC 11) * then AC is available. @@ -1057,13 +1057,9 @@ static int twl4030_bci_probe(struct platform_device *pdev) phynode = of_find_compatible_node(bci->dev->of_node->parent, NULL, "ti,twl4030-usb"); - if (phynode) { + if (phynode) bci->transceiver = devm_usb_get_phy_by_node( bci->dev, phynode, &bci->usb_nb); - if (IS_ERR(bci->transceiver) && - PTR_ERR(bci->transceiver) == -EPROBE_DEFER) - return -EPROBE_DEFER; - } } /* Enable interrupts now. 
*/ diff --git a/include/asm-generic/qspinlock.h b/include/asm-generic/qspinlock.h index 83bfb87f5bf1..e2aadbc7151f 100644 --- a/include/asm-generic/qspinlock.h +++ b/include/asm-generic/qspinlock.h @@ -111,8 +111,8 @@ static inline void queued_spin_unlock_wait(struct qspinlock *lock) cpu_relax(); } -#ifndef virt_queued_spin_lock -static __always_inline bool virt_queued_spin_lock(struct qspinlock *lock) +#ifndef virt_spin_lock +static __always_inline bool virt_spin_lock(struct qspinlock *lock) { return false; } diff --git a/include/linux/ceph/ceph_features.h b/include/linux/ceph/ceph_features.h index 4763ad64e832..f89b31d45cc8 100644 --- a/include/linux/ceph/ceph_features.h +++ b/include/linux/ceph/ceph_features.h @@ -107,6 +107,7 @@ static inline u64 ceph_sanitize_features(u64 features) CEPH_FEATURE_OSDMAP_ENC | \ CEPH_FEATURE_CRUSH_TUNABLES3 | \ CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \ + CEPH_FEATURE_MSGR_KEEPALIVE2 | \ CEPH_FEATURE_CRUSH_V4) #define CEPH_FEATURES_REQUIRED_DEFAULT \ diff --git a/include/linux/ceph/messenger.h b/include/linux/ceph/messenger.h index 7e1252e97a30..b2371d9b51fa 100644 --- a/include/linux/ceph/messenger.h +++ b/include/linux/ceph/messenger.h @@ -238,6 +238,8 @@ struct ceph_connection { bool out_kvec_is_msg; /* kvec refers to out_msg */ int out_more; /* there is more data after the kvecs */ __le64 out_temp_ack; /* for writing an ack */ + struct ceph_timespec out_temp_keepalive2; /* for writing keepalive2 + stamp */ /* message in temps */ struct ceph_msg_header in_hdr; @@ -248,7 +250,7 @@ struct ceph_connection { int in_base_pos; /* bytes read */ __le64 in_temp_ack; /* for reading an ack */ - struct timespec last_keepalive_ack; + struct timespec last_keepalive_ack; /* keepalive2 ack stamp */ struct delayed_work work; /* send|recv work */ unsigned long delay; /* current delay interval */ diff --git a/include/linux/clockchips.h b/include/linux/clockchips.h index 31ce435981fe..bdcf358dfce2 100644 --- a/include/linux/clockchips.h +++ b/include/linux/clockchips.h @@ -18,15 +18,6 @@ struct clock_event_device; struct module; -/* Clock event mode commands for legacy ->set_mode(): OBSOLETE */ -enum clock_event_mode { - CLOCK_EVT_MODE_UNUSED, - CLOCK_EVT_MODE_SHUTDOWN, - CLOCK_EVT_MODE_PERIODIC, - CLOCK_EVT_MODE_ONESHOT, - CLOCK_EVT_MODE_RESUME, -}; - /* * Possible states of a clock event device. * @@ -86,16 +77,14 @@ enum clock_event_state { * @min_delta_ns: minimum delta value in ns * @mult: nanosecond to cycles multiplier * @shift: nanoseconds to cycles divisor (power of two) - * @mode: operating mode, relevant only to ->set_mode(), OBSOLETE * @state_use_accessors:current state of the device, assigned by the core code * @features: features * @retries: number of forced programming retries - * @set_mode: legacy set mode function, only for modes <= CLOCK_EVT_MODE_RESUME. 
- * @set_state_periodic: switch state to periodic, if !set_mode - * @set_state_oneshot: switch state to oneshot, if !set_mode - * @set_state_oneshot_stopped: switch state to oneshot_stopped, if !set_mode - * @set_state_shutdown: switch state to shutdown, if !set_mode - * @tick_resume: resume clkevt device, if !set_mode + * @set_state_periodic: switch state to periodic + * @set_state_oneshot: switch state to oneshot + * @set_state_oneshot_stopped: switch state to oneshot_stopped + * @set_state_shutdown: switch state to shutdown + * @tick_resume: resume clkevt device * @broadcast: function to broadcast events * @min_delta_ticks: minimum delta value in ticks stored for reconfiguration * @max_delta_ticks: maximum delta value in ticks stored for reconfiguration @@ -116,18 +105,10 @@ struct clock_event_device { u64 min_delta_ns; u32 mult; u32 shift; - enum clock_event_mode mode; enum clock_event_state state_use_accessors; unsigned int features; unsigned long retries; - /* - * State transition callback(s): Only one of the two groups should be - * defined: - * - set_mode(), only for modes <= CLOCK_EVT_MODE_RESUME. - * - set_state_{shutdown|periodic|oneshot|oneshot_stopped}(), tick_resume(). - */ - void (*set_mode)(enum clock_event_mode mode, struct clock_event_device *); int (*set_state_periodic)(struct clock_event_device *); int (*set_state_oneshot)(struct clock_event_device *); int (*set_state_oneshot_stopped)(struct clock_event_device *); diff --git a/include/linux/jump_label.h b/include/linux/jump_label.h index 7f653e8f6690..f1094238ab2a 100644 --- a/include/linux/jump_label.h +++ b/include/linux/jump_label.h @@ -21,8 +21,8 @@ * * DEFINE_STATIC_KEY_TRUE(key); * DEFINE_STATIC_KEY_FALSE(key); - * static_key_likely() - * statick_key_unlikely() + * static_branch_likely() + * static_branch_unlikely() * * Jump labels provide an interface to generate dynamic branches using * self-modifying code. Assuming toolchain and architecture support, if we @@ -45,12 +45,10 @@ * statement, setting the key to true requires us to patch in a jump * to the out-of-line of true branch. * - * In addtion to static_branch_{enable,disable}, we can also reference count + * In addition to static_branch_{enable,disable}, we can also reference count * the key or branch direction via static_branch_{inc,dec}. Thus, * static_branch_inc() can be thought of as a 'make more true' and - * static_branch_dec() as a 'make more false'. The inc()/dec() - * interface is meant to be used exclusively from the inc()/dec() for a given - * key. + * static_branch_dec() as a 'make more false'. * * Since this relies on modifying code, the branch modifying functions * must be considered absolute slow paths (machine wide synchronization etc.). 
diff --git a/include/linux/tick.h b/include/linux/tick.h index 48d901f83f92..e312219ff823 100644 --- a/include/linux/tick.h +++ b/include/linux/tick.h @@ -147,11 +147,20 @@ static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) cpumask_or(mask, mask, tick_nohz_full_mask); } +static inline int housekeeping_any_cpu(void) +{ + return cpumask_any_and(housekeeping_mask, cpu_online_mask); +} + extern void tick_nohz_full_kick(void); extern void tick_nohz_full_kick_cpu(int cpu); extern void tick_nohz_full_kick_all(void); extern void __tick_nohz_task_switch(void); #else +static inline int housekeeping_any_cpu(void) +{ + return smp_processor_id(); +} static inline bool tick_nohz_full_enabled(void) { return false; } static inline bool tick_nohz_full_cpu(int cpu) { return false; } static inline void tick_nohz_full_add_cpus_to(struct cpumask *mask) { } diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 337c8818541d..87e9ce6a63c5 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -289,7 +289,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) if (pv_enabled()) goto queue; - if (virt_queued_spin_lock(lock)) + if (virt_spin_lock(lock)) return; /* diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 3595403921bd..97d276ff1edb 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -621,18 +621,21 @@ int get_nohz_timer_target(void) int i, cpu = smp_processor_id(); struct sched_domain *sd; - if (!idle_cpu(cpu)) + if (!idle_cpu(cpu) && is_housekeeping_cpu(cpu)) return cpu; rcu_read_lock(); for_each_domain(cpu, sd) { for_each_cpu(i, sched_domain_span(sd)) { - if (!idle_cpu(i)) { + if (!idle_cpu(i) && is_housekeeping_cpu(cpu)) { cpu = i; goto unlock; } } } + + if (!is_housekeeping_cpu(cpu)) + cpu = housekeeping_any_cpu(); unlock: rcu_read_unlock(); return cpu; @@ -5178,24 +5181,47 @@ static void migrate_tasks(struct rq *dead_rq) break; /* - * Ensure rq->lock covers the entire task selection - * until the migration. + * pick_next_task assumes pinned rq->lock. */ lockdep_pin_lock(&rq->lock); next = pick_next_task(rq, &fake_task); BUG_ON(!next); next->sched_class->put_prev_task(rq, next); + /* + * Rules for changing task_struct::cpus_allowed are holding + * both pi_lock and rq->lock, such that holding either + * stabilizes the mask. + * + * Drop rq->lock is not quite as disastrous as it usually is + * because !cpu_active at this point, which means load-balance + * will not interfere. Also, stop-machine. + */ + lockdep_unpin_lock(&rq->lock); + raw_spin_unlock(&rq->lock); + raw_spin_lock(&next->pi_lock); + raw_spin_lock(&rq->lock); + + /* + * Since we're inside stop-machine, _nothing_ should have + * changed the task, WARN if weird stuff happened, because in + * that case the above rq->lock drop is a fail too. + */ + if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) { + raw_spin_unlock(&next->pi_lock); + continue; + } + /* Find suitable destination for @next, with force if needed. 
*/ dest_cpu = select_fallback_rq(dead_rq->cpu, next); - lockdep_unpin_lock(&rq->lock); rq = __migrate_task(rq, next, dest_cpu); if (rq != dead_rq) { raw_spin_unlock(&rq->lock); rq = dead_rq; raw_spin_lock(&rq->lock); } + raw_spin_unlock(&next->pi_lock); } rq->stop = stop; diff --git a/kernel/time/clockevents.c b/kernel/time/clockevents.c index 50eb107f1198..a9b76a40319e 100644 --- a/kernel/time/clockevents.c +++ b/kernel/time/clockevents.c @@ -97,20 +97,6 @@ EXPORT_SYMBOL_GPL(clockevent_delta2ns); static int __clockevents_switch_state(struct clock_event_device *dev, enum clock_event_state state) { - /* Transition with legacy set_mode() callback */ - if (dev->set_mode) { - /* Legacy callback doesn't support new modes */ - if (state > CLOCK_EVT_STATE_ONESHOT) - return -ENOSYS; - /* - * 'clock_event_state' and 'clock_event_mode' have 1-to-1 - * mapping until *_ONESHOT, and so a simple cast will work. - */ - dev->set_mode((enum clock_event_mode)state, dev); - dev->mode = (enum clock_event_mode)state; - return 0; - } - if (dev->features & CLOCK_EVT_FEAT_DUMMY) return 0; @@ -204,12 +190,8 @@ int clockevents_tick_resume(struct clock_event_device *dev) { int ret = 0; - if (dev->set_mode) { - dev->set_mode(CLOCK_EVT_MODE_RESUME, dev); - dev->mode = CLOCK_EVT_MODE_RESUME; - } else if (dev->tick_resume) { + if (dev->tick_resume) ret = dev->tick_resume(dev); - } return ret; } @@ -460,26 +442,6 @@ int clockevents_unbind_device(struct clock_event_device *ced, int cpu) } EXPORT_SYMBOL_GPL(clockevents_unbind_device); -/* Sanity check of state transition callbacks */ -static int clockevents_sanity_check(struct clock_event_device *dev) -{ - /* Legacy set_mode() callback */ - if (dev->set_mode) { - /* We shouldn't be supporting new modes now */ - WARN_ON(dev->set_state_periodic || dev->set_state_oneshot || - dev->set_state_shutdown || dev->tick_resume || - dev->set_state_oneshot_stopped); - - BUG_ON(dev->mode != CLOCK_EVT_MODE_UNUSED); - return 0; - } - - if (dev->features & CLOCK_EVT_FEAT_DUMMY) - return 0; - - return 0; -} - /** * clockevents_register_device - register a clock event device * @dev: device to register @@ -488,8 +450,6 @@ void clockevents_register_device(struct clock_event_device *dev) { unsigned long flags; - BUG_ON(clockevents_sanity_check(dev)); - /* Initialize state to DETACHED */ clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); diff --git a/kernel/time/tick-common.c b/kernel/time/tick-common.c index d11c55b6ab7d..4fcd99e12aa0 100644 --- a/kernel/time/tick-common.c +++ b/kernel/time/tick-common.c @@ -398,7 +398,6 @@ void tick_shutdown(unsigned int cpu) * the set mode function! */ clockevent_set_state(dev, CLOCK_EVT_STATE_DETACHED); - dev->mode = CLOCK_EVT_MODE_UNUSED; clockevents_exchange_device(dev, NULL); dev->event_handler = clockevents_handle_noop; td->evtdev = NULL; diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 3319e16f31e5..7c7ec4515983 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -290,16 +290,17 @@ static int __init tick_nohz_full_setup(char *str) __setup("nohz_full=", tick_nohz_full_setup); static int tick_nohz_cpu_down_callback(struct notifier_block *nfb, - unsigned long action, - void *hcpu) + unsigned long action, + void *hcpu) { unsigned int cpu = (unsigned long)hcpu; switch (action & ~CPU_TASKS_FROZEN) { case CPU_DOWN_PREPARE: /* - * If we handle the timekeeping duty for full dynticks CPUs, - * we can't safely shutdown that CPU. + * The boot CPU handles housekeeping duty (unbound timers, + * workqueues, timekeeping, ...) 
on behalf of full dynticks + * CPUs. It must remain online when nohz full is enabled. */ if (tick_nohz_full_running && tick_do_timer_cpu == cpu) return NOTIFY_BAD; @@ -370,6 +371,12 @@ void __init tick_nohz_init(void) cpu_notifier(tick_nohz_cpu_down_callback, 0); pr_info("NO_HZ: Full dynticks CPUs: %*pbl.\n", cpumask_pr_args(tick_nohz_full_mask)); + + /* + * We need at least one CPU to handle housekeeping work such + * as timekeeping, unbound timers, workqueues, ... + */ + WARN_ON_ONCE(cpumask_empty(housekeeping_mask)); } #endif diff --git a/kernel/time/timekeeping.c b/kernel/time/timekeeping.c index f6ee2e6b6f5d..3739ac6aa473 100644 --- a/kernel/time/timekeeping.c +++ b/kernel/time/timekeeping.c @@ -1614,7 +1614,7 @@ static __always_inline void timekeeping_freqadjust(struct timekeeper *tk, negative = (tick_error < 0); /* Sort out the magnitude of the correction */ - tick_error = abs(tick_error); + tick_error = abs64(tick_error); for (adj = 0; tick_error > interval; adj++) tick_error >>= 1; diff --git a/kernel/time/timer_list.c b/kernel/time/timer_list.c index 129c96033e46..f75e35b60149 100644 --- a/kernel/time/timer_list.c +++ b/kernel/time/timer_list.c @@ -225,7 +225,7 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) (unsigned long long) dev->min_delta_ns); SEQ_printf(m, " mult: %u\n", dev->mult); SEQ_printf(m, " shift: %u\n", dev->shift); - SEQ_printf(m, " mode: %d\n", dev->mode); + SEQ_printf(m, " mode: %d\n", clockevent_get_state(dev)); SEQ_printf(m, " next_event: %Ld nsecs\n", (unsigned long long) ktime_to_ns(dev->next_event)); @@ -233,40 +233,34 @@ print_tickdevice(struct seq_file *m, struct tick_device *td, int cpu) print_name_offset(m, dev->set_next_event); SEQ_printf(m, "\n"); - if (dev->set_mode) { - SEQ_printf(m, " set_mode: "); - print_name_offset(m, dev->set_mode); + if (dev->set_state_shutdown) { + SEQ_printf(m, " shutdown: "); + print_name_offset(m, dev->set_state_shutdown); SEQ_printf(m, "\n"); - } else { - if (dev->set_state_shutdown) { - SEQ_printf(m, " shutdown: "); - print_name_offset(m, dev->set_state_shutdown); - SEQ_printf(m, "\n"); - } + } - if (dev->set_state_periodic) { - SEQ_printf(m, " periodic: "); - print_name_offset(m, dev->set_state_periodic); - SEQ_printf(m, "\n"); - } + if (dev->set_state_periodic) { + SEQ_printf(m, " periodic: "); + print_name_offset(m, dev->set_state_periodic); + SEQ_printf(m, "\n"); + } - if (dev->set_state_oneshot) { - SEQ_printf(m, " oneshot: "); - print_name_offset(m, dev->set_state_oneshot); - SEQ_printf(m, "\n"); - } + if (dev->set_state_oneshot) { + SEQ_printf(m, " oneshot: "); + print_name_offset(m, dev->set_state_oneshot); + SEQ_printf(m, "\n"); + } - if (dev->set_state_oneshot_stopped) { - SEQ_printf(m, " oneshot stopped: "); - print_name_offset(m, dev->set_state_oneshot_stopped); - SEQ_printf(m, "\n"); - } + if (dev->set_state_oneshot_stopped) { + SEQ_printf(m, " oneshot stopped: "); + print_name_offset(m, dev->set_state_oneshot_stopped); + SEQ_printf(m, "\n"); + } - if (dev->tick_resume) { - SEQ_printf(m, " resume: "); - print_name_offset(m, dev->tick_resume); - SEQ_printf(m, "\n"); - } + if (dev->tick_resume) { + SEQ_printf(m, " resume: "); + print_name_offset(m, dev->tick_resume); + SEQ_printf(m, "\n"); } SEQ_printf(m, " event_handler: "); diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c index 525f454f7531..b9b0e3b5da49 100644 --- a/net/ceph/messenger.c +++ b/net/ceph/messenger.c @@ -1353,11 +1353,12 @@ static void prepare_write_keepalive(struct ceph_connection *con) 
dout("prepare_write_keepalive %p\n", con); con_out_kvec_reset(con); if (con->peer_features & CEPH_FEATURE_MSGR_KEEPALIVE2) { - struct timespec ts = CURRENT_TIME; - struct ceph_timespec ceph_ts; - ceph_encode_timespec(&ceph_ts, &ts); + struct timespec now = CURRENT_TIME; + con_out_kvec_add(con, sizeof(tag_keepalive2), &tag_keepalive2); - con_out_kvec_add(con, sizeof(ceph_ts), &ceph_ts); + ceph_encode_timespec(&con->out_temp_keepalive2, &now); + con_out_kvec_add(con, sizeof(con->out_temp_keepalive2), + &con->out_temp_keepalive2); } else { con_out_kvec_add(con, sizeof(tag_keepalive), &tag_keepalive); } diff --git a/security/device_cgroup.c b/security/device_cgroup.c index 73455089feef..03c1652c9a1f 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -401,7 +401,7 @@ static bool verify_new_ex(struct dev_cgroup *dev_cgroup, bool match = false; RCU_LOCKDEP_WARN(!rcu_read_lock_held() && - lockdep_is_held(&devcgroup_mutex), + !lockdep_is_held(&devcgroup_mutex), "device_cgroup:verify_new_ex called without proper synchronization"); if (dev_cgroup->behavior == DEVCG_DEFAULT_ALLOW) { diff --git a/tools/perf/builtin-script.c b/tools/perf/builtin-script.c index eb51325e8ad9..284a76e04628 100644 --- a/tools/perf/builtin-script.c +++ b/tools/perf/builtin-script.c @@ -768,8 +768,8 @@ static int process_exit_event(struct perf_tool *tool, if (!evsel->attr.sample_id_all) { sample->cpu = 0; sample->time = 0; - sample->tid = event->comm.tid; - sample->pid = event->comm.pid; + sample->tid = event->fork.tid; + sample->pid = event->fork.pid; } print_sample_start(sample, thread, evsel); perf_event__fprintf(event, stdout); diff --git a/tools/perf/tests/sw-clock.c b/tools/perf/tests/sw-clock.c index 1aa21c90731b..5b83f56a3b6f 100644 --- a/tools/perf/tests/sw-clock.c +++ b/tools/perf/tests/sw-clock.c @@ -34,6 +34,8 @@ static int __test__sw_clock_freq(enum perf_sw_ids clock_id) .disabled = 1, .freq = 1, }; + struct cpu_map *cpus; + struct thread_map *threads; attr.sample_freq = 500; @@ -50,14 +52,19 @@ static int __test__sw_clock_freq(enum perf_sw_ids clock_id) } perf_evlist__add(evlist, evsel); - evlist->cpus = cpu_map__dummy_new(); - evlist->threads = thread_map__new_by_tid(getpid()); - if (!evlist->cpus || !evlist->threads) { + cpus = cpu_map__dummy_new(); + threads = thread_map__new_by_tid(getpid()); + if (!cpus || !threads) { err = -ENOMEM; pr_debug("Not enough memory to create thread/cpu maps\n"); - goto out_delete_evlist; + goto out_free_maps; } + perf_evlist__set_maps(evlist, cpus, threads); + + cpus = NULL; + threads = NULL; + if (perf_evlist__open(evlist)) { const char *knob = "/proc/sys/kernel/perf_event_max_sample_rate"; @@ -107,6 +114,9 @@ next_event: err = -1; } +out_free_maps: + cpu_map__put(cpus); + thread_map__put(threads); out_delete_evlist: perf_evlist__delete(evlist); return err; diff --git a/tools/perf/tests/task-exit.c b/tools/perf/tests/task-exit.c index 3a8fedef83bc..add16385f13e 100644 --- a/tools/perf/tests/task-exit.c +++ b/tools/perf/tests/task-exit.c @@ -43,6 +43,8 @@ int test__task_exit(void) }; const char *argv[] = { "true", NULL }; char sbuf[STRERR_BUFSIZE]; + struct cpu_map *cpus; + struct thread_map *threads; signal(SIGCHLD, sig_handler); @@ -58,14 +60,19 @@ int test__task_exit(void) * perf_evlist__prepare_workload we'll fill in the only thread * we're monitoring, the one forked there. 
*/ - evlist->cpus = cpu_map__dummy_new(); - evlist->threads = thread_map__new_by_tid(-1); - if (!evlist->cpus || !evlist->threads) { + cpus = cpu_map__dummy_new(); + threads = thread_map__new_by_tid(-1); + if (!cpus || !threads) { err = -ENOMEM; pr_debug("Not enough memory to create thread/cpu maps\n"); - goto out_delete_evlist; + goto out_free_maps; } + perf_evlist__set_maps(evlist, cpus, threads); + + cpus = NULL; + threads = NULL; + err = perf_evlist__prepare_workload(evlist, &target, argv, false, workload_exec_failed_signal); if (err < 0) { @@ -114,6 +121,9 @@ retry: err = -1; } +out_free_maps: + cpu_map__put(cpus); + thread_map__put(threads); out_delete_evlist: perf_evlist__delete(evlist); return err; diff --git a/tools/perf/ui/browsers/hists.c b/tools/perf/ui/browsers/hists.c index cf86f2d3a5e7..c04c60d4863c 100644 --- a/tools/perf/ui/browsers/hists.c +++ b/tools/perf/ui/browsers/hists.c @@ -1968,7 +1968,8 @@ skip_annotation: &options[nr_options], dso); nr_options += add_map_opt(browser, &actions[nr_options], &options[nr_options], - browser->selection->map); + browser->selection ? + browser->selection->map : NULL); /* perf script support */ if (browser->he_selection) { @@ -1976,6 +1977,15 @@ skip_annotation: &actions[nr_options], &options[nr_options], thread, NULL); + /* + * Note that browser->selection != NULL + * when browser->he_selection is not NULL, + * so we don't need to check browser->selection + * before fetching browser->selection->sym like what + * we do before fetching browser->selection->map. + * + * See hist_browser__show_entry. + */ nr_options += add_script_opt(browser, &actions[nr_options], &options[nr_options], diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index d51a5200c8af..c8fc8a258f42 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -124,6 +124,33 @@ void perf_evlist__delete(struct perf_evlist *evlist) free(evlist); } +static void __perf_evlist__propagate_maps(struct perf_evlist *evlist, + struct perf_evsel *evsel) +{ + /* + * We already have cpus for evsel (via PMU sysfs) so + * keep it, if there's no target cpu list defined. 
+ */ + if (!evsel->own_cpus || evlist->has_user_cpus) { + cpu_map__put(evsel->cpus); + evsel->cpus = cpu_map__get(evlist->cpus); + } else if (evsel->cpus != evsel->own_cpus) { + cpu_map__put(evsel->cpus); + evsel->cpus = cpu_map__get(evsel->own_cpus); + } + + thread_map__put(evsel->threads); + evsel->threads = thread_map__get(evlist->threads); +} + +static void perf_evlist__propagate_maps(struct perf_evlist *evlist) +{ + struct perf_evsel *evsel; + + evlist__for_each(evlist, evsel) + __perf_evlist__propagate_maps(evlist, evsel); +} + void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry) { entry->evlist = evlist; @@ -133,18 +160,19 @@ void perf_evlist__add(struct perf_evlist *evlist, struct perf_evsel *entry) if (!evlist->nr_entries++) perf_evlist__set_id_pos(evlist); + + __perf_evlist__propagate_maps(evlist, entry); } void perf_evlist__splice_list_tail(struct perf_evlist *evlist, - struct list_head *list, - int nr_entries) + struct list_head *list) { - bool set_id_pos = !evlist->nr_entries; + struct perf_evsel *evsel, *temp; - list_splice_tail(list, &evlist->entries); - evlist->nr_entries += nr_entries; - if (set_id_pos) - perf_evlist__set_id_pos(evlist); + __evlist__for_each_safe(list, temp, evsel) { + list_del_init(&evsel->node); + perf_evlist__add(evlist, evsel); + } } void __perf_evlist__set_leader(struct list_head *list) @@ -210,7 +238,7 @@ static int perf_evlist__add_attrs(struct perf_evlist *evlist, list_add_tail(&evsel->node, &head); } - perf_evlist__splice_list_tail(evlist, &head, nr_attrs); + perf_evlist__splice_list_tail(evlist, &head); return 0; @@ -1103,71 +1131,56 @@ int perf_evlist__mmap(struct perf_evlist *evlist, unsigned int pages, return perf_evlist__mmap_ex(evlist, pages, overwrite, 0, false); } -static int perf_evlist__propagate_maps(struct perf_evlist *evlist, - bool has_user_cpus) -{ - struct perf_evsel *evsel; - - evlist__for_each(evlist, evsel) { - /* - * We already have cpus for evsel (via PMU sysfs) so - * keep it, if there's no target cpu list defined. 
- */ - if (evsel->cpus && has_user_cpus) - cpu_map__put(evsel->cpus); - - if (!evsel->cpus || has_user_cpus) - evsel->cpus = cpu_map__get(evlist->cpus); - - evsel->threads = thread_map__get(evlist->threads); - - if ((evlist->cpus && !evsel->cpus) || - (evlist->threads && !evsel->threads)) - return -ENOMEM; - } - - return 0; -} - int perf_evlist__create_maps(struct perf_evlist *evlist, struct target *target) { - evlist->threads = thread_map__new_str(target->pid, target->tid, - target->uid); + struct cpu_map *cpus; + struct thread_map *threads; + + threads = thread_map__new_str(target->pid, target->tid, target->uid); - if (evlist->threads == NULL) + if (!threads) return -1; if (target__uses_dummy_map(target)) - evlist->cpus = cpu_map__dummy_new(); + cpus = cpu_map__dummy_new(); else - evlist->cpus = cpu_map__new(target->cpu_list); + cpus = cpu_map__new(target->cpu_list); - if (evlist->cpus == NULL) + if (!cpus) goto out_delete_threads; - return perf_evlist__propagate_maps(evlist, !!target->cpu_list); + evlist->has_user_cpus = !!target->cpu_list; + + perf_evlist__set_maps(evlist, cpus, threads); + + return 0; out_delete_threads: - thread_map__put(evlist->threads); - evlist->threads = NULL; + thread_map__put(threads); return -1; } -int perf_evlist__set_maps(struct perf_evlist *evlist, - struct cpu_map *cpus, - struct thread_map *threads) +void perf_evlist__set_maps(struct perf_evlist *evlist, struct cpu_map *cpus, + struct thread_map *threads) { - if (evlist->cpus) + /* + * Allow for the possibility that one or another of the maps isn't being + * changed i.e. don't put it. Note we are assuming the maps that are + * being applied are brand new and evlist is taking ownership of the + * original reference count of 1. If that is not the case it is up to + * the caller to increase the reference count. 
+ */ + if (cpus != evlist->cpus) { cpu_map__put(evlist->cpus); + evlist->cpus = cpus; + } - evlist->cpus = cpus; - - if (evlist->threads) + if (threads != evlist->threads) { thread_map__put(evlist->threads); + evlist->threads = threads; + } - evlist->threads = threads; - - return perf_evlist__propagate_maps(evlist, false); + perf_evlist__propagate_maps(evlist); } int perf_evlist__apply_filters(struct perf_evlist *evlist, struct perf_evsel **err_evsel) @@ -1387,6 +1400,8 @@ void perf_evlist__close(struct perf_evlist *evlist) static int perf_evlist__create_syswide_maps(struct perf_evlist *evlist) { + struct cpu_map *cpus; + struct thread_map *threads; int err = -ENOMEM; /* @@ -1398,20 +1413,19 @@ static int perf_evlist__create_syswide_maps(struct perf_evlist *evlist) * error, and we may not want to do that fallback to a * default cpu identity map :-\ */ - evlist->cpus = cpu_map__new(NULL); - if (evlist->cpus == NULL) + cpus = cpu_map__new(NULL); + if (!cpus) goto out; - evlist->threads = thread_map__new_dummy(); - if (evlist->threads == NULL) - goto out_free_cpus; + threads = thread_map__new_dummy(); + if (!threads) + goto out_put; - err = 0; + perf_evlist__set_maps(evlist, cpus, threads); out: return err; -out_free_cpus: - cpu_map__put(evlist->cpus); - evlist->cpus = NULL; +out_put: + cpu_map__put(cpus); goto out; } diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index b39a6198f4ac..115d8b53c601 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -42,6 +42,7 @@ struct perf_evlist { int nr_mmaps; bool overwrite; bool enabled; + bool has_user_cpus; size_t mmap_len; int id_pos; int is_pos; @@ -155,9 +156,8 @@ int perf_evlist__enable_event_idx(struct perf_evlist *evlist, void perf_evlist__set_selected(struct perf_evlist *evlist, struct perf_evsel *evsel); -int perf_evlist__set_maps(struct perf_evlist *evlist, - struct cpu_map *cpus, - struct thread_map *threads); +void perf_evlist__set_maps(struct perf_evlist *evlist, struct cpu_map *cpus, + struct thread_map *threads); int perf_evlist__create_maps(struct perf_evlist *evlist, struct target *target); int perf_evlist__apply_filters(struct perf_evlist *evlist, struct perf_evsel **err_evsel); @@ -179,8 +179,7 @@ bool perf_evlist__valid_sample_id_all(struct perf_evlist *evlist); bool perf_evlist__valid_read_format(struct perf_evlist *evlist); void perf_evlist__splice_list_tail(struct perf_evlist *evlist, - struct list_head *list, - int nr_entries); + struct list_head *list); static inline struct perf_evsel *perf_evlist__first(struct perf_evlist *evlist) { diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index c53f79123b37..5410483d5219 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -1033,6 +1033,7 @@ void perf_evsel__exit(struct perf_evsel *evsel) perf_evsel__free_config_terms(evsel); close_cgroup(evsel->cgrp); cpu_map__put(evsel->cpus); + cpu_map__put(evsel->own_cpus); thread_map__put(evsel->threads); zfree(&evsel->group_name); zfree(&evsel->name); diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index 298e6bbca200..ef8925f7211a 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -98,6 +98,7 @@ struct perf_evsel { struct cgroup_sel *cgrp; void *handler; struct cpu_map *cpus; + struct cpu_map *own_cpus; struct thread_map *threads; unsigned int sample_size; int id_pos; diff --git a/tools/perf/util/header.c b/tools/perf/util/header.c index 41814547da15..fce6634aebe2 100644 --- a/tools/perf/util/header.c +++ b/tools/perf/util/header.c @@ -1438,7 
+1438,7 @@ static int process_nrcpus(struct perf_file_section *section __maybe_unused, if (ph->needs_swap) nr = bswap_32(nr); - ph->env.nr_cpus_online = nr; + ph->env.nr_cpus_avail = nr; ret = readn(fd, &nr, sizeof(nr)); if (ret != sizeof(nr)) @@ -1447,7 +1447,7 @@ static int process_nrcpus(struct perf_file_section *section __maybe_unused, if (ph->needs_swap) nr = bswap_32(nr); - ph->env.nr_cpus_avail = nr; + ph->env.nr_cpus_online = nr; return 0; } diff --git a/tools/perf/util/intel-bts.c b/tools/perf/util/intel-bts.c index ea768625ab5b..eb0e7f8bf515 100644 --- a/tools/perf/util/intel-bts.c +++ b/tools/perf/util/intel-bts.c @@ -623,7 +623,7 @@ static int intel_bts_process_event(struct perf_session *session, if (err) return err; if (event->header.type == PERF_RECORD_EXIT) { - err = intel_bts_process_tid_exit(bts, event->comm.tid); + err = intel_bts_process_tid_exit(bts, event->fork.tid); if (err) return err; } diff --git a/tools/perf/util/intel-pt.c b/tools/perf/util/intel-pt.c index bb41c20e6005..535d86f8e4d1 100644 --- a/tools/perf/util/intel-pt.c +++ b/tools/perf/util/intel-pt.c @@ -1494,7 +1494,7 @@ static int intel_pt_process_event(struct perf_session *session, if (pt->timeless_decoding) { if (event->header.type == PERF_RECORD_EXIT) { err = intel_pt_process_timeless_queues(pt, - event->comm.tid, + event->fork.tid, sample->time); } } else if (timestamp) { diff --git a/tools/perf/util/parse-events.c b/tools/perf/util/parse-events.c index d826e6f515db..21ed6ee63da9 100644 --- a/tools/perf/util/parse-events.c +++ b/tools/perf/util/parse-events.c @@ -287,8 +287,8 @@ __add_event(struct list_head *list, int *idx, if (!evsel) return NULL; - if (cpus) - evsel->cpus = cpu_map__get(cpus); + evsel->cpus = cpu_map__get(cpus); + evsel->own_cpus = cpu_map__get(cpus); if (name) evsel->name = strdup(name); @@ -1140,10 +1140,9 @@ int parse_events(struct perf_evlist *evlist, const char *str, ret = parse_events__scanner(str, &data, PE_START_EVENTS); perf_pmu__parse_cleanup(); if (!ret) { - int entries = data.idx - evlist->nr_entries; struct perf_evsel *last; - perf_evlist__splice_list_tail(evlist, &data.list, entries); + perf_evlist__splice_list_tail(evlist, &data.list); evlist->nr_groups += data.nr_groups; last = perf_evlist__last(evlist); last->cmdline_group_boundary = true; diff --git a/tools/perf/util/parse-events.y b/tools/perf/util/parse-events.y index 591905a02b92..9cd70819c795 100644 --- a/tools/perf/util/parse-events.y +++ b/tools/perf/util/parse-events.y @@ -255,7 +255,7 @@ PE_PMU_EVENT_PRE '-' PE_PMU_EVENT_SUF sep_dc list_add_tail(&term->list, head); ALLOC_LIST(list); - ABORT_ON(parse_events_add_pmu(list, &data->idx, "cpu", head)); + ABORT_ON(parse_events_add_pmu(data, list, "cpu", head)); parse_events__free_terms(head); $$ = list; } diff --git a/tools/testing/selftests/x86/entry_from_vm86.c b/tools/testing/selftests/x86/entry_from_vm86.c index 9a43a59a9bb4..421c607a8856 100644 --- a/tools/testing/selftests/x86/entry_from_vm86.c +++ b/tools/testing/selftests/x86/entry_from_vm86.c @@ -116,8 +116,9 @@ static bool do_test(struct vm86plus_struct *v86, unsigned long eip, v86->regs.eip = eip; ret = vm86(VM86_ENTER, v86); - if (ret == -1 && errno == ENOSYS) { - printf("[SKIP]\tvm86 not supported\n"); + if (ret == -1 && (errno == ENOSYS || errno == EPERM)) { + printf("[SKIP]\tvm86 %s\n", + errno == ENOSYS ? "not supported" : "not allowed"); return false; } |
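Note on the static-keys rename above (Documentation/static-keys.txt, include/linux/jump_label.h): a minimal usage sketch of the corrected static_branch_likely()/static_branch_unlikely() API, assuming a kernel build environment. The key name and the trace_slow_path()/fast_path() call sites are hypothetical, not taken from this diff.

#include <linux/jump_label.h>

/* Key defaults to false; the branch below is a patched NOP until enabled. */
DEFINE_STATIC_KEY_FALSE(tracing_enabled);

static void do_work(void)
{
	if (static_branch_unlikely(&tracing_enabled))
		trace_slow_path();	/* hypothetical out-of-line slow path */
	fast_path();			/* hypothetical common fast path */
}

/* Flipping the branch patches code at runtime; treat it as a slow path. */
static void enable_tracing(void)
{
	static_branch_enable(&tracing_enabled);
}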
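Note on the CPU-notifier changes in arch/arm64/kernel/debug-monitors.c and hw_breakpoint.c: masking out CPU_TASKS_FROZEN makes the handler match both CPU_ONLINE and CPU_ONLINE_FROZEN, so per-CPU state is also reinitialised when CPUs come back online during suspend/resume. A sketch of the pattern; reinit_per_cpu_state() is a hypothetical callback, not part of this diff.

static int example_cpu_notify(struct notifier_block *self,
			      unsigned long action, void *data)
{
	int cpu = (unsigned long)data;

	/* Strip the FROZEN bit so CPU_ONLINE_FROZEN is handled as well. */
	switch (action & ~CPU_TASKS_FROZEN) {
	case CPU_ONLINE:
		smp_call_function_single(cpu, reinit_per_cpu_state, NULL, 1);
		break;
	}
	return NOTIFY_OK;
}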
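Note on the arm64 pgtable.h dirty-bit change: the updated comment documents the dirty test as PTE_DIRTY || (PTE_WRITE && !PTE_RDONLY). A sketch of how that test composes, assuming a pte_sw_dirty()/pte_dirty() helper pair; only pte_hw_dirty() appears verbatim in the diff, the rest is illustrative.

#define pte_sw_dirty(pte)	(!!(pte_val(pte) & PTE_DIRTY))

#ifdef CONFIG_ARM64_HW_AFDBM
/* Hardware DBM: writable and no longer read-only means the page was dirtied. */
#define pte_hw_dirty(pte)	(pte_write(pte) && !(pte_val(pte) & PTE_RDONLY))
#else
#define pte_hw_dirty(pte)	(0)
#endif

#define pte_dirty(pte)		(pte_sw_dirty(pte) || pte_hw_dirty(pte))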