From 64be4c1c2428e148de6081af235e2418e6a66dda Mon Sep 17 00:00:00 2001 From: Daniel J Blueman Date: Mon, 5 Dec 2011 16:20:37 +0800 Subject: x86: Add x86_init platform override to fix up NUMA core numbering Add an x86_init vector for handling inconsistent core numbering. This is useful for multi-fabric platforms, such as Numascale NumaConnect. v2: - use struct x86_cpuinit_ops - provide default fall-back function to warn Signed-off-by: Daniel J Blueman Cc: Steffen Persvold Cc: Jesse Barnes Link: http://lkml.kernel.org/r/1323073238-32686-2-git-send-email-daniel@numascale-asia.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 9 +++++++++ 1 file changed, 9 insertions(+) (limited to 'arch/x86/kernel/cpu/common.c') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index aa003b13a831..ad4da45effb9 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1140,6 +1140,15 @@ static void dbg_restore_debug_regs(void) #define dbg_restore_debug_regs() #endif /* ! CONFIG_KGDB */ +/* + * Prints an error where the NUMA and configured core-number mismatch and the + * platform didn't override this to fix it up + */ +void __cpuinit x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node) +{ + pr_err("NUMA core number %d differs from configured core number %d\n", node, c->phys_proc_id); +} + /* * cpu_init() initializes state that is per-CPU. Some data is already * initialized (naturally) in the bootstrap process, such as the GDT -- cgit v1.2.3 From e4a02b4a951a7adf9d982b11c64686570c29fbe7 Mon Sep 17 00:00:00 2001 From: Steffen Persvold Date: Tue, 6 Dec 2011 01:10:31 +0100 Subject: x86: Fix the !CONFIG_NUMA build of the new CPU ID fixup code support I used "ifdef CONFIG_NUMA" simply because it doesn't make sense in a non-numa configuration even with SMP enabled. Besides, the only place where it is called right now is in kernel/cpu/amd.c:srat_detect_node() within the "CONFIG_NUMA" protected part. Signed-off-by: Steffen Persvold Cc: Daniel J Blueman Cc: Jesse Barnes Link: http://lkml.kernel.org/r/1323073238-32686-2-git-send-email-daniel@numascale-asia.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/cpu/common.c | 2 ++ 1 file changed, 2 insertions(+) (limited to 'arch/x86/kernel/cpu/common.c') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index ad4da45effb9..a70bd5b96b9e 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1146,7 +1146,9 @@ static void dbg_restore_debug_regs(void) */ void __cpuinit x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node) { +#ifdef CONFIG_NUMA pr_err("NUMA core number %d differs from configured core number %d\n", node, c->phys_proc_id); +#endif } /* -- cgit v1.2.3 From 141168c36cdee3ff23d9c7700b0edc47cb65479f Mon Sep 17 00:00:00 2001 From: Kevin Winchester Date: Tue, 20 Dec 2011 20:52:22 -0400 Subject: x86: Simplify code by removing a !SMP #ifdefs from 'struct cpuinfo_x86' Several fields in struct cpuinfo_x86 were not defined for the !SMP case, likely to save space. However, those fields still have some meaning for UP, and keeping them allows some #ifdef removal from other files. The additional size of the UP kernel from this change is not significant enough to worry about keeping up the distinction: text data bss dec hex filename 4737168 506459 972040 6215667 5ed7f3 vmlinux.o.before 4737444 506459 972040 6215943 5ed907 vmlinux.o.after for a difference of 276 bytes for an example UP config. If someone wants those 276 bytes back badly then it should be implemented in a cleaner way. Signed-off-by: Kevin Winchester Cc: Steffen Persvold Link: http://lkml.kernel.org/r/1324428742-12498-1-git-send-email-kjwinchester@gmail.com Signed-off-by: Ingo Molnar --- arch/x86/include/asm/processor.h | 2 -- arch/x86/kernel/amd_nb.c | 8 ++------ arch/x86/kernel/cpu/amd.c | 2 -- arch/x86/kernel/cpu/common.c | 7 ------- arch/x86/kernel/cpu/intel.c | 2 -- arch/x86/kernel/cpu/mcheck/mce.c | 2 -- arch/x86/kernel/cpu/mcheck/mce_amd.c | 7 +------ arch/x86/kernel/cpu/proc.c | 4 +--- drivers/edac/sb_edac.c | 2 -- drivers/hwmon/coretemp.c | 7 +++---- 10 files changed, 7 insertions(+), 36 deletions(-) (limited to 'arch/x86/kernel/cpu/common.c') diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index b650435ffb53..aa9088c26931 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -99,7 +99,6 @@ struct cpuinfo_x86 { u16 apicid; u16 initial_apicid; u16 x86_clflush_size; -#ifdef CONFIG_SMP /* number of cores as seen by the OS: */ u16 booted_cores; /* Physical processor id: */ @@ -110,7 +109,6 @@ struct cpuinfo_x86 { u8 compute_unit_id; /* Index into per_cpu list: */ u16 cpu_index; -#endif u32 microcode; } __attribute__((__aligned__(SMP_CACHE_BYTES))); diff --git a/arch/x86/kernel/amd_nb.c b/arch/x86/kernel/amd_nb.c index 4c39baa8facc..013c1810ce72 100644 --- a/arch/x86/kernel/amd_nb.c +++ b/arch/x86/kernel/amd_nb.c @@ -123,16 +123,14 @@ int amd_get_subcaches(int cpu) { struct pci_dev *link = node_to_amd_nb(amd_get_nb_id(cpu))->link; unsigned int mask; - int cuid = 0; + int cuid; if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING)) return 0; pci_read_config_dword(link, 0x1d4, &mask); -#ifdef CONFIG_SMP cuid = cpu_data(cpu).compute_unit_id; -#endif return (mask >> (4 * cuid)) & 0xf; } @@ -141,7 +139,7 @@ int amd_set_subcaches(int cpu, int mask) static unsigned int reset, ban; struct amd_northbridge *nb = node_to_amd_nb(amd_get_nb_id(cpu)); unsigned int reg; - int cuid = 0; + int cuid; if (!amd_nb_has_feature(AMD_NB_L3_PARTITIONING) || mask > 0xf) return -EINVAL; @@ -159,9 +157,7 @@ int amd_set_subcaches(int cpu, int mask) pci_write_config_dword(nb->misc, 0x1b8, reg & ~0x180000); } -#ifdef CONFIG_SMP cuid = cpu_data(cpu).compute_unit_id; -#endif mask <<= 4 * cuid; mask |= (0xf ^ (1 << cuid)) << 26; diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index ef21bdccd674..f4773f4aae35 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -148,7 +148,6 @@ static void __cpuinit init_amd_k6(struct cpuinfo_x86 *c) static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) { -#ifdef CONFIG_SMP /* calling is from identify_secondary_cpu() ? */ if (!c->cpu_index) return; @@ -192,7 +191,6 @@ static void __cpuinit amd_k7_smp_check(struct cpuinfo_x86 *c) valid_k7: ; -#endif } static void __cpuinit init_amd_k7(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index a70bd5b96b9e..850f2963a420 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -676,9 +676,7 @@ static void __init early_identify_cpu(struct cpuinfo_x86 *c) if (this_cpu->c_early_init) this_cpu->c_early_init(c); -#ifdef CONFIG_SMP c->cpu_index = 0; -#endif filter_cpuid_features(c, false); setup_smep(c); @@ -764,10 +762,7 @@ static void __cpuinit generic_identify(struct cpuinfo_x86 *c) c->apicid = c->initial_apicid; # endif #endif - -#ifdef CONFIG_X86_HT c->phys_proc_id = c->initial_apicid; -#endif } setup_smep(c); @@ -1146,9 +1141,7 @@ static void dbg_restore_debug_regs(void) */ void __cpuinit x86_default_fixup_cpu_id(struct cpuinfo_x86 *c, int node) { -#ifdef CONFIG_NUMA pr_err("NUMA core number %d differs from configured core number %d\n", node, c->phys_proc_id); -#endif } /* diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c index 523131213f08..3e6ff6cbf42a 100644 --- a/arch/x86/kernel/cpu/intel.c +++ b/arch/x86/kernel/cpu/intel.c @@ -181,7 +181,6 @@ static void __cpuinit trap_init_f00f_bug(void) static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) { -#ifdef CONFIG_SMP /* calling is from identify_secondary_cpu() ? */ if (!c->cpu_index) return; @@ -198,7 +197,6 @@ static void __cpuinit intel_smp_check(struct cpuinfo_x86 *c) WARN_ONCE(1, "WARNING: SMP operation may be unreliable" "with B stepping processors.\n"); } -#endif } static void __cpuinit intel_workarounds(struct cpuinfo_x86 *c) diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c index 2af127d4c3d1..e9c9d0aab36a 100644 --- a/arch/x86/kernel/cpu/mcheck/mce.c +++ b/arch/x86/kernel/cpu/mcheck/mce.c @@ -119,9 +119,7 @@ void mce_setup(struct mce *m) m->time = get_seconds(); m->cpuvendor = boot_cpu_data.x86_vendor; m->cpuid = cpuid_eax(1); -#ifdef CONFIG_SMP m->socketid = cpu_data(m->extcpu).phys_proc_id; -#endif m->apicid = cpu_data(m->extcpu).initial_apicid; rdmsrl(MSR_IA32_MCG_CAP, m->mcgcap); } diff --git a/arch/x86/kernel/cpu/mcheck/mce_amd.c b/arch/x86/kernel/cpu/mcheck/mce_amd.c index f5474218cffe..1d76872b6a45 100644 --- a/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ b/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -64,11 +64,9 @@ struct threshold_bank { }; static DEFINE_PER_CPU(struct threshold_bank * [NR_BANKS], threshold_banks); -#ifdef CONFIG_SMP static unsigned char shared_bank[NR_BANKS] = { 0, 0, 0, 0, 1 }; -#endif static DEFINE_PER_CPU(unsigned char, bank_map); /* see which banks are on */ @@ -202,10 +200,9 @@ void mce_amd_feature_init(struct cpuinfo_x86 *c) if (!block) per_cpu(bank_map, cpu) |= (1 << bank); -#ifdef CONFIG_SMP if (shared_bank[bank] && c->cpu_core_id) break; -#endif + offset = setup_APIC_mce(offset, (high & MASK_LVTOFF_HI) >> 20); @@ -531,7 +528,6 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) sprintf(name, "threshold_bank%i", bank); -#ifdef CONFIG_SMP if (cpu_data(cpu).cpu_core_id && shared_bank[bank]) { /* symlink */ i = cpumask_first(cpu_llc_shared_mask(cpu)); @@ -558,7 +554,6 @@ static __cpuinit int threshold_create_bank(unsigned int cpu, unsigned int bank) goto out; } -#endif b = kzalloc(sizeof(struct threshold_bank), GFP_KERNEL); if (!b) { diff --git a/arch/x86/kernel/cpu/proc.c b/arch/x86/kernel/cpu/proc.c index 14b23140e81f..8022c6681485 100644 --- a/arch/x86/kernel/cpu/proc.c +++ b/arch/x86/kernel/cpu/proc.c @@ -64,12 +64,10 @@ static void show_cpuinfo_misc(struct seq_file *m, struct cpuinfo_x86 *c) static int show_cpuinfo(struct seq_file *m, void *v) { struct cpuinfo_x86 *c = v; - unsigned int cpu = 0; + unsigned int cpu; int i; -#ifdef CONFIG_SMP cpu = c->cpu_index; -#endif seq_printf(m, "processor\t: %u\n" "vendor_id\t: %s\n" "cpu family\t: %d\n" diff --git a/drivers/edac/sb_edac.c b/drivers/edac/sb_edac.c index 7a402bfbee7d..88df48956c1b 100644 --- a/drivers/edac/sb_edac.c +++ b/drivers/edac/sb_edac.c @@ -1609,11 +1609,9 @@ static int sbridge_mce_check_error(struct notifier_block *nb, unsigned long val, mce->cpuvendor, mce->cpuid, mce->time, mce->socketid, mce->apicid); -#ifdef CONFIG_SMP /* Only handle if it is the right mc controller */ if (cpu_data(mce->cpu).phys_proc_id != pvt->sbridge_dev->mc) return NOTIFY_DONE; -#endif smp_rmb(); if ((pvt->mce_out + 1) % MCE_LOG_LEN == pvt->mce_in) { diff --git a/drivers/hwmon/coretemp.c b/drivers/hwmon/coretemp.c index 104b3767516c..1fdef885341c 100644 --- a/drivers/hwmon/coretemp.c +++ b/drivers/hwmon/coretemp.c @@ -57,16 +57,15 @@ MODULE_PARM_DESC(tjmax, "TjMax value in degrees Celsius"); #define TOTAL_ATTRS (MAX_CORE_ATTRS + 1) #define MAX_CORE_DATA (NUM_REAL_CORES + BASE_SYSFS_ATTR_NO) -#ifdef CONFIG_SMP #define TO_PHYS_ID(cpu) cpu_data(cpu).phys_proc_id #define TO_CORE_ID(cpu) cpu_data(cpu).cpu_core_id +#define TO_ATTR_NO(cpu) (TO_CORE_ID(cpu) + BASE_SYSFS_ATTR_NO) + +#ifdef CONFIG_SMP #define for_each_sibling(i, cpu) for_each_cpu(i, cpu_sibling_mask(cpu)) #else -#define TO_PHYS_ID(cpu) (cpu) -#define TO_CORE_ID(cpu) (cpu) #define for_each_sibling(i, cpu) for (i = 0; false; ) #endif -#define TO_ATTR_NO(cpu) (TO_CORE_ID(cpu) + BASE_SYSFS_ATTR_NO) /* * Per-Core Temperature Data -- cgit v1.2.3 From 228bdaa95fb830e08b6acd1afd4d2c55093cabfa Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 9 Dec 2011 03:02:19 -0500 Subject: x86: Keep current stack in NMI breakpoints We want to allow NMI handlers to have breakpoints to be able to remove stop_machine from ftrace, kprobes and jump_labels. But if an NMI interrupts a current breakpoint, and then it triggers a breakpoint itself, it will switch to the breakpoint stack and corrupt the data on it for the breakpoint processing that it interrupted. Instead, have the NMI check if it interrupted breakpoint processing by checking if the stack that is currently used is a breakpoint stack. If it is, then load a special IDT that changes the IST for the debug exception to keep the same stack in kernel context. When the NMI is done, it puts it back. This way, if the NMI does trigger a breakpoint, it will keep using the same stack and not stomp on the breakpoint data for the breakpoint it interrupted. Suggested-by: Peter Zijlstra Signed-off-by: Steven Rostedt --- arch/x86/include/asm/desc.h | 12 ++++++++++++ arch/x86/include/asm/processor.h | 6 ++++++ arch/x86/kernel/cpu/common.c | 22 ++++++++++++++++++++++ arch/x86/kernel/head_64.S | 4 ++++ arch/x86/kernel/nmi.c | 15 +++++++++++++++ arch/x86/kernel/traps.c | 6 ++++++ 6 files changed, 65 insertions(+) (limited to 'arch/x86/kernel/cpu/common.c') diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index 41935fadfdfc..e95822d683f4 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -35,6 +35,8 @@ static inline void fill_ldt(struct desc_struct *desc, const struct user_desc *in extern struct desc_ptr idt_descr; extern gate_desc idt_table[]; +extern struct desc_ptr nmi_idt_descr; +extern gate_desc nmi_idt_table[]; struct gdt_page { struct desc_struct gdt[GDT_ENTRIES]; @@ -307,6 +309,16 @@ static inline void set_desc_limit(struct desc_struct *desc, unsigned long limit) desc->limit = (limit >> 16) & 0xf; } +#ifdef CONFIG_X86_64 +static inline void set_nmi_gate(int gate, void *addr) +{ + gate_desc s; + + pack_gate(&s, GATE_INTERRUPT, (unsigned long)addr, 0, 0, __KERNEL_CS); + write_idt_entry(nmi_idt_table, gate, &s); +} +#endif + static inline void _set_gate(int gate, unsigned type, void *addr, unsigned dpl, unsigned ist, unsigned seg) { diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index b650435ffb53..4b39d6d7e3a1 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -402,6 +402,9 @@ DECLARE_PER_CPU(char *, irq_stack_ptr); DECLARE_PER_CPU(unsigned int, irq_count); extern unsigned long kernel_eflags; extern asmlinkage void ignore_sysret(void); +int is_debug_stack(unsigned long addr); +void debug_stack_set_zero(void); +void debug_stack_reset(void); #else /* X86_64 */ #ifdef CONFIG_CC_STACKPROTECTOR /* @@ -416,6 +419,9 @@ struct stack_canary { }; DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); #endif +static inline int is_debug_stack(unsigned long addr) { return 0; } +static inline void debug_stack_set_zero(void) { } +static inline void debug_stack_reset(void) { } #endif /* X86_64 */ extern unsigned int xstate_size; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index aa003b13a831..caa404556b9c 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1026,6 +1026,8 @@ __setup("clearcpuid=", setup_disablecpuid); #ifdef CONFIG_X86_64 struct desc_ptr idt_descr = { NR_VECTORS * 16 - 1, (unsigned long) idt_table }; +struct desc_ptr nmi_idt_descr = { NR_VECTORS * 16 - 1, + (unsigned long) nmi_idt_table }; DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE); @@ -1090,6 +1092,24 @@ unsigned long kernel_eflags; */ DEFINE_PER_CPU(struct orig_ist, orig_ist); +static DEFINE_PER_CPU(unsigned long, debug_stack_addr); + +int is_debug_stack(unsigned long addr) +{ + return addr <= __get_cpu_var(debug_stack_addr) && + addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ); +} + +void debug_stack_set_zero(void) +{ + load_idt((const struct desc_ptr *)&nmi_idt_descr); +} + +void debug_stack_reset(void) +{ + load_idt((const struct desc_ptr *)&idt_descr); +} + #else /* CONFIG_X86_64 */ DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; @@ -1208,6 +1228,8 @@ void __cpuinit cpu_init(void) estacks += exception_stack_sizes[v]; oist->ist[v] = t->x86_tss.ist[v] = (unsigned long)estacks; + if (v == DEBUG_STACK-1) + per_cpu(debug_stack_addr, cpu) = (unsigned long)estacks; } } diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index e11e39478a49..40f4eb3766d1 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -417,6 +417,10 @@ ENTRY(phys_base) ENTRY(idt_table) .skip IDT_ENTRIES * 16 + .align L1_CACHE_BYTES +ENTRY(nmi_idt_table) + .skip IDT_ENTRIES * 16 + __PAGE_ALIGNED_BSS .align PAGE_SIZE ENTRY(empty_zero_page) diff --git a/arch/x86/kernel/nmi.c b/arch/x86/kernel/nmi.c index e88f37b58ddd..de8d4b333f40 100644 --- a/arch/x86/kernel/nmi.c +++ b/arch/x86/kernel/nmi.c @@ -408,6 +408,18 @@ static notrace __kprobes void default_do_nmi(struct pt_regs *regs) dotraplinkage notrace __kprobes void do_nmi(struct pt_regs *regs, long error_code) { + int update_debug_stack = 0; + + /* + * If we interrupted a breakpoint, it is possible that + * the nmi handler will have breakpoints too. We need to + * change the IDT such that breakpoints that happen here + * continue to use the NMI stack. + */ + if (unlikely(is_debug_stack(regs->sp))) { + debug_stack_set_zero(); + update_debug_stack = 1; + } nmi_enter(); inc_irq_stat(__nmi_count); @@ -416,6 +428,9 @@ do_nmi(struct pt_regs *regs, long error_code) default_do_nmi(regs); nmi_exit(); + + if (unlikely(update_debug_stack)) + debug_stack_reset(); } void stop_nmi(void) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a8e3eb83466c..a93c5cabc36a 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -723,4 +723,10 @@ void __init trap_init(void) cpu_init(); x86_init.irqs.trap_init(); + +#ifdef CONFIG_X86_64 + memcpy(&nmi_idt_table, &idt_table, IDT_ENTRIES * 16); + set_nmi_gate(1, &debug); + set_nmi_gate(3, &int3); +#endif } -- cgit v1.2.3 From 42181186ad4db986fcaa40ca95c6e407e9e79372 Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 16 Dec 2011 11:43:02 -0500 Subject: x86: Add counter when debug stack is used with interrupts enabled Mathieu Desnoyers pointed out a case that can cause issues with NMIs running on the debug stack: int3 -> interrupt -> NMI -> int3 Because the interrupt changes the stack, the NMI will not see that it preempted the debug stack. Looking deeper at this case, interrupts only happen when the int3 is from userspace or in an a location in the exception table (fixup). userspace -> int3 -> interurpt -> NMI -> int3 All other int3s that happen in the kernel should be processed without ever enabling interrupts, as the do_trap() call will panic the kernel if it is called to process any other location within the kernel. Adding a counter around the sections that enable interrupts while using the debug stack allows the NMI to also check that case. If the NMI sees that it either interrupted a task using the debug stack or the debug counter is non-zero, then it will have to change the IDT table to make the int3 not change stacks (which will corrupt the stack if it does). Note, I had to move the debug_usage functions out of processor.h and into debugreg.h because of the static inlined functions to inc and dec the debug_usage counter. __get_cpu_var() requires smp.h which includes processor.h, and would fail to build. Link: http://lkml.kernel.org/r/1323976535.23971.112.camel@gandalf.stny.rr.com Reported-by: Mathieu Desnoyers Cc: Linus Torvalds Cc: Peter Zijlstra Cc: H. Peter Anvin Cc: Thomas Gleixner Cc: Paul Turner Cc: Frederic Weisbecker Signed-off-by: Steven Rostedt --- arch/x86/include/asm/debugreg.h | 22 ++++++++++++++++++++++ arch/x86/include/asm/processor.h | 6 ------ arch/x86/kernel/cpu/common.c | 6 ++++-- arch/x86/kernel/traps.c | 14 ++++++++++++++ 4 files changed, 40 insertions(+), 8 deletions(-) (limited to 'arch/x86/kernel/cpu/common.c') diff --git a/arch/x86/include/asm/debugreg.h b/arch/x86/include/asm/debugreg.h index 078ad0caefc6..b903d5ea3941 100644 --- a/arch/x86/include/asm/debugreg.h +++ b/arch/x86/include/asm/debugreg.h @@ -101,6 +101,28 @@ extern void aout_dump_debugregs(struct user *dump); extern void hw_breakpoint_restore(void); +#ifdef CONFIG_X86_64 +DECLARE_PER_CPU(int, debug_stack_usage); +static inline void debug_stack_usage_inc(void) +{ + __get_cpu_var(debug_stack_usage)++; +} +static inline void debug_stack_usage_dec(void) +{ + __get_cpu_var(debug_stack_usage)--; +} +int is_debug_stack(unsigned long addr); +void debug_stack_set_zero(void); +void debug_stack_reset(void); +#else /* !X86_64 */ +static inline int is_debug_stack(unsigned long addr) { return 0; } +static inline void debug_stack_set_zero(void) { } +static inline void debug_stack_reset(void) { } +static inline void debug_stack_usage_inc(void) { } +static inline void debug_stack_usage_dec(void) { } +#endif /* X86_64 */ + + #endif /* __KERNEL__ */ #endif /* _ASM_X86_DEBUGREG_H */ diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index 4b39d6d7e3a1..b650435ffb53 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -402,9 +402,6 @@ DECLARE_PER_CPU(char *, irq_stack_ptr); DECLARE_PER_CPU(unsigned int, irq_count); extern unsigned long kernel_eflags; extern asmlinkage void ignore_sysret(void); -int is_debug_stack(unsigned long addr); -void debug_stack_set_zero(void); -void debug_stack_reset(void); #else /* X86_64 */ #ifdef CONFIG_CC_STACKPROTECTOR /* @@ -419,9 +416,6 @@ struct stack_canary { }; DECLARE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); #endif -static inline int is_debug_stack(unsigned long addr) { return 0; } -static inline void debug_stack_set_zero(void) { } -static inline void debug_stack_reset(void) { } #endif /* X86_64 */ extern unsigned int xstate_size; diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index caa404556b9c..266e4649b1da 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1093,11 +1093,13 @@ unsigned long kernel_eflags; DEFINE_PER_CPU(struct orig_ist, orig_ist); static DEFINE_PER_CPU(unsigned long, debug_stack_addr); +DEFINE_PER_CPU(int, debug_stack_usage); int is_debug_stack(unsigned long addr) { - return addr <= __get_cpu_var(debug_stack_addr) && - addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ); + return __get_cpu_var(debug_stack_usage) || + (addr <= __get_cpu_var(debug_stack_addr) && + addr > (__get_cpu_var(debug_stack_addr) - DEBUG_STKSZ)); } void debug_stack_set_zero(void) diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index a93c5cabc36a..0072b38e3ea1 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -316,9 +316,15 @@ dotraplinkage void __kprobes do_int3(struct pt_regs *regs, long error_code) return; #endif + /* + * Let others (NMI) know that the debug stack is in use + * as we may switch to the interrupt stack. + */ + debug_stack_usage_inc(); preempt_conditional_sti(regs); do_trap(3, SIGTRAP, "int3", regs, error_code, NULL); preempt_conditional_cli(regs); + debug_stack_usage_dec(); } #ifdef CONFIG_X86_64 @@ -411,6 +417,12 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) SIGTRAP) == NOTIFY_STOP) return; + /* + * Let others (NMI) know that the debug stack is in use + * as we may switch to the interrupt stack. + */ + debug_stack_usage_inc(); + /* It's safe to allow irq's after DR6 has been saved */ preempt_conditional_sti(regs); @@ -418,6 +430,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) handle_vm86_trap((struct kernel_vm86_regs *) regs, error_code, 1); preempt_conditional_cli(regs); + debug_stack_usage_dec(); return; } @@ -437,6 +450,7 @@ dotraplinkage void __kprobes do_debug(struct pt_regs *regs, long error_code) if (tsk->thread.debugreg6 & (DR_STEP | DR_TRAP_BITS) || user_icebp) send_sigtrap(tsk, regs, error_code, si_code); preempt_conditional_cli(regs); + debug_stack_usage_dec(); return; } -- cgit v1.2.3 From 7e16838d94b566a17b65231073d179bc04d590c8 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Sun, 19 Feb 2012 13:27:00 -0800 Subject: i387: support lazy restore of FPU state This makes us recognize when we try to restore FPU state that matches what we already have in the FPU on this CPU, and avoids the restore entirely if so. To do this, we add two new data fields: - a percpu 'fpu_owner_task' variable that gets written any time we update the "has_fpu" field, and thus acts as a kind of back-pointer to the task that owns the CPU. The exception is when we save the FPU state as part of a context switch - if the save can keep the FPU state around, we leave the 'fpu_owner_task' variable pointing at the task whose FP state still remains on the CPU. - a per-thread 'last_cpu' field, that indicates which CPU that thread used its FPU on last. We update this on every context switch (writing an invalid CPU number if the last context switch didn't leave the FPU in a lazily usable state), so we know that *that* thread has done nothing else with the FPU since. These two fields together can be used when next switching back to the task to see if the CPU still matches: if 'fpu_owner_task' matches the task we are switching to, we know that no other task (or kernel FPU usage) touched the FPU on this CPU in the meantime, and if the current CPU number matches the 'last_cpu' field, we know that this thread did no other FP work on any other CPU, so the FPU state on the CPU must match what was saved on last context switch. In that case, we can avoid the 'f[x]rstor' entirely, and just clear the CR0.TS bit. Signed-off-by: Linus Torvalds --- arch/x86/include/asm/i387.h | 35 +++++++++++++++++++++++------------ arch/x86/include/asm/processor.h | 3 ++- arch/x86/kernel/cpu/common.c | 2 ++ arch/x86/kernel/process_32.c | 2 +- arch/x86/kernel/process_64.c | 2 +- 5 files changed, 29 insertions(+), 15 deletions(-) (limited to 'arch/x86/kernel/cpu/common.c') diff --git a/arch/x86/include/asm/i387.h b/arch/x86/include/asm/i387.h index 74c607b37e87..247904945d3f 100644 --- a/arch/x86/include/asm/i387.h +++ b/arch/x86/include/asm/i387.h @@ -32,6 +32,8 @@ extern int init_fpu(struct task_struct *child); extern void math_state_restore(void); extern int dump_fpu(struct pt_regs *, struct user_i387_struct *); +DECLARE_PER_CPU(struct task_struct *, fpu_owner_task); + extern user_regset_active_fn fpregs_active, xfpregs_active; extern user_regset_get_fn fpregs_get, xfpregs_get, fpregs_soft_get, xstateregs_get; @@ -276,7 +278,7 @@ static inline int restore_fpu_checking(struct task_struct *tsk) "emms\n\t" /* clear stack tags */ "fildl %P[addr]", /* set F?P to defined value */ X86_FEATURE_FXSAVE_LEAK, - [addr] "m" (tsk->thread.has_fpu)); + [addr] "m" (tsk->thread.fpu.has_fpu)); return fpu_restore_checking(&tsk->thread.fpu); } @@ -288,19 +290,21 @@ static inline int restore_fpu_checking(struct task_struct *tsk) */ static inline int __thread_has_fpu(struct task_struct *tsk) { - return tsk->thread.has_fpu; + return tsk->thread.fpu.has_fpu; } /* Must be paired with an 'stts' after! */ static inline void __thread_clear_has_fpu(struct task_struct *tsk) { - tsk->thread.has_fpu = 0; + tsk->thread.fpu.has_fpu = 0; + percpu_write(fpu_owner_task, NULL); } /* Must be paired with a 'clts' before! */ static inline void __thread_set_has_fpu(struct task_struct *tsk) { - tsk->thread.has_fpu = 1; + tsk->thread.fpu.has_fpu = 1; + percpu_write(fpu_owner_task, tsk); } /* @@ -345,18 +349,22 @@ typedef struct { int preload; } fpu_switch_t; * We don't do that yet, so "fpu_lazy_restore()" always returns * false, but some day.. */ -#define fpu_lazy_restore(tsk) (0) -#define fpu_lazy_state_intact(tsk) do { } while (0) +static inline int fpu_lazy_restore(struct task_struct *new, unsigned int cpu) +{ + return new == percpu_read_stable(fpu_owner_task) && + cpu == new->thread.fpu.last_cpu; +} -static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new) +static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct task_struct *new, int cpu) { fpu_switch_t fpu; fpu.preload = tsk_used_math(new) && new->fpu_counter > 5; if (__thread_has_fpu(old)) { - if (__save_init_fpu(old)) - fpu_lazy_state_intact(old); - __thread_clear_has_fpu(old); + if (!__save_init_fpu(old)) + cpu = ~0; + old->thread.fpu.last_cpu = cpu; + old->thread.fpu.has_fpu = 0; /* But leave fpu_owner_task! */ /* Don't change CR0.TS if we just switch! */ if (fpu.preload) { @@ -367,9 +375,10 @@ static inline fpu_switch_t switch_fpu_prepare(struct task_struct *old, struct ta stts(); } else { old->fpu_counter = 0; + old->thread.fpu.last_cpu = ~0; if (fpu.preload) { new->fpu_counter++; - if (fpu_lazy_restore(new)) + if (fpu_lazy_restore(new, cpu)) fpu.preload = 0; else prefetch(new->thread.fpu.state); @@ -463,8 +472,10 @@ static inline void kernel_fpu_begin(void) __save_init_fpu(me); __thread_clear_has_fpu(me); /* We do 'stts()' in kernel_fpu_end() */ - } else + } else { + percpu_write(fpu_owner_task, NULL); clts(); + } } static inline void kernel_fpu_end(void) diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index f7c89e231c6c..58545c97d071 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -374,6 +374,8 @@ union thread_xstate { }; struct fpu { + unsigned int last_cpu; + unsigned int has_fpu; union thread_xstate *state; }; @@ -454,7 +456,6 @@ struct thread_struct { unsigned long trap_no; unsigned long error_code; /* floating point and extended processor state */ - unsigned long has_fpu; struct fpu fpu; #ifdef CONFIG_X86_32 /* Virtual 86 mode info */ diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index d43cad74f166..b667148dfad7 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1044,6 +1044,8 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) = DEFINE_PER_CPU(unsigned int, irq_count) = -1; +DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); + /* * Special IST stacks which the CPU switches to when it calls * an IST-marked descriptor entry. Up to 7 stacks (hardware diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index bc32761bc27a..c08d1ff12b7c 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -304,7 +304,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ - fpu = switch_fpu_prepare(prev_p, next_p); + fpu = switch_fpu_prepare(prev_p, next_p, cpu); /* * Reload esp0. diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 8ad880b3bc1c..cfa5c90c01db 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -389,7 +389,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) unsigned fsindex, gsindex; fpu_switch_t fpu; - fpu = switch_fpu_prepare(prev_p, next_p); + fpu = switch_fpu_prepare(prev_p, next_p, cpu); /* * Reload esp0, LDT and the page table pointer: -- cgit v1.2.3 From 27e74da9800289e69ba907777df1e2085231eff7 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Mon, 20 Feb 2012 19:34:10 -0800 Subject: i387: export 'fpu_owner_task' per-cpu variable (And define it properly for x86-32, which had its 'current_task' declaration in separate from x86-64) Bitten by my dislike for modules on the machines I use, and the fact that apparently nobody else actually wanted to test the patches I sent out. Snif. Nobody else cares. Anyway, we probably should uninline the 'kernel_fpu_begin()' function that is what modules actually use and that references this, but this is the minimal fix for now. Reported-by: Josh Boyer Reported-and-tested-by: Jongman Heo Signed-off-by: Linus Torvalds --- arch/x86/kernel/cpu/common.c | 3 +++ 1 file changed, 3 insertions(+) (limited to 'arch/x86/kernel/cpu/common.c') diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index b667148dfad7..c0f7d68d318f 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -1045,6 +1045,7 @@ DEFINE_PER_CPU(char *, irq_stack_ptr) = DEFINE_PER_CPU(unsigned int, irq_count) = -1; DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); +EXPORT_PER_CPU_SYMBOL(fpu_owner_task); /* * Special IST stacks which the CPU switches to when it calls @@ -1113,6 +1114,8 @@ void debug_stack_reset(void) DEFINE_PER_CPU(struct task_struct *, current_task) = &init_task; EXPORT_PER_CPU_SYMBOL(current_task); +DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); +EXPORT_PER_CPU_SYMBOL(fpu_owner_task); #ifdef CONFIG_CC_STACKPROTECTOR DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); -- cgit v1.2.3