diff options
85 files changed, 1537 insertions, 1373 deletions
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S index 1d7fbbcc196d..8ef964ddc18e 100644 --- a/arch/x86/boot/compressed/head_32.S +++ b/arch/x86/boot/compressed/head_32.S @@ -29,6 +29,7 @@ #include <asm/page_types.h> #include <asm/boot.h> #include <asm/asm-offsets.h> +#include <asm/bootparam.h> __HEAD ENTRY(startup_32) @@ -102,7 +103,7 @@ preferred_addr: * Test KEEP_SEGMENTS flag to see if the bootloader is asking * us to not reload segments */ - testb $(1<<6), BP_loadflags(%esi) + testb $KEEP_SEGMENTS, BP_loadflags(%esi) jnz 1f cli diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S index 6b1766c6c082..90c152135e92 100644 --- a/arch/x86/boot/compressed/head_64.S +++ b/arch/x86/boot/compressed/head_64.S @@ -31,6 +31,7 @@ #include <asm/msr.h> #include <asm/processor-flags.h> #include <asm/asm-offsets.h> +#include <asm/bootparam.h> __HEAD .code32 @@ -46,7 +47,7 @@ ENTRY(startup_32) * Test KEEP_SEGMENTS flag to see if the bootloader is asking * us to not reload segments */ - testb $(1<<6), BP_loadflags(%esi) + testb $KEEP_SEGMENTS, BP_loadflags(%esi) jnz 1f cli diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile index e785b422b766..bb635c641869 100644 --- a/arch/x86/ia32/Makefile +++ b/arch/x86/ia32/Makefile @@ -3,7 +3,6 @@ # obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o -obj-$(CONFIG_IA32_EMULATION) += nosyscall.o syscall_ia32.o obj-$(CONFIG_IA32_AOUT) += ia32_aout.o diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c index d0165c9a2932..1f5e2b0e09ff 100644 --- a/arch/x86/ia32/ia32_signal.c +++ b/arch/x86/ia32/ia32_signal.c @@ -203,6 +203,8 @@ static int ia32_restore_sigcontext(struct pt_regs *regs, err |= restore_xstate_sig(buf, 1); + force_iret(); + return err; } diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S index 156ebcab4ada..ad9efef65a6b 100644 --- a/arch/x86/ia32/ia32entry.S +++ b/arch/x86/ia32/ia32entry.S @@ -30,24 +30,13 @@ .section .entry.text, "ax" - .macro IA32_ARG_FIXUP noebp=0 - movl %edi,%r8d - .if \noebp - .else - movl %ebp,%r9d - .endif - xchg %ecx,%esi - movl %ebx,%edi - movl %edx,%edx /* zero extension */ - .endm - - /* clobbers %eax */ - .macro CLEAR_RREGS offset=0, _r9=rax + /* clobbers %rax */ + .macro CLEAR_RREGS _r9=rax xorl %eax,%eax - movq %rax,\offset+R11(%rsp) - movq %rax,\offset+R10(%rsp) - movq %\_r9,\offset+R9(%rsp) - movq %rax,\offset+R8(%rsp) + movq %rax,R11(%rsp) + movq %rax,R10(%rsp) + movq %\_r9,R9(%rsp) + movq %rax,R8(%rsp) .endm /* @@ -60,14 +49,14 @@ * If it's -1 to make us punt the syscall, then (u32)-1 is still * an appropriately invalid value. */ - .macro LOAD_ARGS32 offset, _r9=0 + .macro LOAD_ARGS32 _r9=0 .if \_r9 - movl \offset+16(%rsp),%r9d + movl R9(%rsp),%r9d .endif - movl \offset+40(%rsp),%ecx - movl \offset+48(%rsp),%edx - movl \offset+56(%rsp),%esi - movl \offset+64(%rsp),%edi + movl RCX(%rsp),%ecx + movl RDX(%rsp),%edx + movl RSI(%rsp),%esi + movl RDI(%rsp),%edi movl %eax,%eax /* zero extension */ .endm @@ -99,35 +88,38 @@ ENDPROC(native_irq_enable_sysexit) /* * 32bit SYSENTER instruction entry. * + * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs. + * IF and VM in rflags are cleared (IOW: interrupts are off). + * SYSENTER does not save anything on the stack, + * and does not save old rip (!!!) and rflags. + * * Arguments: - * %eax System call number. - * %ebx Arg1 - * %ecx Arg2 - * %edx Arg3 - * %esi Arg4 - * %edi Arg5 - * %ebp user stack - * 0(%ebp) Arg6 - * - * Interrupts off. - * + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp user stack + * 0(%ebp) arg6 + * * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. Set up a complete hardware stack frame to share code + * path below. We set up a complete hardware stack frame to share code * with the int 0x80 path. - */ + */ ENTRY(ia32_sysenter_target) CFI_STARTPROC32 simple CFI_SIGNAL_FRAME CFI_DEF_CFA rsp,0 CFI_REGISTER rsp,rbp SWAPGS_UNSAFE_STACK - movq PER_CPU_VAR(kernel_stack), %rsp - addq $(KERNEL_STACK_OFFSET),%rsp + movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp /* * No need to follow this irqs on/off section: the syscall * disabled irqs, here we enable it straight after entry: */ ENABLE_INTERRUPTS(CLBR_NONE) + /* Construct iret frame (ss,rsp,rflags,cs,rip) */ movl %ebp,%ebp /* zero extension */ pushq_cfi $__USER32_DS /*CFI_REL_OFFSET ss,0*/ @@ -140,13 +132,19 @@ ENTRY(ia32_sysenter_target) pushq_cfi $__USER32_CS /*CFI_REL_OFFSET cs,0*/ movl %eax, %eax + /* Store thread_info->sysenter_return in rip stack slot */ pushq_cfi %r10 CFI_REL_OFFSET rip,0 + /* Store orig_ax */ pushq_cfi %rax + /* Construct the rest of "struct pt_regs" */ cld - SAVE_ARGS 0,1,0 - /* no need to do an access_ok check here because rbp has been - 32bit zero extended */ + ALLOC_PT_GPREGS_ON_STACK + SAVE_C_REGS_EXCEPT_R891011 + /* + * no need to do an access_ok check here because rbp has been + * 32bit zero extended + */ ASM_STAC 1: movl (%rbp),%ebp _ASM_EXTABLE(1b,ia32_badarg) @@ -157,32 +155,39 @@ ENTRY(ia32_sysenter_target) * ourselves. To save a few cycles, we can check whether * NT was set instead of doing an unconditional popfq. */ - testl $X86_EFLAGS_NT,EFLAGS-ARGOFFSET(%rsp) + testl $X86_EFLAGS_NT,EFLAGS(%rsp) jnz sysenter_fix_flags sysenter_flags_fixed: - orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP) CFI_REMEMBER_STATE jnz sysenter_tracesys cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys sysenter_do_call: - IA32_ARG_FIXUP + /* 32bit syscall -> 64bit C ABI argument conversion */ + movl %edi,%r8d /* arg5 */ + movl %ebp,%r9d /* arg6 */ + xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ + movl %ebx,%edi /* arg1 */ + movl %edx,%edx /* arg3 (zero extension) */ sysenter_dispatch: call *ia32_sys_call_table(,%rax,8) - movq %rax,RAX-ARGOFFSET(%rsp) + movq %rax,RAX(%rsp) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP) jnz sysexit_audit sysexit_from_sys_call: - andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) + andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP) /* clear IF, that popfq doesn't enable interrupts early */ - andl $~0x200,EFLAGS-ARGOFFSET(%rsp) - movl RIP-ARGOFFSET(%rsp),%edx /* User %eip */ + andl $~0x200,EFLAGS(%rsp) + movl RIP(%rsp),%edx /* User %eip */ CFI_REGISTER rip,rdx - RESTORE_ARGS 0,24,0,0,0,0 + RESTORE_RSI_RDI + /* pop everything except ss,rsp,rflags slots */ + REMOVE_PT_GPREGS_FROM_STACK 3*8 xorq %r8,%r8 xorq %r9,%r9 xorq %r10,%r10 @@ -192,6 +197,10 @@ sysexit_from_sys_call: popq_cfi %rcx /* User %esp */ CFI_REGISTER rsp,rcx TRACE_IRQS_ON + /* + * 32bit SYSEXIT restores eip from edx, esp from ecx. + * cs and ss are loaded from MSRs. + */ ENABLE_INTERRUPTS_SYSEXIT32 CFI_RESTORE_STATE @@ -205,18 +214,18 @@ sysexit_from_sys_call: movl %ebx,%esi /* 2nd arg: 1st syscall arg */ movl %eax,%edi /* 1st arg: syscall number */ call __audit_syscall_entry - movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */ + movl RAX(%rsp),%eax /* reload syscall number */ cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys movl %ebx,%edi /* reload 1st syscall arg */ - movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */ - movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */ - movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */ - movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */ + movl RCX(%rsp),%esi /* reload 2nd syscall arg */ + movl RDX(%rsp),%edx /* reload 3rd syscall arg */ + movl RSI(%rsp),%ecx /* reload 4th syscall arg */ + movl RDI(%rsp),%r8d /* reload 5th syscall arg */ .endm .macro auditsys_exit exit - testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP) jnz ia32_ret_from_sys_call TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) @@ -227,13 +236,13 @@ sysexit_from_sys_call: 1: setbe %al /* 1 if error, 0 if not */ movzbl %al,%edi /* zero-extend that into %edi */ call __audit_syscall_exit - movq RAX-ARGOFFSET(%rsp),%rax /* reload syscall return value */ + movq RAX(%rsp),%rax /* reload syscall return value */ movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl %edi,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl %edi,TI_flags+THREAD_INFO(%rsp,RIP) jz \exit - CLEAR_RREGS -ARGOFFSET + CLEAR_RREGS jmp int_with_check .endm @@ -253,16 +262,16 @@ sysenter_fix_flags: sysenter_tracesys: #ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP) jz sysenter_auditsys #endif - SAVE_REST + SAVE_EXTRA_REGS CLEAR_RREGS movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter - LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ - RESTORE_REST + LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ + RESTORE_EXTRA_REGS cmpq $(IA32_NR_syscalls-1),%rax ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */ jmp sysenter_do_call @@ -272,23 +281,33 @@ ENDPROC(ia32_sysenter_target) /* * 32bit SYSCALL instruction entry. * + * 32bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, + * then loads new ss, cs, and rip from previously programmed MSRs. + * rflags gets masked by a value from another MSR (so CLD and CLAC + * are not needed). SYSCALL does not save anything on the stack + * and does not change rsp. + * + * Note: rflags saving+masking-with-MSR happens only in Long mode + * (in legacy 32bit mode, IF, RF and VM bits are cleared and that's it). + * Don't get confused: rflags saving+masking depends on Long Mode Active bit + * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes + * or target CS descriptor's L bit (SYSCALL does not read segment descriptors). + * * Arguments: - * %eax System call number. - * %ebx Arg1 - * %ecx return EIP - * %edx Arg3 - * %esi Arg4 - * %edi Arg5 - * %ebp Arg2 [note: not saved in the stack frame, should not be touched] - * %esp user stack - * 0(%esp) Arg6 - * - * Interrupts off. - * + * eax system call number + * ecx return address + * ebx arg1 + * ebp arg2 (note: not saved in the stack frame, should not be touched) + * edx arg3 + * esi arg4 + * edi arg5 + * esp user stack + * 0(%esp) arg6 + * * This is purely a fast path. For anything complicated we use the int 0x80 - * path below. Set up a complete hardware stack frame to share code - * with the int 0x80 path. - */ + * path below. We set up a complete hardware stack frame to share code + * with the int 0x80 path. + */ ENTRY(ia32_cstar_target) CFI_STARTPROC32 simple CFI_SIGNAL_FRAME @@ -304,62 +323,77 @@ ENTRY(ia32_cstar_target) * disabled irqs and here we enable it straight after entry: */ ENABLE_INTERRUPTS(CLBR_NONE) - SAVE_ARGS 8,0,0 + ALLOC_PT_GPREGS_ON_STACK 8 /* +8: space for orig_ax */ + SAVE_C_REGS_EXCEPT_RCX_R891011 movl %eax,%eax /* zero extension */ - movq %rax,ORIG_RAX-ARGOFFSET(%rsp) - movq %rcx,RIP-ARGOFFSET(%rsp) - CFI_REL_OFFSET rip,RIP-ARGOFFSET - movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */ + movq %rax,ORIG_RAX(%rsp) + movq %rcx,RIP(%rsp) + CFI_REL_OFFSET rip,RIP + movq %rbp,RCX(%rsp) /* this lies slightly to ptrace */ movl %ebp,%ecx - movq $__USER32_CS,CS-ARGOFFSET(%rsp) - movq $__USER32_DS,SS-ARGOFFSET(%rsp) - movq %r11,EFLAGS-ARGOFFSET(%rsp) - /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/ - movq %r8,RSP-ARGOFFSET(%rsp) - CFI_REL_OFFSET rsp,RSP-ARGOFFSET - /* no need to do an access_ok check here because r8 has been - 32bit zero extended */ - /* hardware stack frame is complete now */ + movq $__USER32_CS,CS(%rsp) + movq $__USER32_DS,SS(%rsp) + movq %r11,EFLAGS(%rsp) + /*CFI_REL_OFFSET rflags,EFLAGS*/ + movq %r8,RSP(%rsp) + CFI_REL_OFFSET rsp,RSP + /* iret stack frame is complete now */ + /* + * no need to do an access_ok check here because r8 has been + * 32bit zero extended + */ ASM_STAC 1: movl (%r8),%r9d _ASM_EXTABLE(1b,ia32_badarg) ASM_CLAC - orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP) CFI_REMEMBER_STATE jnz cstar_tracesys cmpq $IA32_NR_syscalls-1,%rax ja ia32_badsys cstar_do_call: - IA32_ARG_FIXUP 1 + /* 32bit syscall -> 64bit C ABI argument conversion */ + movl %edi,%r8d /* arg5 */ + /* r9 already loaded */ /* arg6 */ + xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ + movl %ebx,%edi /* arg1 */ + movl %edx,%edx /* arg3 (zero extension) */ cstar_dispatch: call *ia32_sys_call_table(,%rax,8) - movq %rax,RAX-ARGOFFSET(%rsp) + movq %rax,RAX(%rsp) DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF - testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP) jnz sysretl_audit sysretl_from_sys_call: - andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) - RESTORE_ARGS 0,-ARG_SKIP,0,0,0 - movl RIP-ARGOFFSET(%rsp),%ecx + andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP) + RESTORE_RSI_RDI_RDX + movl RIP(%rsp),%ecx CFI_REGISTER rip,rcx - movl EFLAGS-ARGOFFSET(%rsp),%r11d + movl EFLAGS(%rsp),%r11d /*CFI_REGISTER rflags,r11*/ xorq %r10,%r10 xorq %r9,%r9 xorq %r8,%r8 TRACE_IRQS_ON - movl RSP-ARGOFFSET(%rsp),%esp + movl RSP(%rsp),%esp CFI_RESTORE rsp + /* + * 64bit->32bit SYSRET restores eip from ecx, + * eflags from r11 (but RF and VM bits are forced to 0), + * cs and ss are loaded from MSRs. + * (Note: 32bit->32bit SYSRET is different: since r11 + * does not exist, it merely sets eflags.IF=1). + */ USERGS_SYSRET32 - + #ifdef CONFIG_AUDITSYSCALL cstar_auditsys: CFI_RESTORE_STATE - movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */ + movl %r9d,R9(%rsp) /* register to be clobbered by call */ auditsys_entry_common - movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */ + movl R9(%rsp),%r9d /* reload 6th syscall arg */ jmp cstar_dispatch sysretl_audit: @@ -368,17 +402,17 @@ sysretl_audit: cstar_tracesys: #ifdef CONFIG_AUDITSYSCALL - testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP) jz cstar_auditsys #endif xchgl %r9d,%ebp - SAVE_REST - CLEAR_RREGS 0, r9 + SAVE_EXTRA_REGS + CLEAR_RREGS r9 movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter - LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */ - RESTORE_REST + LOAD_ARGS32 1 /* reload args from stack in case ptrace changed it */ + RESTORE_EXTRA_REGS xchgl %ebp,%r9d cmpq $(IA32_NR_syscalls-1),%rax ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */ @@ -391,26 +425,26 @@ ia32_badarg: jmp ia32_sysret CFI_ENDPROC -/* - * Emulated IA32 system calls via int 0x80. +/* + * Emulated IA32 system calls via int 0x80. * - * Arguments: - * %eax System call number. - * %ebx Arg1 - * %ecx Arg2 - * %edx Arg3 - * %esi Arg4 - * %edi Arg5 - * %ebp Arg6 [note: not saved in the stack frame, should not be touched] + * Arguments: + * eax system call number + * ebx arg1 + * ecx arg2 + * edx arg3 + * esi arg4 + * edi arg5 + * ebp arg6 (note: not saved in the stack frame, should not be touched) * * Notes: - * Uses the same stack frame as the x86-64 version. - * All registers except %eax must be saved (but ptrace may violate that) + * Uses the same stack frame as the x86-64 version. + * All registers except eax must be saved (but ptrace may violate that). * Arguments are zero extended. For system calls that want sign extension and * take long arguments a wrapper is needed. Most calls can just be called * directly. - * Assumes it is only called from user space and entered with interrupts off. - */ + * Assumes it is only called from user space and entered with interrupts off. + */ ENTRY(ia32_syscall) CFI_STARTPROC32 simple @@ -429,40 +463,46 @@ ENTRY(ia32_syscall) */ ENABLE_INTERRUPTS(CLBR_NONE) movl %eax,%eax - pushq_cfi %rax + pushq_cfi %rax /* store orig_ax */ cld /* note the registers are not zero extended to the sf. this could be a problem. */ - SAVE_ARGS 0,1,0 - orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET) - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + ALLOC_PT_GPREGS_ON_STACK + SAVE_C_REGS_EXCEPT_R891011 + orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP) + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP) jnz ia32_tracesys cmpq $(IA32_NR_syscalls-1),%rax ja ia32_badsys ia32_do_call: - IA32_ARG_FIXUP + /* 32bit syscall -> 64bit C ABI argument conversion */ + movl %edi,%r8d /* arg5 */ + movl %ebp,%r9d /* arg6 */ + xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */ + movl %ebx,%edi /* arg1 */ + movl %edx,%edx /* arg3 (zero extension) */ call *ia32_sys_call_table(,%rax,8) # xxx: rip relative ia32_sysret: - movq %rax,RAX-ARGOFFSET(%rsp) + movq %rax,RAX(%rsp) ia32_ret_from_sys_call: - CLEAR_RREGS -ARGOFFSET - jmp int_ret_from_sys_call + CLEAR_RREGS + jmp int_ret_from_sys_call -ia32_tracesys: - SAVE_REST +ia32_tracesys: + SAVE_EXTRA_REGS CLEAR_RREGS movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */ movq %rsp,%rdi /* &pt_regs -> arg1 */ call syscall_trace_enter - LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */ - RESTORE_REST + LOAD_ARGS32 /* reload args from stack in case ptrace changed it */ + RESTORE_EXTRA_REGS cmpq $(IA32_NR_syscalls-1),%rax ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */ jmp ia32_do_call END(ia32_syscall) ia32_badsys: - movq $0,ORIG_RAX-ARGOFFSET(%rsp) + movq $0,ORIG_RAX(%rsp) movq $-ENOSYS,%rax jmp ia32_sysret @@ -492,24 +532,23 @@ GLOBAL(stub32_clone) ALIGN ia32_ptregs_common: - popq %r11 CFI_ENDPROC CFI_STARTPROC32 simple CFI_SIGNAL_FRAME - CFI_DEF_CFA rsp,SS+8-ARGOFFSET - CFI_REL_OFFSET rax,RAX-ARGOFFSET - CFI_REL_OFFSET rcx,RCX-ARGOFFSET - CFI_REL_OFFSET rdx,RDX-ARGOFFSET - CFI_REL_OFFSET rsi,RSI-ARGOFFSET - CFI_REL_OFFSET rdi,RDI-ARGOFFSET - CFI_REL_OFFSET rip,RIP-ARGOFFSET -/* CFI_REL_OFFSET cs,CS-ARGOFFSET*/ -/* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/ - CFI_REL_OFFSET rsp,RSP-ARGOFFSET -/* CFI_REL_OFFSET ss,SS-ARGOFFSET*/ - SAVE_REST + CFI_DEF_CFA rsp,SIZEOF_PTREGS + CFI_REL_OFFSET rax,RAX + CFI_REL_OFFSET rcx,RCX + CFI_REL_OFFSET rdx,RDX + CFI_REL_OFFSET rsi,RSI + CFI_REL_OFFSET rdi,RDI + CFI_REL_OFFSET rip,RIP +/* CFI_REL_OFFSET cs,CS*/ +/* CFI_REL_OFFSET rflags,EFLAGS*/ + CFI_REL_OFFSET rsp,RSP +/* CFI_REL_OFFSET ss,SS*/ + SAVE_EXTRA_REGS 8 call *%rax - RESTORE_REST - jmp ia32_sysret /* misbalances the return cache */ + RESTORE_EXTRA_REGS 8 + ret CFI_ENDPROC END(ia32_ptregs_common) diff --git a/arch/x86/ia32/nosyscall.c b/arch/x86/ia32/nosyscall.c deleted file mode 100644 index 51ecd5b4e787..000000000000 --- a/arch/x86/ia32/nosyscall.c +++ /dev/null @@ -1,7 +0,0 @@ -#include <linux/kernel.h> -#include <linux/errno.h> - -long compat_ni_syscall(void) -{ - return -ENOSYS; -} diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c index 8e0ceecdc957..719cd702b0a4 100644 --- a/arch/x86/ia32/sys_ia32.c +++ b/arch/x86/ia32/sys_ia32.c @@ -201,20 +201,6 @@ long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high, advice); } -long sys32_vm86_warning(void) -{ - struct task_struct *me = current; - static char lastcomm[sizeof(me->comm)]; - - if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) { - compat_printk(KERN_INFO - "%s: vm86 mode not supported on 64 bit kernel\n", - me->comm); - strncpy(lastcomm, me->comm, sizeof(lastcomm)); - } - return -ENOSYS; -} - asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi, size_t count) { diff --git a/arch/x86/ia32/syscall_ia32.c b/arch/x86/ia32/syscall_ia32.c deleted file mode 100644 index 4754ba0f5d9f..000000000000 --- a/arch/x86/ia32/syscall_ia32.c +++ /dev/null @@ -1,25 +0,0 @@ -/* System call table for ia32 emulation. */ - -#include <linux/linkage.h> -#include <linux/sys.h> -#include <linux/cache.h> -#include <asm/asm-offsets.h> - -#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void compat(void) ; -#include <asm/syscalls_32.h> -#undef __SYSCALL_I386 - -#define __SYSCALL_I386(nr, sym, compat) [nr] = compat, - -typedef void (*sys_call_ptr_t)(void); - -extern void compat_ni_syscall(void); - -const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { - /* - * Smells like a compiler bug -- it doesn't work - * when the & below is removed. - */ - [0 ... __NR_ia32_syscall_max] = &compat_ni_syscall, -#include <asm/syscalls_32.h> -}; diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h index 372231c22a47..524bddce0b76 100644 --- a/arch/x86/include/asm/alternative-asm.h +++ b/arch/x86/include/asm/alternative-asm.h @@ -18,12 +18,53 @@ .endm #endif -.macro altinstruction_entry orig alt feature orig_len alt_len +.macro altinstruction_entry orig alt feature orig_len alt_len pad_len .long \orig - . .long \alt - . .word \feature .byte \orig_len .byte \alt_len + .byte \pad_len +.endm + +.macro ALTERNATIVE oldinstr, newinstr, feature +140: + \oldinstr +141: + .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90 +142: + + .pushsection .altinstructions,"a" + altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b + .popsection + + .pushsection .altinstr_replacement,"ax" +143: + \newinstr +144: + .popsection +.endm + +.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2 +140: + \oldinstr +141: + .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90 + .skip -(((145f-144f)-(144f-143f)-(141b-140b)) > 0) * ((145f-144f)-(144f-143f)-(141b-140b)),0x90 +142: + + .pushsection .altinstructions,"a" + altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b + altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b + .popsection + + .pushsection .altinstr_replacement,"ax" +143: + \newinstr1 +144: + \newinstr2 +145: + .popsection .endm #endif /* __ASSEMBLY__ */ diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h index 473bdbee378a..5aef6a97d80e 100644 --- a/arch/x86/include/asm/alternative.h +++ b/arch/x86/include/asm/alternative.h @@ -48,8 +48,9 @@ struct alt_instr { s32 repl_offset; /* offset to replacement instruction */ u16 cpuid; /* cpuid bit set for replacement */ u8 instrlen; /* length of original instruction */ - u8 replacementlen; /* length of new instruction, <= instrlen */ -}; + u8 replacementlen; /* length of new instruction */ + u8 padlen; /* length of build-time padding */ +} __packed; extern void alternative_instructions(void); extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); @@ -76,50 +77,61 @@ static inline int alternatives_text_reserved(void *start, void *end) } #endif /* CONFIG_SMP */ -#define OLDINSTR(oldinstr) "661:\n\t" oldinstr "\n662:\n" +#define b_replacement(num) "664"#num +#define e_replacement(num) "665"#num -#define b_replacement(number) "663"#number -#define e_replacement(number) "664"#number +#define alt_end_marker "663" +#define alt_slen "662b-661b" +#define alt_pad_len alt_end_marker"b-662b" +#define alt_total_slen alt_end_marker"b-661b" +#define alt_rlen(num) e_replacement(num)"f-"b_replacement(num)"f" -#define alt_slen "662b-661b" -#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f" +#define __OLDINSTR(oldinstr, num) \ + "661:\n\t" oldinstr "\n662:\n" \ + ".skip -(((" alt_rlen(num) ")-(" alt_slen ")) > 0) * " \ + "((" alt_rlen(num) ")-(" alt_slen ")),0x90\n" -#define ALTINSTR_ENTRY(feature, number) \ +#define OLDINSTR(oldinstr, num) \ + __OLDINSTR(oldinstr, num) \ + alt_end_marker ":\n" + +/* + * Pad the second replacement alternative with additional NOPs if it is + * additionally longer than the first replacement alternative. + */ +#define OLDINSTR_2(oldinstr, num1, num2) \ + __OLDINSTR(oldinstr, num1) \ + ".skip -(((" alt_rlen(num2) ")-(" alt_rlen(num1) ")-(662b-661b)) > 0) * " \ + "((" alt_rlen(num2) ")-(" alt_rlen(num1) ")-(662b-661b)),0x90\n" \ + alt_end_marker ":\n" + +#define ALTINSTR_ENTRY(feature, num) \ " .long 661b - .\n" /* label */ \ - " .long " b_replacement(number)"f - .\n" /* new instruction */ \ + " .long " b_replacement(num)"f - .\n" /* new instruction */ \ " .word " __stringify(feature) "\n" /* feature bit */ \ - " .byte " alt_slen "\n" /* source len */ \ - " .byte " alt_rlen(number) "\n" /* replacement len */ + " .byte " alt_total_slen "\n" /* source len */ \ + " .byte " alt_rlen(num) "\n" /* replacement len */ \ + " .byte " alt_pad_len "\n" /* pad len */ -#define DISCARD_ENTRY(number) /* rlen <= slen */ \ - " .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n" - -#define ALTINSTR_REPLACEMENT(newinstr, feature, number) /* replacement */ \ - b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t" +#define ALTINSTR_REPLACEMENT(newinstr, feature, num) /* replacement */ \ + b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n\t" /* alternative assembly primitive: */ #define ALTERNATIVE(oldinstr, newinstr, feature) \ - OLDINSTR(oldinstr) \ + OLDINSTR(oldinstr, 1) \ ".pushsection .altinstructions,\"a\"\n" \ ALTINSTR_ENTRY(feature, 1) \ ".popsection\n" \ - ".pushsection .discard,\"aw\",@progbits\n" \ - DISCARD_ENTRY(1) \ - ".popsection\n" \ ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ ".popsection" #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ - OLDINSTR(oldinstr) \ + OLDINSTR_2(oldinstr, 1, 2) \ ".pushsection .altinstructions,\"a\"\n" \ ALTINSTR_ENTRY(feature1, 1) \ ALTINSTR_ENTRY(feature2, 2) \ ".popsection\n" \ - ".pushsection .discard,\"aw\",@progbits\n" \ - DISCARD_ENTRY(1) \ - DISCARD_ENTRY(2) \ - ".popsection\n" \ ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ @@ -146,6 +158,9 @@ static inline int alternatives_text_reserved(void *start, void *end) #define alternative(oldinstr, newinstr, feature) \ asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory") +#define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \ + asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory") + /* * Alternative inline assembly with input. * diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h index efc3b22d896e..8118e94d50ab 100644 --- a/arch/x86/include/asm/apic.h +++ b/arch/x86/include/asm/apic.h @@ -91,7 +91,7 @@ static inline void native_apic_mem_write(u32 reg, u32 v) { volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg); - alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP, + alternative_io("movl %0, %P1", "xchgl %0, %P1", X86_BUG_11AP, ASM_OUTPUT2("=r" (v), "=m" (*addr)), ASM_OUTPUT2("0" (v), "m" (*addr))); } diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h index 2ab1eb33106e..959e45b81fe2 100644 --- a/arch/x86/include/asm/barrier.h +++ b/arch/x86/include/asm/barrier.h @@ -95,13 +95,11 @@ do { \ * Stop RDTSC speculation. This is needed when you need to use RDTSC * (or get_cycles or vread that possibly accesses the TSC) in a defined * code region. - * - * (Could use an alternative three way for this if there was one.) */ static __always_inline void rdtsc_barrier(void) { - alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); - alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); + alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, + "lfence", X86_FEATURE_LFENCE_RDTSC); } #endif /* _ASM_X86_BARRIER_H */ diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h index 1f1297b46f83..4b5f7bf2b780 100644 --- a/arch/x86/include/asm/calling.h +++ b/arch/x86/include/asm/calling.h @@ -55,143 +55,148 @@ For 32-bit we have the following conventions - kernel is built with * for assembly code: */ -#define R15 0 -#define R14 8 -#define R13 16 -#define R12 24 -#define RBP 32 -#define RBX 40 - -/* arguments: interrupts/non tracing syscalls only save up to here: */ -#define R11 48 -#define R10 56 -#define R9 64 -#define R8 72 -#define RAX 80 -#define RCX 88 -#define RDX 96 -#define RSI 104 -#define RDI 112 -#define ORIG_RAX 120 /* + error_code */ -/* end of arguments */ - -/* cpu exception frame or undefined in case of fast syscall: */ -#define RIP 128 -#define CS 136 -#define EFLAGS 144 -#define RSP 152 -#define SS 160 - -#define ARGOFFSET R11 - - .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0 - subq $9*8+\addskip, %rsp - CFI_ADJUST_CFA_OFFSET 9*8+\addskip - movq_cfi rdi, 8*8 - movq_cfi rsi, 7*8 - movq_cfi rdx, 6*8 - - .if \save_rcx - movq_cfi rcx, 5*8 - .endif +/* The layout forms the "struct pt_regs" on the stack: */ +/* + * C ABI says these regs are callee-preserved. They aren't saved on kernel entry + * unless syscall needs a complete, fully filled "struct pt_regs". + */ +#define R15 0*8 +#define R14 1*8 +#define R13 2*8 +#define R12 3*8 +#define RBP 4*8 +#define RBX 5*8 +/* These regs are callee-clobbered. Always saved on kernel entry. */ +#define R11 6*8 +#define R10 7*8 +#define R9 8*8 +#define R8 9*8 +#define RAX 10*8 +#define RCX 11*8 +#define RDX 12*8 +#define RSI 13*8 +#define RDI 14*8 +/* + * On syscall entry, this is syscall#. On CPU exception, this is error code. + * On hw interrupt, it's IRQ number: + */ +#define ORIG_RAX 15*8 +/* Return frame for iretq */ +#define RIP 16*8 +#define CS 17*8 +#define EFLAGS 18*8 +#define RSP 19*8 +#define SS 20*8 + +#define SIZEOF_PTREGS 21*8 + + .macro ALLOC_PT_GPREGS_ON_STACK addskip=0 + subq $15*8+\addskip, %rsp + CFI_ADJUST_CFA_OFFSET 15*8+\addskip + .endm - .if \rax_enosys - movq $-ENOSYS, 4*8(%rsp) - .else - movq_cfi rax, 4*8 + .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1 + .if \r11 + movq_cfi r11, 6*8+\offset .endif - - .if \save_r891011 - movq_cfi r8, 3*8 - movq_cfi r9, 2*8 - movq_cfi r10, 1*8 - movq_cfi r11, 0*8 + .if \r8910 + movq_cfi r10, 7*8+\offset + movq_cfi r9, 8*8+\offset + movq_cfi r8, 9*8+\offset + .endif + .if \rax + movq_cfi rax, 10*8+\offset + .endif + .if \rcx + movq_cfi rcx, 11*8+\offset .endif + movq_cfi rdx, 12*8+\offset + movq_cfi rsi, 13*8+\offset + movq_cfi rdi, 14*8+\offset + .endm + .macro SAVE_C_REGS offset=0 + SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1 + .endm + .macro SAVE_C_REGS_EXCEPT_RAX_RCX offset=0 + SAVE_C_REGS_HELPER \offset, 0, 0, 1, 1 + .endm + .macro SAVE_C_REGS_EXCEPT_R891011 + SAVE_C_REGS_HELPER 0, 1, 1, 0, 0 + .endm + .macro SAVE_C_REGS_EXCEPT_RCX_R891011 + SAVE_C_REGS_HELPER 0, 1, 0, 0, 0 + .endm + .macro SAVE_C_REGS_EXCEPT_RAX_RCX_R11 + SAVE_C_REGS_HELPER 0, 0, 0, 1, 0 + .endm + .macro SAVE_EXTRA_REGS offset=0 + movq_cfi r15, 0*8+\offset + movq_cfi r14, 1*8+\offset + movq_cfi r13, 2*8+\offset + movq_cfi r12, 3*8+\offset + movq_cfi rbp, 4*8+\offset + movq_cfi rbx, 5*8+\offset + .endm + .macro SAVE_EXTRA_REGS_RBP offset=0 + movq_cfi rbp, 4*8+\offset .endm -#define ARG_SKIP (9*8) + .macro RESTORE_EXTRA_REGS offset=0 + movq_cfi_restore 0*8+\offset, r15 + movq_cfi_restore 1*8+\offset, r14 + movq_cfi_restore 2*8+\offset, r13 + movq_cfi_restore 3*8+\offset, r12 + movq_cfi_restore 4*8+\offset, rbp + movq_cfi_restore 5*8+\offset, rbx + .endm - .macro RESTORE_ARGS rstor_rax=1, addskip=0, rstor_rcx=1, rstor_r11=1, \ - rstor_r8910=1, rstor_rdx=1 + .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 .if \rstor_r11 - movq_cfi_restore 0*8, r11 + movq_cfi_restore 6*8, r11 .endif - .if \rstor_r8910 - movq_cfi_restore 1*8, r10 - movq_cfi_restore 2*8, r9 - movq_cfi_restore 3*8, r8 + movq_cfi_restore 7*8, r10 + movq_cfi_restore 8*8, r9 + movq_cfi_restore 9*8, r8 .endif - .if \rstor_rax - movq_cfi_restore 4*8, rax + movq_cfi_restore 10*8, rax .endif - .if \rstor_rcx - movq_cfi_restore 5*8, rcx + movq_cfi_restore 11*8, rcx .endif - .if \rstor_rdx - movq_cfi_restore 6*8, rdx - .endif - - movq_cfi_restore 7*8, rsi - movq_cfi_restore 8*8, rdi - - .if ARG_SKIP+\addskip > 0 - addq $ARG_SKIP+\addskip, %rsp - CFI_ADJUST_CFA_OFFSET -(ARG_SKIP+\addskip) + movq_cfi_restore 12*8, rdx .endif + movq_cfi_restore 13*8, rsi + movq_cfi_restore 14*8, rdi .endm - - .macro LOAD_ARGS offset, skiprax=0 - movq \offset(%rsp), %r11 - movq \offset+8(%rsp), %r10 - movq \offset+16(%rsp), %r9 - movq \offset+24(%rsp), %r8 - movq \offset+40(%rsp), %rcx - movq \offset+48(%rsp), %rdx - movq \offset+56(%rsp), %rsi - movq \offset+64(%rsp), %rdi - .if \skiprax - .else - movq \offset+72(%rsp), %rax - .endif + .macro RESTORE_C_REGS + RESTORE_C_REGS_HELPER 1,1,1,1,1 .endm - -#define REST_SKIP (6*8) - - .macro SAVE_REST - subq $REST_SKIP, %rsp - CFI_ADJUST_CFA_OFFSET REST_SKIP - movq_cfi rbx, 5*8 - movq_cfi rbp, 4*8 - movq_cfi r12, 3*8 - movq_cfi r13, 2*8 - movq_cfi r14, 1*8 - movq_cfi r15, 0*8 + .macro RESTORE_C_REGS_EXCEPT_RAX + RESTORE_C_REGS_HELPER 0,1,1,1,1 .endm - - .macro RESTORE_REST - movq_cfi_restore 0*8, r15 - movq_cfi_restore 1*8, r14 - movq_cfi_restore 2*8, r13 - movq_cfi_restore 3*8, r12 - movq_cfi_restore 4*8, rbp - movq_cfi_restore 5*8, rbx - addq $REST_SKIP, %rsp - CFI_ADJUST_CFA_OFFSET -(REST_SKIP) + .macro RESTORE_C_REGS_EXCEPT_RCX + RESTORE_C_REGS_HELPER 1,0,1,1,1 .endm - - .macro SAVE_ALL - SAVE_ARGS - SAVE_REST + .macro RESTORE_C_REGS_EXCEPT_R11 + RESTORE_C_REGS_HELPER 1,1,0,1,1 + .endm + .macro RESTORE_C_REGS_EXCEPT_RCX_R11 + RESTORE_C_REGS_HELPER 1,0,0,1,1 + .endm + .macro RESTORE_RSI_RDI + RESTORE_C_REGS_HELPER 0,0,0,0,0 + .endm + .macro RESTORE_RSI_RDI_RDX + RESTORE_C_REGS_HELPER 0,0,0,0,1 .endm - .macro RESTORE_ALL addskip=0 - RESTORE_REST - RESTORE_ARGS 1, \addskip + .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0 + addq $15*8+\addskip, %rsp + CFI_ADJUST_CFA_OFFSET -(15*8+\addskip) .endm .macro icebp @@ -210,37 +215,23 @@ For 32-bit we have the following conventions - kernel is built with */ .macro SAVE_ALL - pushl_cfi %eax - CFI_REL_OFFSET eax, 0 - pushl_cfi %ebp - CFI_REL_OFFSET ebp, 0 - pushl_cfi %edi - CFI_REL_OFFSET edi, 0 - pushl_cfi %esi - CFI_REL_OFFSET esi, 0 - pushl_cfi %edx - CFI_REL_OFFSET edx, 0 - pushl_cfi %ecx - CFI_REL_OFFSET ecx, 0 - pushl_cfi %ebx - CFI_REL_OFFSET ebx, 0 + pushl_cfi_reg eax + pushl_cfi_reg ebp + pushl_cfi_reg edi + pushl_cfi_reg esi + pushl_cfi_reg edx + pushl_cfi_reg ecx + pushl_cfi_reg ebx .endm .macro RESTORE_ALL - popl_cfi %ebx - CFI_RESTORE ebx - popl_cfi %ecx - CFI_RESTORE ecx - popl_cfi %edx - CFI_RESTORE edx - popl_cfi %esi - CFI_RESTORE esi - popl_cfi %edi - CFI_RESTORE edi - popl_cfi %ebp - CFI_RESTORE ebp - popl_cfi %eax - CFI_RESTORE eax + popl_cfi_reg ebx + popl_cfi_reg ecx + popl_cfi_reg edx + popl_cfi_reg esi + popl_cfi_reg edi + popl_cfi_reg ebp + popl_cfi_reg eax .endm #endif /* CONFIG_X86_64 */ diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h index 59c6c401f79f..acdee09228b3 100644 --- a/arch/x86/include/asm/compat.h +++ b/arch/x86/include/asm/compat.h @@ -301,7 +301,7 @@ static inline void __user *arch_compat_alloc_user_space(long len) sp = task_pt_regs(current)->sp; } else { /* -128 for the x32 ABI redzone */ - sp = this_cpu_read(old_rsp) - 128; + sp = task_pt_regs(current)->sp - 128; } return (void __user *)round_down(sp - len, 16); diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h index 90a54851aedc..0f7a5a1a8db2 100644 --- a/arch/x86/include/asm/cpufeature.h +++ b/arch/x86/include/asm/cpufeature.h @@ -231,6 +231,7 @@ #define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ #define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ #define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ +#define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */ #define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ #define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ #define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ @@ -418,6 +419,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) " .word %P0\n" /* 1: do replace */ " .byte 2b - 1b\n" /* source len */ " .byte 0\n" /* replacement len */ + " .byte 0\n" /* pad len */ ".previous\n" /* skipping size check since replacement size = 0 */ : : "i" (X86_FEATURE_ALWAYS) : : t_warn); @@ -432,6 +434,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) " .word %P0\n" /* feature bit */ " .byte 2b - 1b\n" /* source len */ " .byte 0\n" /* replacement len */ + " .byte 0\n" /* pad len */ ".previous\n" /* skipping size check since replacement size = 0 */ : : "i" (bit) : : t_no); @@ -457,6 +460,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) " .word %P1\n" /* feature bit */ " .byte 2b - 1b\n" /* source len */ " .byte 4f - 3f\n" /* replacement len */ + " .byte 0\n" /* pad len */ ".previous\n" ".section .discard,\"aw\",@progbits\n" " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ @@ -483,31 +487,30 @@ static __always_inline __pure bool __static_cpu_has(u16 bit) static __always_inline __pure bool _static_cpu_has_safe(u16 bit) { #ifdef CC_HAVE_ASM_GOTO -/* - * We need to spell the jumps to the compiler because, depending on the offset, - * the replacement jump can be bigger than the original jump, and this we cannot - * have. Thus, we force the jump to the widest, 4-byte, signed relative - * offset even though the last would often fit in less bytes. - */ - asm_volatile_goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n" + asm_volatile_goto("1: jmp %l[t_dynamic]\n" "2:\n" + ".skip -(((5f-4f) - (2b-1b)) > 0) * " + "((5f-4f) - (2b-1b)),0x90\n" + "3:\n" ".section .altinstructions,\"a\"\n" " .long 1b - .\n" /* src offset */ - " .long 3f - .\n" /* repl offset */ + " .long 4f - .\n" /* repl offset */ " .word %P1\n" /* always replace */ - " .byte 2b - 1b\n" /* src len */ - " .byte 4f - 3f\n" /* repl len */ + " .byte 3b - 1b\n" /* src len */ + " .byte 5f - 4f\n" /* repl len */ + " .byte 3b - 2b\n" /* pad len */ ".previous\n" ".section .altinstr_replacement,\"ax\"\n" - "3: .byte 0xe9\n .long %l[t_no] - 2b\n" - "4:\n" + "4: jmp %l[t_no]\n" + "5:\n" ".previous\n" ".section .altinstructions,\"a\"\n" " .long 1b - .\n" /* src offset */ " .long 0\n" /* no replacement */ " .word %P0\n" /* feature bit */ - " .byte 2b - 1b\n" /* src len */ + " .byte 3b - 1b\n" /* src len */ " .byte 0\n" /* repl len */ + " .byte 0\n" /* pad len */ ".previous\n" : : "i" (bit), "i" (X86_FEATURE_ALWAYS) : : t_dynamic, t_no); @@ -527,6 +530,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) " .word %P2\n" /* always replace */ " .byte 2b - 1b\n" /* source len */ " .byte 4f - 3f\n" /* replacement len */ + " .byte 0\n" /* pad len */ ".previous\n" ".section .discard,\"aw\",@progbits\n" " .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */ @@ -541,6 +545,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit) " .word %P1\n" /* feature bit */ " .byte 4b - 3b\n" /* src len */ " .byte 6f - 5f\n" /* repl len */ + " .byte 0\n" /* pad len */ ".previous\n" ".section .discard,\"aw\",@progbits\n" " .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */ diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h index a94b82e8f156..a0bf89fd2647 100644 --- a/arch/x86/include/asm/desc.h +++ b/arch/x86/include/asm/desc.h @@ -376,11 +376,16 @@ static inline void _set_gate(int gate, unsigned type, void *addr, * Pentium F0 0F bugfix can have resulted in the mapped * IDT being write-protected. */ -#define set_intr_gate(n, addr) \ +#define set_intr_gate_notrace(n, addr) \ do { \ BUG_ON((unsigned)n > 0xFF); \ _set_gate(n, GATE_INTERRUPT, (void *)addr, 0, 0, \ __KERNEL_CS); \ + } while (0) + +#define set_intr_gate(n, addr) \ + do { \ + set_intr_gate_notrace(n, addr); \ _trace_set_gate(n, GATE_INTERRUPT, (void *)trace_##addr,\ 0, 0, __KERNEL_CS); \ } while (0) diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h index f6f15986df6c..de1cdaf4d743 100644 --- a/arch/x86/include/asm/dwarf2.h +++ b/arch/x86/include/asm/dwarf2.h @@ -86,11 +86,23 @@ CFI_ADJUST_CFA_OFFSET 8 .endm + .macro pushq_cfi_reg reg + pushq %\reg + CFI_ADJUST_CFA_OFFSET 8 + CFI_REL_OFFSET \reg, 0 + .endm + .macro popq_cfi reg popq \reg CFI_ADJUST_CFA_OFFSET -8 .endm + .macro popq_cfi_reg reg + popq %\reg + CFI_ADJUST_CFA_OFFSET -8 + CFI_RESTORE \reg + .endm + .macro pushfq_cfi pushfq CFI_ADJUST_CFA_OFFSET 8 @@ -116,11 +128,23 @@ CFI_ADJUST_CFA_OFFSET 4 .endm + .macro pushl_cfi_reg reg + pushl %\reg + CFI_ADJUST_CFA_OFFSET 4 + CFI_REL_OFFSET \reg, 0 + .endm + .macro popl_cfi reg popl \reg CFI_ADJUST_CFA_OFFSET -4 .endm + .macro popl_cfi_reg reg + popl %\reg + CFI_ADJUST_CFA_OFFSET -4 + CFI_RESTORE \reg + .endm + .macro pushfl_cfi pushfl CFI_ADJUST_CFA_OFFSET 4 diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h index 47f29b1d1846..e7814b74caf8 100644 --- a/arch/x86/include/asm/insn.h +++ b/arch/x86/include/asm/insn.h @@ -69,7 +69,7 @@ struct insn { const insn_byte_t *next_byte; }; -#define MAX_INSN_SIZE 16 +#define MAX_INSN_SIZE 15 #define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6) #define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3) diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h index 0a8b519226b8..021bee9b86b6 100644 --- a/arch/x86/include/asm/irqflags.h +++ b/arch/x86/include/asm/irqflags.h @@ -171,9 +171,9 @@ static inline int arch_irqs_disabled(void) #define ARCH_LOCKDEP_SYS_EXIT_IRQ \ TRACE_IRQS_ON; \ sti; \ - SAVE_REST; \ + SAVE_EXTRA_REGS; \ LOCKDEP_SYS_EXIT; \ - RESTORE_REST; \ + RESTORE_EXTRA_REGS; \ cli; \ TRACE_IRQS_OFF; diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h index ec1c93588cef..572099710ba2 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -209,9 +209,24 @@ struct x86_hw_tss { unsigned short back_link, __blh; unsigned long sp0; unsigned short ss0, __ss0h; - unsigned long sp1; - /* ss1 caches MSR_IA32_SYSENTER_CS: */ - unsigned short ss1, __ss1h; + + /* + * We don't use ring 1, so sp1 and ss1 are convenient scratch + * spaces in the same cacheline as sp0. We use them to cache + * some MSR values to avoid unnecessary wrmsr instructions. + * + * We use SYSENTER_ESP to find sp0 and for the NMI emergency + * stack, but we need to context switch it because we do + * horrible things to the kernel stack in vm86 mode. + * + * We use SYSENTER_CS to disable sysenter in vm86 mode to avoid + * corrupting the stack if we went through the sysenter path + * from vm86 mode. + */ + unsigned long sp1; /* MSR_IA32_SYSENTER_ESP */ + unsigned short ss1; /* MSR_IA32_SYSENTER_CS */ + + unsigned short __ss1h; unsigned long sp2; unsigned short ss2, __ss2h; unsigned long __cr3; @@ -276,13 +291,17 @@ struct tss_struct { unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; /* - * .. and then another 0x100 bytes for the emergency kernel stack: + * Space for the temporary SYSENTER stack: */ - unsigned long stack[64]; + unsigned long SYSENTER_stack[64]; } ____cacheline_aligned; -DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss); +DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); + +#ifdef CONFIG_X86_32 +DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); +#endif /* * Save the original ist values for checking stack pointers during debugging @@ -474,7 +493,6 @@ struct thread_struct { #ifdef CONFIG_X86_32 unsigned long sysenter_cs; #else - unsigned long usersp; /* Copy from PDA */ unsigned short es; unsigned short ds; unsigned short fsindex; @@ -564,6 +582,16 @@ static inline void native_swapgs(void) #endif } +static inline unsigned long current_top_of_stack(void) +{ +#ifdef CONFIG_X86_64 + return this_cpu_read_stable(cpu_tss.x86_tss.sp0); +#else + /* sp0 on x86_32 is special in and around vm86 mode. */ + return this_cpu_read_stable(cpu_current_top_of_stack); +#endif +} + #ifdef CONFIG_PARAVIRT #include <asm/paravirt.h> #else @@ -761,10 +789,10 @@ extern char ignore_fpu_irq; #define ARCH_HAS_SPINLOCK_PREFETCH #ifdef CONFIG_X86_32 -# define BASE_PREFETCH ASM_NOP4 +# define BASE_PREFETCH "" # define ARCH_HAS_PREFETCH #else -# define BASE_PREFETCH "prefetcht0 (%1)" +# define BASE_PREFETCH "prefetcht0 %P1" #endif /* @@ -775,10 +803,9 @@ extern char ignore_fpu_irq; */ static inline void prefetch(const void *x) { - alternative_input(BASE_PREFETCH, - "prefetchnta (%1)", + alternative_input(BASE_PREFETCH, "prefetchnta %P1", X86_FEATURE_XMM, - "r" (x)); + "m" (*(const char *)x)); } /* @@ -788,10 +815,9 @@ static inline void prefetch(const void *x) */ static inline void prefetchw(const void *x) { - alternative_input(BASE_PREFETCH, - "prefetchw (%1)", - X86_FEATURE_3DNOW, - "r" (x)); + alternative_input(BASE_PREFETCH, "prefetchw %P1", + X86_FEATURE_3DNOWPREFETCH, + "m" (*(const char *)x)); } static inline void spin_lock_prefetch(const void *x) @@ -799,6 +825,9 @@ static inline void spin_lock_prefetch(const void *x) prefetchw(x); } +#define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \ + TOP_OF_KERNEL_STACK_PADDING) + #ifdef CONFIG_X86_32 /* * User space process size: 3GB (default). @@ -809,39 +838,16 @@ static inline void spin_lock_prefetch(const void *x) #define STACK_TOP_MAX STACK_TOP #define INIT_THREAD { \ - .sp0 = sizeof(init_stack) + (long)&init_stack, \ + .sp0 = TOP_OF_INIT_STACK, \ .vm86_info = NULL, \ .sysenter_cs = __KERNEL_CS, \ .io_bitmap_ptr = NULL, \ } -/* - * Note that the .io_bitmap member must be extra-big. This is because - * the CPU will access an additional byte beyond the end of the IO - * permission bitmap. The extra byte must be all 1 bits, and must - * be within the limit. - */ -#define INIT_TSS { \ - .x86_tss = { \ - .sp0 = sizeof(init_stack) + (long)&init_stack, \ - .ss0 = __KERNEL_DS, \ - .ss1 = __KERNEL_CS, \ - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \ - }, \ - .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \ -} - extern unsigned long thread_saved_pc(struct task_struct *tsk); -#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long)) -#define KSTK_TOP(info) \ -({ \ - unsigned long *__ptr = (unsigned long *)(info); \ - (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \ -}) - /* - * The below -8 is to reserve 8 bytes on top of the ring0 stack. + * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack. * This is necessary to guarantee that the entire "struct pt_regs" * is accessible even if the CPU haven't stored the SS/ESP registers * on the stack (interrupt gate does not save these registers @@ -850,11 +856,11 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); * "struct pt_regs" is possible, but they may contain the * completely wrong values. */ -#define task_pt_regs(task) \ -({ \ - struct pt_regs *__regs__; \ - __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \ - __regs__ - 1; \ +#define task_pt_regs(task) \ +({ \ + unsigned long __ptr = (unsigned long)task_stack_page(task); \ + __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \ + ((struct pt_regs *)__ptr) - 1; \ }) #define KSTK_ESP(task) (task_pt_regs(task)->sp) @@ -886,11 +892,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); #define STACK_TOP_MAX TASK_SIZE_MAX #define INIT_THREAD { \ - .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ -} - -#define INIT_TSS { \ - .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \ + .sp0 = TOP_OF_INIT_STACK \ } /* @@ -902,11 +904,6 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk); #define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) extern unsigned long KSTK_ESP(struct task_struct *task); -/* - * User space RSP while inside the SYSCALL fast path - */ -DECLARE_PER_CPU(unsigned long, old_rsp); - #endif /* CONFIG_X86_64 */ extern void start_thread(struct pt_regs *regs, unsigned long new_ip, diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h index 86fc2bb82287..83b874da2762 100644 --- a/arch/x86/include/asm/ptrace.h +++ b/arch/x86/include/asm/ptrace.h @@ -31,13 +31,17 @@ struct pt_regs { #else /* __i386__ */ struct pt_regs { +/* + * C ABI says these regs are callee-preserved. They aren't saved on kernel entry + * unless syscall needs a complete, fully filled "struct pt_regs". + */ unsigned long r15; unsigned long r14; unsigned long r13; unsigned long r12; unsigned long bp; unsigned long bx; -/* arguments: non interrupts/non tracing syscalls only save up to here*/ +/* These regs are callee-clobbered. Always saved on kernel entry. */ unsigned long r11; unsigned long r10; unsigned long r9; @@ -47,9 +51,12 @@ struct pt_regs { unsigned long dx; unsigned long si; unsigned long di; +/* + * On syscall entry, this is syscall#. On CPU exception, this is error code. + * On hw interrupt, it's IRQ number: + */ unsigned long orig_ax; -/* end of arguments */ -/* cpu exception frame or undefined */ +/* Return frame for iretq */ unsigned long ip; unsigned long cs; unsigned long flags; @@ -138,12 +145,8 @@ static inline bool user_64bit_mode(struct pt_regs *regs) #endif } -#define current_user_stack_pointer() this_cpu_read(old_rsp) -/* ia32 vs. x32 difference */ -#define compat_user_stack_pointer() \ - (test_thread_flag(TIF_IA32) \ - ? current_pt_regs()->sp \ - : this_cpu_read(old_rsp)) +#define current_user_stack_pointer() current_pt_regs()->sp +#define compat_user_stack_pointer() current_pt_regs()->sp #endif #ifdef CONFIG_X86_32 @@ -248,7 +251,7 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs, */ #define arch_ptrace_stop_needed(code, info) \ ({ \ - set_thread_flag(TIF_NOTIFY_RESUME); \ + force_iret(); \ false; \ }) diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h index 9dfce4e0417d..6fe6b182c998 100644 --- a/arch/x86/include/asm/sigcontext.h +++ b/arch/x86/include/asm/sigcontext.h @@ -57,9 +57,9 @@ struct sigcontext { unsigned long ip; unsigned long flags; unsigned short cs; - unsigned short gs; - unsigned short fs; - unsigned short __pad0; + unsigned short __pad2; /* Was called gs, but was always zero. */ + unsigned short __pad1; /* Was called fs, but was always zero. */ + unsigned short ss; unsigned long err; unsigned long trapno; unsigned long oldmask; diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h index 8d3120f4e270..ba665ebd17bb 100644 --- a/arch/x86/include/asm/smap.h +++ b/arch/x86/include/asm/smap.h @@ -27,23 +27,11 @@ #ifdef CONFIG_X86_SMAP -#define ASM_CLAC \ - 661: ASM_NOP3 ; \ - .pushsection .altinstr_replacement, "ax" ; \ - 662: __ASM_CLAC ; \ - .popsection ; \ - .pushsection .altinstructions, "a" ; \ - altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \ - .popsection - -#define ASM_STAC \ - 661: ASM_NOP3 ; \ - .pushsection .altinstr_replacement, "ax" ; \ - 662: __ASM_STAC ; \ - .popsection ; \ - .pushsection .altinstructions, "a" ; \ - altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \ - .popsection +#define ASM_CLAC \ + ALTERNATIVE "", __stringify(__ASM_CLAC), X86_FEATURE_SMAP + +#define ASM_STAC \ + ALTERNATIVE "", __stringify(__ASM_STAC), X86_FEATURE_SMAP #else /* CONFIG_X86_SMAP */ @@ -61,20 +49,20 @@ static __always_inline void clac(void) { /* Note: a barrier is implicit in alternative() */ - alternative(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP); + alternative("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP); } static __always_inline void stac(void) { /* Note: a barrier is implicit in alternative() */ - alternative(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP); + alternative("", __stringify(__ASM_STAC), X86_FEATURE_SMAP); } /* These macros can be used in asm() statements */ #define ASM_CLAC \ - ALTERNATIVE(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP) + ALTERNATIVE("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP) #define ASM_STAC \ - ALTERNATIVE(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP) + ALTERNATIVE("", __stringify(__ASM_STAC), X86_FEATURE_SMAP) #else /* CONFIG_X86_SMAP */ diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h index 6a4b00fafb00..2ec1a5392542 100644 --- a/arch/x86/include/asm/special_insns.h +++ b/arch/x86/include/asm/special_insns.h @@ -4,6 +4,8 @@ #ifdef __KERNEL__ +#include <asm/nops.h> + static inline void native_clts(void) { asm volatile("clts"); @@ -199,6 +201,14 @@ static inline void clflushopt(volatile void *__p) "+m" (*(volatile char __force *)__p)); } +static inline void pcommit_sfence(void) +{ + alternative(ASM_NOP7, + ".byte 0x66, 0x0f, 0xae, 0xf8\n\t" /* pcommit */ + "sfence", + X86_FEATURE_PCOMMIT); +} + #define nop() asm volatile ("nop") diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 1d4e4f279a32..0abf7ab20ce2 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -13,6 +13,33 @@ #include <asm/types.h> /* + * TOP_OF_KERNEL_STACK_PADDING is a number of unused bytes that we + * reserve at the top of the kernel stack. We do it because of a nasty + * 32-bit corner case. On x86_32, the hardware stack frame is + * variable-length. Except for vm86 mode, struct pt_regs assumes a + * maximum-length frame. If we enter from CPL 0, the top 8 bytes of + * pt_regs don't actually exist. Ordinarily this doesn't matter, but it + * does in at least one case: + * + * If we take an NMI early enough in SYSENTER, then we can end up with + * pt_regs that extends above sp0. On the way out, in the espfix code, + * we can read the saved SS value, but that value will be above sp0. + * Without this offset, that can result in a page fault. (We are + * careful that, in this case, the value we read doesn't matter.) + * + * In vm86 mode, the hardware frame is much longer still, but we neither + * access the extra members from NMI context, nor do we write such a + * frame at sp0 at all. + * + * x86_64 has a fixed-length stack frame. + */ +#ifdef CONFIG_X86_32 +# define TOP_OF_KERNEL_STACK_PADDING 8 +#else +# define TOP_OF_KERNEL_STACK_PADDING 0 +#endif + +/* * low level task data that entry.S needs immediate access to * - this struct should fit entirely inside of one cache line * - this struct shares the supervisor stack pages @@ -158,10 +185,7 @@ DECLARE_PER_CPU(unsigned long, kernel_stack); static inline struct thread_info *current_thread_info(void) { - struct thread_info *ti; - ti = (void *)(this_cpu_read_stable(kernel_stack) + - KERNEL_STACK_OFFSET - THREAD_SIZE); - return ti; + return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE); } static inline unsigned long current_stack_pointer(void) @@ -236,6 +260,16 @@ static inline bool is_ia32_task(void) #endif return false; } + +/* + * Force syscall return via IRET by making it look as if there was + * some work pending. IRET is our most capable (but slowest) syscall + * return path, which is able to restore modified SS, CS and certain + * EFLAGS values that other (fast) syscall return instructions + * are not able to restore properly. + */ +#define force_iret() set_thread_flag(TIF_NOTIFY_RESUME) + #endif /* !__ASSEMBLY__ */ #ifndef __ASSEMBLY__ diff --git a/arch/x86/include/uapi/asm/ptrace-abi.h b/arch/x86/include/uapi/asm/ptrace-abi.h index 7b0a55a88851..580aee3072e0 100644 --- a/arch/x86/include/uapi/asm/ptrace-abi.h +++ b/arch/x86/include/uapi/asm/ptrace-abi.h @@ -25,13 +25,17 @@ #else /* __i386__ */ #if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS) +/* + * C ABI says these regs are callee-preserved. They aren't saved on kernel entry + * unless syscall needs a complete, fully filled "struct pt_regs". + */ #define R15 0 #define R14 8 #define R13 16 #define R12 24 #define RBP 32 #define RBX 40 -/* arguments: interrupts/non tracing syscalls only save up to here*/ +/* These regs are callee-clobbered. Always saved on kernel entry. */ #define R11 48 #define R10 56 #define R9 64 @@ -41,15 +45,17 @@ #define RDX 96 #define RSI 104 #define RDI 112 -#define ORIG_RAX 120 /* = ERROR */ -/* end of arguments */ -/* cpu exception frame or undefined in case of fast syscall. */ +/* + * On syscall entry, this is syscall#. On CPU exception, this is error code. + * On hw interrupt, it's IRQ number: + */ +#define ORIG_RAX 120 +/* Return frame for iretq */ #define RIP 128 #define CS 136 #define EFLAGS 144 #define RSP 152 #define SS 160 -#define ARGOFFSET R11 #endif /* __ASSEMBLY__ */ /* top of stack page */ diff --git a/arch/x86/include/uapi/asm/ptrace.h b/arch/x86/include/uapi/asm/ptrace.h index ac4b9aa4d999..bc16115af39b 100644 --- a/arch/x86/include/uapi/asm/ptrace.h +++ b/arch/x86/include/uapi/asm/ptrace.h @@ -41,13 +41,17 @@ struct pt_regs { #ifndef __KERNEL__ struct pt_regs { +/* + * C ABI says these regs are callee-preserved. They aren't saved on kernel entry + * unless syscall needs a complete, fully filled "struct pt_regs". + */ unsigned long r15; unsigned long r14; unsigned long r13; unsigned long r12; unsigned long rbp; unsigned long rbx; -/* arguments: non interrupts/non tracing syscalls only save up to here*/ +/* These regs are callee-clobbered. Always saved on kernel entry. */ unsigned long r11; unsigned long r10; unsigned long r9; @@ -57,9 +61,12 @@ struct pt_regs { unsigned long rdx; unsigned long rsi; unsigned long rdi; +/* + * On syscall entry, this is syscall#. On CPU exception, this is error code. + * On hw interrupt, it's IRQ number: + */ unsigned long orig_rax; -/* end of arguments */ -/* cpu exception frame or undefined */ +/* Return frame for iretq */ unsigned long rip; unsigned long cs; unsigned long eflags; diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h index d8b9f9081e86..16dc4e8a2cd3 100644 --- a/arch/x86/include/uapi/asm/sigcontext.h +++ b/arch/x86/include/uapi/asm/sigcontext.h @@ -177,9 +177,24 @@ struct sigcontext { __u64 rip; __u64 eflags; /* RFLAGS */ __u16 cs; - __u16 gs; - __u16 fs; - __u16 __pad0; + + /* + * Prior to 2.5.64 ("[PATCH] x86-64 updates for 2.5.64-bk3"), + * Linux saved and restored fs and gs in these slots. This + * was counterproductive, as fsbase and gsbase were never + * saved, so arch_prctl was presumably unreliable. + * + * If these slots are ever needed for any other purpose, there + * is some risk that very old 64-bit binaries could get + * confused. I doubt that many such binaries still work, + * though, since the same patch in 2.5.64 also removed the + * 64-bit set_thread_area syscall, so it appears that there is + * no TLS API that works in both pre- and post-2.5.64 kernels. + */ + __u16 __pad2; /* Was gs. */ + __u16 __pad1; /* Was fs. */ + + __u16 ss; __u64 err; __u64 trapno; __u64 oldmask; diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index cdb1b70ddad0..c887cd944f0c 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -32,6 +32,7 @@ obj-$(CONFIG_X86_32) += i386_ksyms_32.o obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o obj-$(CONFIG_X86_64) += mcount_64.o obj-y += syscall_$(BITS).o vsyscall_gtod.o +obj-$(CONFIG_IA32_EMULATION) += syscall_32.o obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o obj-$(CONFIG_SYSFS) += ksysfs.o diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 703130f469ec..af397cc98d05 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -52,10 +52,25 @@ static int __init setup_noreplace_paravirt(char *str) __setup("noreplace-paravirt", setup_noreplace_paravirt); #endif -#define DPRINTK(fmt, ...) \ -do { \ - if (debug_alternative) \ - printk(KERN_DEBUG fmt, ##__VA_ARGS__); \ +#define DPRINTK(fmt, args...) \ +do { \ + if (debug_alternative) \ + printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \ +} while (0) + +#define DUMP_BYTES(buf, len, fmt, args...) \ +do { \ + if (unlikely(debug_alternative)) { \ + int j; \ + \ + if (!(len)) \ + break; \ + \ + printk(KERN_DEBUG fmt, ##args); \ + for (j = 0; j < (len) - 1; j++) \ + printk(KERN_CONT "%02hhx ", buf[j]); \ + printk(KERN_CONT "%02hhx\n", buf[j]); \ + } \ } while (0) /* @@ -243,12 +258,86 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; extern s32 __smp_locks[], __smp_locks_end[]; void *text_poke_early(void *addr, const void *opcode, size_t len); -/* Replace instructions with better alternatives for this CPU type. - This runs before SMP is initialized to avoid SMP problems with - self modifying code. This implies that asymmetric systems where - APs have less capabilities than the boot processor are not handled. - Tough. Make sure you disable such features by hand. */ +/* + * Are we looking at a near JMP with a 1 or 4-byte displacement. + */ +static inline bool is_jmp(const u8 opcode) +{ + return opcode == 0xeb || opcode == 0xe9; +} + +static void __init_or_module +recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf) +{ + u8 *next_rip, *tgt_rip; + s32 n_dspl, o_dspl; + int repl_len; + + if (a->replacementlen != 5) + return; + + o_dspl = *(s32 *)(insnbuf + 1); + + /* next_rip of the replacement JMP */ + next_rip = repl_insn + a->replacementlen; + /* target rip of the replacement JMP */ + tgt_rip = next_rip + o_dspl; + n_dspl = tgt_rip - orig_insn; + + DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl); + + if (tgt_rip - orig_insn >= 0) { + if (n_dspl - 2 <= 127) + goto two_byte_jmp; + else + goto five_byte_jmp; + /* negative offset */ + } else { + if (((n_dspl - 2) & 0xff) == (n_dspl - 2)) + goto two_byte_jmp; + else + goto five_byte_jmp; + } + +two_byte_jmp: + n_dspl -= 2; + + insnbuf[0] = 0xeb; + insnbuf[1] = (s8)n_dspl; + add_nops(insnbuf + 2, 3); + + repl_len = 2; + goto done; + +five_byte_jmp: + n_dspl -= 5; + insnbuf[0] = 0xe9; + *(s32 *)&insnbuf[1] = n_dspl; + + repl_len = 5; + +done: + + DPRINTK("final displ: 0x%08x, JMP 0x%lx", + n_dspl, (unsigned long)orig_insn + n_dspl + repl_len); +} + +static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr) +{ + add_nops(instr + (a->instrlen - a->padlen), a->padlen); + + DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ", + instr, a->instrlen - a->padlen, a->padlen); +} + +/* + * Replace instructions with better alternatives for this CPU type. This runs + * before SMP is initialized to avoid SMP problems with self modifying code. + * This implies that asymmetric systems where APs have less capabilities than + * the boot processor are not handled. Tough. Make sure you disable such + * features by hand. + */ void __init_or_module apply_alternatives(struct alt_instr *start, struct alt_instr *end) { @@ -256,10 +345,10 @@ void __init_or_module apply_alternatives(struct alt_instr *start, u8 *instr, *replacement; u8 insnbuf[MAX_PATCH_LEN]; - DPRINTK("%s: alt table %p -> %p\n", __func__, start, end); + DPRINTK("alt table %p -> %p", start, end); /* * The scan order should be from start to end. A later scanned - * alternative code can overwrite a previous scanned alternative code. + * alternative code can overwrite previously scanned alternative code. * Some kernel functions (e.g. memcpy, memset, etc) use this order to * patch code. * @@ -267,29 +356,54 @@ void __init_or_module apply_alternatives(struct alt_instr *start, * order. */ for (a = start; a < end; a++) { + int insnbuf_sz = 0; + instr = (u8 *)&a->instr_offset + a->instr_offset; replacement = (u8 *)&a->repl_offset + a->repl_offset; - BUG_ON(a->replacementlen > a->instrlen); BUG_ON(a->instrlen > sizeof(insnbuf)); BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32); - if (!boot_cpu_has(a->cpuid)) + if (!boot_cpu_has(a->cpuid)) { + if (a->padlen > 1) + optimize_nops(a, instr); + continue; + } + + DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d)", + a->cpuid >> 5, + a->cpuid & 0x1f, + instr, a->instrlen, + replacement, a->replacementlen); + + DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr); + DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement); memcpy(insnbuf, replacement, a->replacementlen); + insnbuf_sz = a->replacementlen; /* 0xe8 is a relative jump; fix the offset. */ - if (*insnbuf == 0xe8 && a->replacementlen == 5) - *(s32 *)(insnbuf + 1) += replacement - instr; + if (*insnbuf == 0xe8 && a->replacementlen == 5) { + *(s32 *)(insnbuf + 1) += replacement - instr; + DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx", + *(s32 *)(insnbuf + 1), + (unsigned long)instr + *(s32 *)(insnbuf + 1) + 5); + } + + if (a->replacementlen && is_jmp(replacement[0])) + recompute_jump(a, instr, replacement, insnbuf); - add_nops(insnbuf + a->replacementlen, - a->instrlen - a->replacementlen); + if (a->instrlen > a->replacementlen) { + add_nops(insnbuf + a->replacementlen, + a->instrlen - a->replacementlen); + insnbuf_sz += a->instrlen - a->replacementlen; + } + DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr); - text_poke_early(instr, insnbuf, a->instrlen); + text_poke_early(instr, insnbuf, insnbuf_sz); } } #ifdef CONFIG_SMP - static void alternatives_smp_lock(const s32 *start, const s32 *end, u8 *text, u8 *text_end) { @@ -371,8 +485,8 @@ void __init_or_module alternatives_smp_module_add(struct module *mod, smp->locks_end = locks_end; smp->text = text; smp->text_end = text_end; - DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n", - __func__, smp->locks, smp->locks_end, + DPRINTK("locks %p -> %p, text %p -> %p, name %s\n", + smp->locks, smp->locks_end, smp->text, smp->text_end, smp->name); list_add_tail(&smp->next, &smp_alt_modules); @@ -440,7 +554,7 @@ int alternatives_text_reserved(void *start, void *end) return 0; } -#endif +#endif /* CONFIG_SMP */ #ifdef CONFIG_PARAVIRT void __init_or_module apply_paravirt(struct paravirt_patch_site *start, diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c index 3b3b9d33ac1d..47703aed74cf 100644 --- a/arch/x86/kernel/asm-offsets_32.c +++ b/arch/x86/kernel/asm-offsets_32.c @@ -68,7 +68,7 @@ void foo(void) /* Offset from the sysenter stack to tss.sp0 */ DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - - sizeof(struct tss_struct)); + offsetofend(struct tss_struct, SYSENTER_stack)); #if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE) BLANK(); diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c index fdcbb4d27c9f..5ce6f2da8763 100644 --- a/arch/x86/kernel/asm-offsets_64.c +++ b/arch/x86/kernel/asm-offsets_64.c @@ -81,6 +81,7 @@ int main(void) #undef ENTRY OFFSET(TSS_ist, tss_struct, x86_tss.ist); + OFFSET(TSS_sp0, tss_struct, x86_tss.sp0); BLANK(); DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1); diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c index a220239cea65..dd9e50500297 100644 --- a/arch/x86/kernel/cpu/amd.c +++ b/arch/x86/kernel/cpu/amd.c @@ -711,6 +711,11 @@ static void init_amd(struct cpuinfo_x86 *c) set_cpu_bug(c, X86_BUG_AMD_APIC_C1E); rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy); + + /* 3DNow or LM implies PREFETCHW */ + if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH)) + if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM)) + set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH); } #ifdef CONFIG_X86_32 diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c index 2346c95c6ab1..d79f139871e0 100644 --- a/arch/x86/kernel/cpu/common.c +++ b/arch/x86/kernel/cpu/common.c @@ -960,37 +960,53 @@ static void identify_cpu(struct cpuinfo_x86 *c) } #ifdef CONFIG_X86_64 -#ifdef CONFIG_IA32_EMULATION +# ifdef CONFIG_IA32_EMULATION /* May not be __init: called during resume */ static void syscall32_cpu_init(void) { - /* Load these always in case some future AMD CPU supports - SYSENTER from compat mode too. */ + /* + * Always load these, in case some future 64-bit CPU supports + * SYSENTER from compat mode too: + */ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target); wrmsrl(MSR_CSTAR, ia32_cstar_target); } -#endif /* CONFIG_IA32_EMULATION */ -#endif /* CONFIG_X86_64 */ +# endif +#endif +/* + * Set up the CPU state needed to execute SYSENTER/SYSEXIT instructions + * on 32-bit kernels: + */ #ifdef CONFIG_X86_32 void enable_sep_cpu(void) { - int cpu = get_cpu(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); + struct tss_struct *tss; + int cpu; - if (!boot_cpu_has(X86_FEATURE_SEP)) { - put_cpu(); - return; - } + cpu = get_cpu(); + tss = &per_cpu(cpu_tss, cpu); + + if (!boot_cpu_has(X86_FEATURE_SEP)) + goto out; + + /* + * The struct::SS1 and tss_struct::SP1 fields are not used by the hardware, + * we cache the SYSENTER CS and ESP values there for easy access: + */ tss->x86_tss.ss1 = __KERNEL_CS; - tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss; - wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0); + wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); + + tss->x86_tss.sp1 = (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack); wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0); - wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0); + + wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0); + +out: put_cpu(); } #endif @@ -1130,8 +1146,8 @@ DEFINE_PER_CPU_FIRST(union irq_stack_union, irq_stack_union) __aligned(PAGE_SIZE) __visible; /* - * The following four percpu variables are hot. Align current_task to - * cacheline size such that all four fall in the same cacheline. + * The following percpu variables are hot. Align current_task to + * cacheline size such that they fall in the same cacheline. */ DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned = &init_task; @@ -1226,6 +1242,15 @@ DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; EXPORT_PER_CPU_SYMBOL(__preempt_count); DEFINE_PER_CPU(struct task_struct *, fpu_owner_task); +/* + * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find + * the top of the kernel stack. Use an extra percpu variable to track the + * top of the kernel stack directly. + */ +DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = + (unsigned long)&init_thread_union + THREAD_SIZE; +EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack); + #ifdef CONFIG_CC_STACKPROTECTOR DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary); #endif @@ -1307,7 +1332,7 @@ void cpu_init(void) */ load_ucode_ap(); - t = &per_cpu(init_tss, cpu); + t = &per_cpu(cpu_tss, cpu); oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA @@ -1391,7 +1416,7 @@ void cpu_init(void) { int cpu = smp_processor_id(); struct task_struct *curr = current; - struct tss_struct *t = &per_cpu(init_tss, cpu); + struct tss_struct *t = &per_cpu(cpu_tss, cpu); struct thread_struct *thread = &curr->thread; wait_for_master_cpu(cpu); diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c index b71a7f86d68a..979963bb3977 100644 --- a/arch/x86/kernel/cpu/perf_event.c +++ b/arch/x86/kernel/cpu/perf_event.c @@ -2161,10 +2161,9 @@ static unsigned long code_segment_base(struct pt_regs *regs) if (user_mode(regs) && regs->cs != __USER_CS) return get_segment_base(regs->cs); #else - if (test_thread_flag(TIF_IA32)) { - if (user_mode(regs) && regs->cs != __USER32_CS) - return get_segment_base(regs->cs); - } + if (user_mode(regs) && !user_64bit_mode(regs) && + regs->cs != __USER32_CS) + return get_segment_base(regs->cs); #endif return 0; } diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S index 31e2d5bf3e38..4c8cc34e6d68 100644 --- a/arch/x86/kernel/entry_32.S +++ b/arch/x86/kernel/entry_32.S @@ -398,7 +398,7 @@ sysenter_past_esp: * A tiny bit of offset fixup is necessary - 4*4 means the 4 words * pushed above; +8 corresponds to copy_thread's esp0 setting. */ - pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp) + pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+TOP_OF_KERNEL_STACK_PADDING+4*4)(%esp) CFI_REL_OFFSET eip, 0 pushl_cfi %eax @@ -816,15 +816,9 @@ ENTRY(simd_coprocessor_error) pushl_cfi $0 #ifdef CONFIG_X86_INVD_BUG /* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */ -661: pushl_cfi $do_general_protection -662: -.section .altinstructions,"a" - altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f -.previous -.section .altinstr_replacement,"ax" -663: pushl $do_simd_coprocessor_error -664: -.previous + ALTERNATIVE "pushl_cfi $do_general_protection", \ + "pushl $do_simd_coprocessor_error", \ + X86_FEATURE_XMM #else pushl_cfi $do_simd_coprocessor_error #endif @@ -1240,20 +1234,13 @@ error_code: /*CFI_REL_OFFSET es, 0*/ pushl_cfi %ds /*CFI_REL_OFFSET ds, 0*/ - pushl_cfi %eax - CFI_REL_OFFSET eax, 0 - pushl_cfi %ebp - CFI_REL_OFFSET ebp, 0 - pushl_cfi %edi - CFI_REL_OFFSET edi, 0 - pushl_cfi %esi - CFI_REL_OFFSET esi, 0 - pushl_cfi %edx - CFI_REL_OFFSET edx, 0 - pushl_cfi %ecx - CFI_REL_OFFSET ecx, 0 - pushl_cfi %ebx - CFI_REL_OFFSET ebx, 0 + pushl_cfi_reg eax + pushl_cfi_reg ebp + pushl_cfi_reg edi + pushl_cfi_reg esi + pushl_cfi_reg edx + pushl_cfi_reg ecx + pushl_cfi_reg ebx cld movl $(__KERNEL_PERCPU), %ecx movl %ecx, %fs diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S index 1d74d161687c..0c91256d73df 100644 --- a/arch/x86/kernel/entry_64.S +++ b/arch/x86/kernel/entry_64.S @@ -14,24 +14,13 @@ * NOTE: This code handles signal-recognition, which happens every time * after an interrupt and after each system call. * - * Normal syscalls and interrupts don't save a full stack frame, this is - * only done for syscall tracing, signals or fork/exec et.al. - * * A note on terminology: - * - top of stack: Architecture defined interrupt frame from SS to RIP + * - iret frame: Architecture defined interrupt frame from SS to RIP * at the top of the kernel process stack. - * - partial stack frame: partially saved registers up to R11. - * - full stack frame: Like partial stack frame, but all register saved. * * Some macro usage: * - CFI macros are used to generate dwarf2 unwind information for better * backtraces. They don't change any code. - * - SAVE_ALL/RESTORE_ALL - Save/restore all registers - * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify. - * There are unfortunately lots of special cases where some registers - * not touched. The macro is a big mess that should be cleaned up. - * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS. - * Gives a full stack frame. * - ENTRY/END Define functions in the symbol table. * - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack * frame that is otherwise undefined after a SYSCALL @@ -82,9 +71,9 @@ ENDPROC(native_usergs_sysret64) #endif /* CONFIG_PARAVIRT */ -.macro TRACE_IRQS_IRETQ offset=ARGOFFSET +.macro TRACE_IRQS_IRETQ #ifdef CONFIG_TRACE_IRQFLAGS - bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ + bt $9,EFLAGS(%rsp) /* interrupts off? */ jnc 1f TRACE_IRQS_ON 1: @@ -116,8 +105,8 @@ ENDPROC(native_usergs_sysret64) call debug_stack_reset .endm -.macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET - bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */ +.macro TRACE_IRQS_IRETQ_DEBUG + bt $9,EFLAGS(%rsp) /* interrupts off? */ jnc 1f TRACE_IRQS_ON_DEBUG 1: @@ -130,34 +119,27 @@ ENDPROC(native_usergs_sysret64) #endif /* - * C code is not supposed to know about undefined top of stack. Every time - * a C function with an pt_regs argument is called from the SYSCALL based - * fast path FIXUP_TOP_OF_STACK is needed. + * C code is not supposed to know that the iret frame is not populated. + * Every time a C function with an pt_regs argument is called from + * the SYSCALL based fast path FIXUP_TOP_OF_STACK is needed. * RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs * manipulation. */ - - /* %rsp:at FRAMEEND */ .macro FIXUP_TOP_OF_STACK tmp offset=0 - movq PER_CPU_VAR(old_rsp),\tmp - movq \tmp,RSP+\offset(%rsp) movq $__USER_DS,SS+\offset(%rsp) movq $__USER_CS,CS+\offset(%rsp) movq RIP+\offset(%rsp),\tmp /* get rip */ movq \tmp,RCX+\offset(%rsp) /* copy it to rcx as sysret would do */ - movq R11+\offset(%rsp),\tmp /* get eflags */ - movq \tmp,EFLAGS+\offset(%rsp) + movq EFLAGS+\offset(%rsp),\tmp /* ditto for rflags->r11 */ + movq \tmp,R11+\offset(%rsp) .endm .macro RESTORE_TOP_OF_STACK tmp offset=0 - movq RSP+\offset(%rsp),\tmp - movq \tmp,PER_CPU_VAR(old_rsp) - movq EFLAGS+\offset(%rsp),\tmp - movq \tmp,R11+\offset(%rsp) + /* nothing to do */ .endm /* - * initial frame state for interrupts (and exceptions without error code) + * empty frame */ .macro EMPTY_FRAME start=1 offset=0 .if \start @@ -173,12 +155,12 @@ ENDPROC(native_usergs_sysret64) * initial frame state for interrupts (and exceptions without error code) */ .macro INTR_FRAME start=1 offset=0 - EMPTY_FRAME \start, SS+8+\offset-RIP - /*CFI_REL_OFFSET ss, SS+\offset-RIP*/ - CFI_REL_OFFSET rsp, RSP+\offset-RIP - /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/ - /*CFI_REL_OFFSET cs, CS+\offset-RIP*/ - CFI_REL_OFFSET rip, RIP+\offset-RIP + EMPTY_FRAME \start, 5*8+\offset + /*CFI_REL_OFFSET ss, 4*8+\offset*/ + CFI_REL_OFFSET rsp, 3*8+\offset + /*CFI_REL_OFFSET rflags, 2*8+\offset*/ + /*CFI_REL_OFFSET cs, 1*8+\offset*/ + CFI_REL_OFFSET rip, 0*8+\offset .endm /* @@ -186,30 +168,23 @@ ENDPROC(native_usergs_sysret64) * with vector already pushed) */ .macro XCPT_FRAME start=1 offset=0 - INTR_FRAME \start, RIP+\offset-ORIG_RAX - .endm - -/* - * frame that enables calling into C. - */ - .macro PARTIAL_FRAME start=1 offset=0 - XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET - CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET - CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET - CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET - CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET - CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET - CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET - CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET - CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET - CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET + INTR_FRAME \start, 1*8+\offset .endm /* * frame that enables passing a complete pt_regs to a C function. */ .macro DEFAULT_FRAME start=1 offset=0 - PARTIAL_FRAME \start, R11+\offset-R15 + XCPT_FRAME \start, ORIG_RAX+\offset + CFI_REL_OFFSET rdi, RDI+\offset + CFI_REL_OFFSET rsi, RSI+\offset + CFI_REL_OFFSET rdx, RDX+\offset + CFI_REL_OFFSET rcx, RCX+\offset + CFI_REL_OFFSET rax, RAX+\offset + CFI_REL_OFFSET r8, R8+\offset + CFI_REL_OFFSET r9, R9+\offset + CFI_REL_OFFSET r10, R10+\offset + CFI_REL_OFFSET r11, R11+\offset CFI_REL_OFFSET rbx, RBX+\offset CFI_REL_OFFSET rbp, RBP+\offset CFI_REL_OFFSET r12, R12+\offset @@ -218,105 +193,31 @@ ENDPROC(native_usergs_sysret64) CFI_REL_OFFSET r15, R15+\offset .endm -ENTRY(save_paranoid) - XCPT_FRAME 1 RDI+8 - cld - movq %rdi, RDI+8(%rsp) - movq %rsi, RSI+8(%rsp) - movq_cfi rdx, RDX+8 - movq_cfi rcx, RCX+8 - movq_cfi rax, RAX+8 - movq %r8, R8+8(%rsp) - movq %r9, R9+8(%rsp) - movq %r10, R10+8(%rsp) - movq %r11, R11+8(%rsp) - movq_cfi rbx, RBX+8 - movq %rbp, RBP+8(%rsp) - movq %r12, R12+8(%rsp) - movq %r13, R13+8(%rsp) - movq %r14, R14+8(%rsp) - movq %r15, R15+8(%rsp) - movl $1,%ebx - movl $MSR_GS_BASE,%ecx - rdmsr - testl %edx,%edx - js 1f /* negative -> in kernel */ - SWAPGS - xorl %ebx,%ebx -1: ret - CFI_ENDPROC -END(save_paranoid) - /* - * A newly forked process directly context switches into this address. + * 64bit SYSCALL instruction entry. Up to 6 arguments in registers. * - * rdi: prev task we switched from - */ -ENTRY(ret_from_fork) - DEFAULT_FRAME - - LOCK ; btr $TIF_FORK,TI_flags(%r8) - - pushq_cfi $0x0002 - popfq_cfi # reset kernel eflags - - call schedule_tail # rdi: 'prev' task parameter - - GET_THREAD_INFO(%rcx) - - RESTORE_REST - - testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread? - jz 1f - - /* - * By the time we get here, we have no idea whether our pt_regs, - * ti flags, and ti status came from the 64-bit SYSCALL fast path, - * the slow path, or one of the ia32entry paths. - * Use int_ret_from_sys_call to return, since it can safely handle - * all of the above. - */ - jmp int_ret_from_sys_call - -1: - subq $REST_SKIP, %rsp # leave space for volatiles - CFI_ADJUST_CFA_OFFSET REST_SKIP - movq %rbp, %rdi - call *%rbx - movl $0, RAX(%rsp) - RESTORE_REST - jmp int_ret_from_sys_call - CFI_ENDPROC -END(ret_from_fork) - -/* - * System call entry. Up to 6 arguments in registers are supported. + * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11, + * then loads new ss, cs, and rip from previously programmed MSRs. + * rflags gets masked by a value from another MSR (so CLD and CLAC + * are not needed). SYSCALL does not save anything on the stack + * and does not change rsp. * - * SYSCALL does not save anything on the stack and does not change the - * stack pointer. However, it does mask the flags register for us, so - * CLD and CLAC are not needed. - */ - -/* - * Register setup: + * Registers on entry: * rax system call number + * rcx return address + * r11 saved rflags (note: r11 is callee-clobbered register in C ABI) * rdi arg0 - * rcx return address for syscall/sysret, C arg3 * rsi arg1 * rdx arg2 - * r10 arg3 (--> moved to rcx for C) + * r10 arg3 (needs to be moved to rcx to conform to C ABI) * r8 arg4 * r9 arg5 - * r11 eflags for syscall/sysret, temporary for C - * r12-r15,rbp,rbx saved by C code, not touched. + * (note: r12-r15,rbp,rbx are callee-preserved in C ABI) * * Interrupts are off on entry. * Only called from user space. * - * XXX if we had a free scratch register we could save the RSP into the stack frame - * and report it properly in ps. Unfortunately we haven't. - * - * When user can change the frames always force IRET. That is because + * When user can change pt_regs->foo always force IRET. That is because * it deals with uncanonical addresses better. SYSRET has trouble * with them due to bugs in both AMD and Intel CPUs. */ @@ -335,18 +236,29 @@ ENTRY(system_call) */ GLOBAL(system_call_after_swapgs) - movq %rsp,PER_CPU_VAR(old_rsp) + /* + * We use 'rsp_scratch' as a scratch register, hence this block must execute + * atomically in the face of possible interrupt-driven task preemption, + * so we can enable interrupts only after we're done with using rsp_scratch: + */ + movq %rsp,PER_CPU_VAR(rsp_scratch) + /* kernel_stack is set so that 5 slots (iret frame) are preallocated */ movq PER_CPU_VAR(kernel_stack),%rsp + ALLOC_PT_GPREGS_ON_STACK 8 /* +8: space for orig_ax */ + movq %rcx,RIP(%rsp) + movq PER_CPU_VAR(rsp_scratch),%rcx + movq %r11,EFLAGS(%rsp) + movq %rcx,RSP(%rsp) /* * No need to follow this irqs off/on section - it's straight * and short: */ ENABLE_INTERRUPTS(CLBR_NONE) - SAVE_ARGS 8, 0, rax_enosys=1 - movq_cfi rax,(ORIG_RAX-ARGOFFSET) - movq %rcx,RIP-ARGOFFSET(%rsp) - CFI_REL_OFFSET rip,RIP-ARGOFFSET - testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + movq_cfi rax,ORIG_RAX + SAVE_C_REGS_EXCEPT_RAX_RCX_R11 + movq $-ENOSYS,RAX(%rsp) + CFI_REL_OFFSET rip,RIP + testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP) jnz tracesys system_call_fastpath: #if __SYSCALL_MASK == ~0 @@ -358,13 +270,13 @@ system_call_fastpath: ja ret_from_sys_call /* and return regs->ax */ movq %r10,%rcx call *sys_call_table(,%rax,8) # XXX: rip relative - movq %rax,RAX-ARGOFFSET(%rsp) + movq %rax,RAX(%rsp) /* * Syscall return path ending with SYSRET (fast path) - * Has incomplete stack frame and undefined top of stack. + * Has incompletely filled pt_regs, iret frame is also incomplete. */ ret_from_sys_call: - testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET) + testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP) jnz int_ret_from_sys_call_fixup /* Go the the slow path */ LOCKDEP_SYS_EXIT @@ -375,31 +287,38 @@ ret_from_sys_call: * sysretq will re-enable interrupts: */ TRACE_IRQS_ON - movq RIP-ARGOFFSET(%rsp),%rcx + RESTORE_C_REGS_EXCEPT_RCX_R11 + movq RIP(%rsp),%rcx CFI_REGISTER rip,rcx - RESTORE_ARGS 1,-ARG_SKIP,0 + movq EFLAGS(%rsp),%r11 /*CFI_REGISTER rflags,r11*/ - movq PER_CPU_VAR(old_rsp), %rsp + movq RSP(%rsp),%rsp + /* + * 64bit SYSRET restores rip from rcx, + * rflags from r11 (but RF and VM bits are forced to 0), + * cs and ss are loaded from MSRs. + */ USERGS_SYSRET64 CFI_RESTORE_STATE int_ret_from_sys_call_fixup: - FIXUP_TOP_OF_STACK %r11, -ARGOFFSET + FIXUP_TOP_OF_STACK %r11 jmp int_ret_from_sys_call - /* Do syscall tracing */ + /* Do syscall entry tracing */ tracesys: - leaq -REST_SKIP(%rsp), %rdi + movq %rsp, %rdi movq $AUDIT_ARCH_X86_64, %rsi call syscall_trace_enter_phase1 test %rax, %rax jnz tracesys_phase2 /* if needed, run the slow path */ - LOAD_ARGS 0 /* else restore clobbered regs */ + RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */ + movq ORIG_RAX(%rsp), %rax jmp system_call_fastpath /* and return to the fast path */ tracesys_phase2: - SAVE_REST + SAVE_EXTRA_REGS FIXUP_TOP_OF_STACK %rdi movq %rsp, %rdi movq $AUDIT_ARCH_X86_64, %rsi @@ -407,12 +326,12 @@ tracesys_phase2: call syscall_trace_enter_phase2 /* - * Reload arg registers from stack in case ptrace changed them. + * Reload registers from stack in case ptrace changed them. * We don't reload %rax because syscall_trace_entry_phase2() returned * the value it wants us to use in the table lookup. */ - LOAD_ARGS ARGOFFSET, 1 - RESTORE_REST + RESTORE_C_REGS_EXCEPT_RAX + RESTORE_EXTRA_REGS #if __SYSCALL_MASK == ~0 cmpq $__NR_syscall_max,%rax #else @@ -422,12 +341,12 @@ tracesys_phase2: ja int_ret_from_sys_call /* RAX(%rsp) is already set */ movq %r10,%rcx /* fixup for C */ call *sys_call_table(,%rax,8) - movq %rax,RAX-ARGOFFSET(%rsp) - /* Use IRET because user could have changed frame */ + movq %rax,RAX(%rsp) + /* Use IRET because user could have changed pt_regs->foo */ /* * Syscall return path ending with IRET. - * Has correct top of stack, but partial stack frame. + * Has correct iret frame. */ GLOBAL(int_ret_from_sys_call) DISABLE_INTERRUPTS(CLBR_NONE) @@ -458,12 +377,11 @@ int_careful: TRACE_IRQS_OFF jmp int_with_check - /* handle signals and tracing -- both require a full stack frame */ + /* handle signals and tracing -- both require a full pt_regs */ int_very_careful: TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) -int_check_syscall_exit_work: - SAVE_REST + SAVE_EXTRA_REGS /* Check for syscall exit trace */ testl $_TIF_WORK_SYSCALL_EXIT,%edx jz int_signal @@ -482,7 +400,7 @@ int_signal: call do_notify_resume 1: movl $_TIF_WORK_MASK,%edi int_restore_rest: - RESTORE_REST + RESTORE_EXTRA_REGS DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF jmp int_with_check @@ -492,45 +410,29 @@ END(system_call) .macro FORK_LIKE func ENTRY(stub_\func) CFI_STARTPROC - popq %r11 /* save return address */ - PARTIAL_FRAME 0 - SAVE_REST - pushq %r11 /* put it back on stack */ + DEFAULT_FRAME 0, 8 /* offset 8: return address */ + SAVE_EXTRA_REGS 8 FIXUP_TOP_OF_STACK %r11, 8 - DEFAULT_FRAME 0 8 /* offset 8: return address */ call sys_\func RESTORE_TOP_OF_STACK %r11, 8 - ret $REST_SKIP /* pop extended registers */ - CFI_ENDPROC -END(stub_\func) - .endm - - .macro FIXED_FRAME label,func -ENTRY(\label) - CFI_STARTPROC - PARTIAL_FRAME 0 8 /* offset 8: return address */ - FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET - call \func - RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET ret CFI_ENDPROC -END(\label) +END(stub_\func) .endm FORK_LIKE clone FORK_LIKE fork FORK_LIKE vfork - FIXED_FRAME stub_iopl, sys_iopl ENTRY(stub_execve) CFI_STARTPROC addq $8, %rsp - PARTIAL_FRAME 0 - SAVE_REST + DEFAULT_FRAME 0 + SAVE_EXTRA_REGS FIXUP_TOP_OF_STACK %r11 call sys_execve movq %rax,RAX(%rsp) - RESTORE_REST + RESTORE_EXTRA_REGS jmp int_ret_from_sys_call CFI_ENDPROC END(stub_execve) @@ -538,13 +440,13 @@ END(stub_execve) ENTRY(stub_execveat) CFI_STARTPROC addq $8, %rsp - PARTIAL_FRAME 0 - SAVE_REST + DEFAULT_FRAME 0 + SAVE_EXTRA_REGS FIXUP_TOP_OF_STACK %r11 call sys_execveat RESTORE_TOP_OF_STACK %r11 movq %rax,RAX(%rsp) - RESTORE_REST + RESTORE_EXTRA_REGS jmp int_ret_from_sys_call CFI_ENDPROC END(stub_execveat) @@ -556,12 +458,12 @@ END(stub_execveat) ENTRY(stub_rt_sigreturn) CFI_STARTPROC addq $8, %rsp - PARTIAL_FRAME 0 - SAVE_REST + DEFAULT_FRAME 0 + SAVE_EXTRA_REGS FIXUP_TOP_OF_STACK %r11 call sys_rt_sigreturn movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer - RESTORE_REST + RESTORE_EXTRA_REGS jmp int_ret_from_sys_call CFI_ENDPROC END(stub_rt_sigreturn) @@ -570,12 +472,12 @@ END(stub_rt_sigreturn) ENTRY(stub_x32_rt_sigreturn) CFI_STARTPROC addq $8, %rsp - PARTIAL_FRAME 0 - SAVE_REST + DEFAULT_FRAME 0 + SAVE_EXTRA_REGS FIXUP_TOP_OF_STACK %r11 call sys32_x32_rt_sigreturn movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer - RESTORE_REST + RESTORE_EXTRA_REGS jmp int_ret_from_sys_call CFI_ENDPROC END(stub_x32_rt_sigreturn) @@ -583,13 +485,13 @@ END(stub_x32_rt_sigreturn) ENTRY(stub_x32_execve) CFI_STARTPROC addq $8, %rsp - PARTIAL_FRAME 0 - SAVE_REST + DEFAULT_FRAME 0 + SAVE_EXTRA_REGS FIXUP_TOP_OF_STACK %r11 call compat_sys_execve RESTORE_TOP_OF_STACK %r11 movq %rax,RAX(%rsp) - RESTORE_REST + RESTORE_EXTRA_REGS jmp int_ret_from_sys_call CFI_ENDPROC END(stub_x32_execve) @@ -597,13 +499,13 @@ END(stub_x32_execve) ENTRY(stub_x32_execveat) CFI_STARTPROC addq $8, %rsp - PARTIAL_FRAME 0 - SAVE_REST + DEFAULT_FRAME 0 + SAVE_EXTRA_REGS FIXUP_TOP_OF_STACK %r11 call compat_sys_execveat RESTORE_TOP_OF_STACK %r11 movq %rax,RAX(%rsp) - RESTORE_REST + RESTORE_EXTRA_REGS jmp int_ret_from_sys_call CFI_ENDPROC END(stub_x32_execveat) @@ -611,6 +513,46 @@ END(stub_x32_execveat) #endif /* + * A newly forked process directly context switches into this address. + * + * rdi: prev task we switched from + */ +ENTRY(ret_from_fork) + DEFAULT_FRAME + + LOCK ; btr $TIF_FORK,TI_flags(%r8) + + pushq_cfi $0x0002 + popfq_cfi # reset kernel eflags + + call schedule_tail # rdi: 'prev' task parameter + + GET_THREAD_INFO(%rcx) + + RESTORE_EXTRA_REGS + + testl $3,CS(%rsp) # from kernel_thread? + jz 1f + + /* + * By the time we get here, we have no idea whether our pt_regs, + * ti flags, and ti status came from the 64-bit SYSCALL fast path, + * the slow path, or one of the ia32entry paths. + * Use int_ret_from_sys_call to return, since it can safely handle + * all of the above. + */ + jmp int_ret_from_sys_call + +1: + movq %rbp, %rdi + call *%rbx + movl $0, RAX(%rsp) + RESTORE_EXTRA_REGS + jmp int_ret_from_sys_call + CFI_ENDPROC +END(ret_from_fork) + +/* * Build the entry stubs and pointer table with some assembler magic. * We pack 7 stubs into a single 32-byte chunk, which will fit in a * single cache line on all modern x86 implementations. @@ -659,47 +601,45 @@ END(interrupt) /* 0(%rsp): ~(interrupt number) */ .macro interrupt func - /* reserve pt_regs for scratch regs and rbp */ - subq $ORIG_RAX-RBP, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP cld - /* start from rbp in pt_regs and jump over */ - movq_cfi rdi, (RDI-RBP) - movq_cfi rsi, (RSI-RBP) - movq_cfi rdx, (RDX-RBP) - movq_cfi rcx, (RCX-RBP) - movq_cfi rax, (RAX-RBP) - movq_cfi r8, (R8-RBP) - movq_cfi r9, (R9-RBP) - movq_cfi r10, (R10-RBP) - movq_cfi r11, (R11-RBP) - - /* Save rbp so that we can unwind from get_irq_regs() */ - movq_cfi rbp, 0 - - /* Save previous stack value */ - movq %rsp, %rsi + /* + * Since nothing in interrupt handling code touches r12...r15 members + * of "struct pt_regs", and since interrupts can nest, we can save + * four stack slots and simultaneously provide + * an unwind-friendly stack layout by saving "truncated" pt_regs + * exactly up to rbp slot, without these members. + */ + ALLOC_PT_GPREGS_ON_STACK -RBP + SAVE_C_REGS -RBP + /* this goes to 0(%rsp) for unwinder, not for saving the value: */ + SAVE_EXTRA_REGS_RBP -RBP + + leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */ - leaq -RBP(%rsp),%rdi /* arg1 for handler */ - testl $3, CS-RBP(%rsi) + testl $3, CS-RBP(%rsp) je 1f SWAPGS +1: /* + * Save previous stack pointer, optionally switch to interrupt stack. * irq_count is used to check if a CPU is already on an interrupt stack * or not. While this is essentially redundant with preempt_count it is * a little cheaper to use a separate counter in the PDA (short of * moving irq_enter into assembly, which would be too much work) */ -1: incl PER_CPU_VAR(irq_count) + movq %rsp, %rsi + incl PER_CPU_VAR(irq_count) cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp CFI_DEF_CFA_REGISTER rsi - - /* Store previous stack value */ pushq %rsi + /* + * For debugger: + * "CFA (Current Frame Address) is the value on stack + offset" + */ CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \ - 0x77 /* DW_OP_breg7 */, 0, \ + 0x77 /* DW_OP_breg7 (rsp) */, 0, \ 0x06 /* DW_OP_deref */, \ - 0x08 /* DW_OP_const1u */, SS+8-RBP, \ + 0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \ 0x22 /* DW_OP_plus */ /* We entered an interrupt context - irqs are off: */ TRACE_IRQS_OFF @@ -717,7 +657,7 @@ common_interrupt: ASM_CLAC addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */ interrupt do_IRQ - /* 0(%rsp): old_rsp-ARGOFFSET */ + /* 0(%rsp): rsp_scratch */ ret_from_intr: DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF @@ -725,19 +665,20 @@ ret_from_intr: /* Restore saved previous stack */ popq %rsi - CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */ - leaq ARGOFFSET-RBP(%rsi), %rsp + CFI_DEF_CFA rsi,SIZEOF_PTREGS-RBP /* reg/off reset after def_cfa_expr */ + /* return code expects complete pt_regs - adjust rsp accordingly: */ + leaq -RBP(%rsi),%rsp CFI_DEF_CFA_REGISTER rsp - CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET + CFI_ADJUST_CFA_OFFSET RBP exit_intr: GET_THREAD_INFO(%rcx) - testl $3,CS-ARGOFFSET(%rsp) + testl $3,CS(%rsp) je retint_kernel /* Interrupt came from user space */ /* - * Has a correct top of stack, but a partial stack frame + * Has a correct top of stack. * %rcx: thread info. Interrupts off. */ retint_with_reschedule: @@ -760,8 +701,8 @@ retint_swapgs: /* return to user-space */ * Try to use SYSRET instead of IRET if we're returning to * a completely clean 64-bit userspace context. */ - movq (RCX-R11)(%rsp), %rcx - cmpq %rcx,(RIP-R11)(%rsp) /* RCX == RIP */ + movq RCX(%rsp),%rcx + cmpq %rcx,RIP(%rsp) /* RCX == RIP */ jne opportunistic_sysret_failed /* @@ -782,19 +723,19 @@ retint_swapgs: /* return to user-space */ shr $__VIRTUAL_MASK_SHIFT, %rcx jnz opportunistic_sysret_failed - cmpq $__USER_CS,(CS-R11)(%rsp) /* CS must match SYSRET */ + cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */ jne opportunistic_sysret_failed - movq (R11-ARGOFFSET)(%rsp), %r11 - cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp) /* R11 == RFLAGS */ + movq R11(%rsp),%r11 + cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */ jne opportunistic_sysret_failed - testq $X86_EFLAGS_RF,%r11 /* sysret can't restore RF */ + testq $X86_EFLAGS_RF,%r11 /* sysret can't restore RF */ jnz opportunistic_sysret_failed /* nothing to check for RSP */ - cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp) /* SS must match SYSRET */ + cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */ jne opportunistic_sysret_failed /* @@ -803,8 +744,9 @@ retint_swapgs: /* return to user-space */ */ irq_return_via_sysret: CFI_REMEMBER_STATE - RESTORE_ARGS 1,8,1 - movq (RSP-RIP)(%rsp),%rsp + /* r11 is already restored (see code above) */ + RESTORE_C_REGS_EXCEPT_R11 + movq RSP(%rsp),%rsp USERGS_SYSRET64 CFI_RESTORE_STATE @@ -819,7 +761,8 @@ retint_restore_args: /* return to kernel space */ */ TRACE_IRQS_IRETQ restore_args: - RESTORE_ARGS 1,8,1 + RESTORE_C_REGS + REMOVE_PT_GPREGS_FROM_STACK 8 irq_return: INTERRUPT_RETURN @@ -890,12 +833,12 @@ retint_signal: jz retint_swapgs TRACE_IRQS_ON ENABLE_INTERRUPTS(CLBR_NONE) - SAVE_REST + SAVE_EXTRA_REGS movq $-1,ORIG_RAX(%rsp) xorl %esi,%esi # oldset movq %rsp,%rdi # &pt_regs call do_notify_resume - RESTORE_REST + RESTORE_EXTRA_REGS DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) @@ -907,7 +850,7 @@ retint_signal: ENTRY(retint_kernel) cmpl $0,PER_CPU_VAR(__preempt_count) jnz retint_restore_args - bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */ + bt $9,EFLAGS(%rsp) /* interrupts off? */ jnc retint_restore_args call preempt_schedule_irq jmp exit_intr @@ -1000,7 +943,7 @@ apicinterrupt IRQ_WORK_VECTOR \ /* * Exception entry points. */ -#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8) +#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ENTRY(\sym) @@ -1022,8 +965,7 @@ ENTRY(\sym) pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ .endif - subq $ORIG_RAX-R15, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 + ALLOC_PT_GPREGS_ON_STACK .if \paranoid .if \paranoid == 1 @@ -1031,10 +973,11 @@ ENTRY(\sym) testl $3, CS(%rsp) /* If coming from userspace, switch */ jnz 1f /* stacks. */ .endif - call save_paranoid + call paranoid_entry .else call error_entry .endif + /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ DEFAULT_FRAME 0 @@ -1056,19 +999,20 @@ ENTRY(\sym) .endif .if \shift_ist != -1 - subq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) + subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) .endif call \do_sym .if \shift_ist != -1 - addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist) + addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist) .endif + /* these procedures expect "no swapgs" flag in ebx */ .if \paranoid - jmp paranoid_exit /* %ebx: no swapgs flag */ + jmp paranoid_exit .else - jmp error_exit /* %ebx: no swapgs flag */ + jmp error_exit .endif .if \paranoid == 1 @@ -1272,7 +1216,9 @@ ENTRY(xen_failsafe_callback) addq $0x30,%rsp CFI_ADJUST_CFA_OFFSET -0x30 pushq_cfi $-1 /* orig_ax = -1 => not a system call */ - SAVE_ALL + ALLOC_PT_GPREGS_ON_STACK + SAVE_C_REGS + SAVE_EXTRA_REGS jmp error_exit CFI_ENDPROC END(xen_failsafe_callback) @@ -1304,59 +1250,66 @@ idtentry async_page_fault do_async_page_fault has_error_code=1 idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip) #endif - /* - * "Paranoid" exit path from exception stack. This is invoked - * only on return from non-NMI IST interrupts that came - * from kernel space. - * - * We may be returning to very strange contexts (e.g. very early - * in syscall entry), so checking for preemption here would - * be complicated. Fortunately, we there's no good reason - * to try to handle preemption here. - */ +/* + * Save all registers in pt_regs, and switch gs if needed. + * Use slow, but surefire "are we in kernel?" check. + * Return: ebx=0: need swapgs on exit, ebx=1: otherwise + */ +ENTRY(paranoid_entry) + XCPT_FRAME 1 15*8 + cld + SAVE_C_REGS 8 + SAVE_EXTRA_REGS 8 + movl $1,%ebx + movl $MSR_GS_BASE,%ecx + rdmsr + testl %edx,%edx + js 1f /* negative -> in kernel */ + SWAPGS + xorl %ebx,%ebx +1: ret + CFI_ENDPROC +END(paranoid_entry) - /* ebx: no swapgs flag */ +/* + * "Paranoid" exit path from exception stack. This is invoked + * only on return from non-NMI IST interrupts that came + * from kernel space. + * + * We may be returning to very strange contexts (e.g. very early + * in syscall entry), so checking for preemption here would + * be complicated. Fortunately, we there's no good reason + * to try to handle preemption here. + */ +/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(paranoid_exit) DEFAULT_FRAME DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF_DEBUG testl %ebx,%ebx /* swapgs needed? */ - jnz paranoid_restore - TRACE_IRQS_IRETQ 0 + jnz paranoid_exit_no_swapgs + TRACE_IRQS_IRETQ SWAPGS_UNSAFE_STACK - RESTORE_ALL 8 - INTERRUPT_RETURN -paranoid_restore: - TRACE_IRQS_IRETQ_DEBUG 0 - RESTORE_ALL 8 + jmp paranoid_exit_restore +paranoid_exit_no_swapgs: + TRACE_IRQS_IRETQ_DEBUG +paranoid_exit_restore: + RESTORE_EXTRA_REGS + RESTORE_C_REGS + REMOVE_PT_GPREGS_FROM_STACK 8 INTERRUPT_RETURN CFI_ENDPROC END(paranoid_exit) /* - * Exception entry point. This expects an error code/orig_rax on the stack. - * returns in "no swapgs flag" in %ebx. + * Save all registers in pt_regs, and switch gs if needed. + * Return: ebx=0: need swapgs on exit, ebx=1: otherwise */ ENTRY(error_entry) - XCPT_FRAME - CFI_ADJUST_CFA_OFFSET 15*8 - /* oldrax contains error code */ + XCPT_FRAME 1 15*8 cld - movq %rdi, RDI+8(%rsp) - movq %rsi, RSI+8(%rsp) - movq %rdx, RDX+8(%rsp) - movq %rcx, RCX+8(%rsp) - movq %rax, RAX+8(%rsp) - movq %r8, R8+8(%rsp) - movq %r9, R9+8(%rsp) - movq %r10, R10+8(%rsp) - movq %r11, R11+8(%rsp) - movq_cfi rbx, RBX+8 - movq %rbp, RBP+8(%rsp) - movq %r12, R12+8(%rsp) - movq %r13, R13+8(%rsp) - movq %r14, R14+8(%rsp) - movq %r15, R15+8(%rsp) + SAVE_C_REGS 8 + SAVE_EXTRA_REGS 8 xorl %ebx,%ebx testl $3,CS+8(%rsp) je error_kernelspace @@ -1366,12 +1319,12 @@ error_sti: TRACE_IRQS_OFF ret -/* - * There are two places in the kernel that can potentially fault with - * usergs. Handle them here. B stepping K8s sometimes report a - * truncated RIP for IRET exceptions returning to compat mode. Check - * for these here too. - */ + /* + * There are two places in the kernel that can potentially fault with + * usergs. Handle them here. B stepping K8s sometimes report a + * truncated RIP for IRET exceptions returning to compat mode. Check + * for these here too. + */ error_kernelspace: CFI_REL_OFFSET rcx, RCX+8 incl %ebx @@ -1401,11 +1354,11 @@ error_bad_iret: END(error_entry) -/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */ +/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(error_exit) DEFAULT_FRAME movl %ebx,%eax - RESTORE_REST + RESTORE_EXTRA_REGS DISABLE_INTERRUPTS(CLBR_NONE) TRACE_IRQS_OFF GET_THREAD_INFO(%rcx) @@ -1587,7 +1540,7 @@ first_nmi: .rept 5 pushq_cfi 11*8(%rsp) .endr - CFI_DEF_CFA_OFFSET SS+8-RIP + CFI_DEF_CFA_OFFSET 5*8 /* Everything up to here is safe from nested NMIs */ @@ -1615,7 +1568,7 @@ repeat_nmi: pushq_cfi -6*8(%rsp) .endr subq $(5*8), %rsp - CFI_DEF_CFA_OFFSET SS+8-RIP + CFI_DEF_CFA_OFFSET 5*8 end_repeat_nmi: /* @@ -1624,16 +1577,16 @@ end_repeat_nmi: * so that we repeat another NMI. */ pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */ - subq $ORIG_RAX-R15, %rsp - CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15 + ALLOC_PT_GPREGS_ON_STACK + /* - * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit + * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit * as we should not be calling schedule in NMI context. * Even with normal interrupts enabled. An NMI should not be * setting NEED_RESCHED or anything that normal interrupts and * exceptions might do. */ - call save_paranoid + call paranoid_entry DEFAULT_FRAME 0 /* @@ -1664,8 +1617,10 @@ end_repeat_nmi: nmi_swapgs: SWAPGS_UNSAFE_STACK nmi_restore: + RESTORE_EXTRA_REGS + RESTORE_C_REGS /* Pop the extra iret frame at once */ - RESTORE_ALL 6*8 + REMOVE_PT_GPREGS_FROM_STACK 6*8 /* Clear the NMI executing stack variable */ movq $0, 5*8(%rsp) diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S index f36bd42d6f0c..d031bad9e07e 100644 --- a/arch/x86/kernel/head_32.S +++ b/arch/x86/kernel/head_32.S @@ -22,6 +22,7 @@ #include <asm/cpufeature.h> #include <asm/percpu.h> #include <asm/nops.h> +#include <asm/bootparam.h> /* Physical address */ #define pa(X) ((X) - __PAGE_OFFSET) @@ -90,7 +91,7 @@ ENTRY(startup_32) /* test KEEP_SEGMENTS flag to see if the bootloader is asking us to not reload segments */ - testb $(1<<6), BP_loadflags(%esi) + testb $KEEP_SEGMENTS, BP_loadflags(%esi) jnz 2f /* diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index 6fd514d9f69a..ae6588b301c2 100644 --- a/arch/x86/kernel/head_64.S +++ b/arch/x86/kernel/head_64.S @@ -1,5 +1,5 @@ /* - * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit + * linux/arch/x86/kernel/head_64.S -- start in 32bit and switch to 64bit * * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE * Copyright (C) 2000 Pavel Machek <pavel@suse.cz> @@ -56,7 +56,7 @@ startup_64: * %rsi holds a physical pointer to real_mode_data. * * We come here either directly from a 64bit bootloader, or from - * arch/x86_64/boot/compressed/head.S. + * arch/x86/boot/compressed/head_64.S. * * We only come here initially at boot nothing else comes here. * @@ -146,7 +146,7 @@ startup_64: leaq level2_kernel_pgt(%rip), %rdi leaq 4096(%rdi), %r8 /* See if it is a valid page table entry */ -1: testq $1, 0(%rdi) +1: testb $1, 0(%rdi) jz 2f addq %rbp, 0(%rdi) /* Go to the next page */ diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c index 4ddaf66ea35f..37dae792dbbe 100644 --- a/arch/x86/kernel/ioport.c +++ b/arch/x86/kernel/ioport.c @@ -54,7 +54,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on) * because the ->io_bitmap_max value must match the bitmap * contents: */ - tss = &per_cpu(init_tss, get_cpu()); + tss = &per_cpu(cpu_tss, get_cpu()); if (turn_on) bitmap_clear(t->io_bitmap_ptr, from, num); diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c index 781861cc5ee8..02a8720414c0 100644 --- a/arch/x86/kernel/perf_regs.c +++ b/arch/x86/kernel/perf_regs.c @@ -177,7 +177,7 @@ void perf_get_regs_user(struct perf_regs *regs_user, * than just blindly copying user_regs. */ regs_user->abi = PERF_SAMPLE_REGS_ABI_64; - regs_user_copy->sp = this_cpu_read(old_rsp); + regs_user_copy->sp = user_regs->sp; regs_user_copy->cs = __USER_CS; regs_user_copy->ss = __USER_DS; regs_user_copy->cx = -1; /* usually contains garbage */ diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 046e2d620bbe..12b1cf606ddf 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -37,7 +37,26 @@ * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS; +__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { + .x86_tss = { + .sp0 = TOP_OF_INIT_STACK, +#ifdef CONFIG_X86_32 + .ss0 = __KERNEL_DS, + .ss1 = __KERNEL_CS, + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, +#endif + }, +#ifdef CONFIG_X86_32 + /* + * Note that the .io_bitmap member must be extra-big. This is because + * the CPU will access an additional byte beyond the end of the IO + * permission bitmap. The extra byte must be all 1 bits, and must + * be within the limit. + */ + .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, +#endif +}; +EXPORT_PER_CPU_SYMBOL_GPL(cpu_tss); #ifdef CONFIG_X86_64 static DEFINE_PER_CPU(unsigned char, is_idle); @@ -109,7 +128,7 @@ void exit_thread(void) unsigned long *bp = t->io_bitmap_ptr; if (bp) { - struct tss_struct *tss = &per_cpu(init_tss, get_cpu()); + struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu()); t->io_bitmap_ptr = NULL; clear_thread_flag(TIF_IO_BITMAP); diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c index 603c4f99cb5a..26c596d1ee07 100644 --- a/arch/x86/kernel/process_32.c +++ b/arch/x86/kernel/process_32.c @@ -206,11 +206,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp) regs->ip = new_ip; regs->sp = new_sp; regs->flags = X86_EFLAGS_IF; - /* - * force it to the iret return path by making it look as if there was - * some work pending. - */ - set_thread_flag(TIF_NOTIFY_RESUME); + force_iret(); } EXPORT_SYMBOL_GPL(start_thread); @@ -248,7 +244,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct thread_struct *prev = &prev_p->thread, *next = &next_p->thread; int cpu = smp_processor_id(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); + struct tss_struct *tss = &per_cpu(cpu_tss, cpu); fpu_switch_t fpu; /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ @@ -256,11 +252,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) fpu = switch_fpu_prepare(prev_p, next_p, cpu); /* - * Reload esp0. - */ - load_sp0(tss, next); - - /* * Save away %gs. No need to save %fs, as it was saved on the * stack on entry. No need to save %es and %ds, as those are * always kernel segments while inside the kernel. Doing this @@ -310,9 +301,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) */ arch_end_context_switch(next_p); + /* + * Reload esp0, kernel_stack, and current_top_of_stack. This changes + * current_thread_info(). + */ + load_sp0(tss, next); this_cpu_write(kernel_stack, - (unsigned long)task_stack_page(next_p) + - THREAD_SIZE - KERNEL_STACK_OFFSET); + (unsigned long)task_stack_page(next_p) + + THREAD_SIZE - KERNEL_STACK_OFFSET); + this_cpu_write(cpu_current_top_of_stack, + (unsigned long)task_stack_page(next_p) + + THREAD_SIZE); /* * Restore %gs if needed (which is common) diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c index 67fcc43577d2..da8b74598d90 100644 --- a/arch/x86/kernel/process_64.c +++ b/arch/x86/kernel/process_64.c @@ -52,7 +52,7 @@ asmlinkage extern void ret_from_fork(void); -__visible DEFINE_PER_CPU(unsigned long, old_rsp); +__visible DEFINE_PER_CPU(unsigned long, rsp_scratch); /* Prints also some state that isn't saved in the pt_regs */ void __show_regs(struct pt_regs *regs, int all) @@ -161,7 +161,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; childregs = task_pt_regs(p); p->thread.sp = (unsigned long) childregs; - p->thread.usersp = me->thread.usersp; set_tsk_thread_flag(p, TIF_FORK); p->thread.io_bitmap_ptr = NULL; @@ -207,7 +206,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp, */ if (clone_flags & CLONE_SETTLS) { #ifdef CONFIG_IA32_EMULATION - if (test_thread_flag(TIF_IA32)) + if (is_ia32_task()) err = do_set_thread_area(p, -1, (struct user_desc __user *)childregs->si, 0); else @@ -235,13 +234,12 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip, loadsegment(es, _ds); loadsegment(ds, _ds); load_gs_index(0); - current->thread.usersp = new_sp; regs->ip = new_ip; regs->sp = new_sp; - this_cpu_write(old_rsp, new_sp); regs->cs = _cs; regs->ss = _ss; regs->flags = X86_EFLAGS_IF; + force_iret(); } void @@ -277,15 +275,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) struct thread_struct *prev = &prev_p->thread; struct thread_struct *next = &next_p->thread; int cpu = smp_processor_id(); - struct tss_struct *tss = &per_cpu(init_tss, cpu); + struct tss_struct *tss = &per_cpu(cpu_tss, cpu); unsigned fsindex, gsindex; fpu_switch_t fpu; fpu = switch_fpu_prepare(prev_p, next_p, cpu); - /* Reload esp0 and ss1. */ - load_sp0(tss, next); - /* We must save %fs and %gs before load_TLS() because * %fs and %gs may be cleared by load_TLS(). * @@ -401,8 +396,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) /* * Switch the PDA and FPU contexts. */ - prev->usersp = this_cpu_read(old_rsp); - this_cpu_write(old_rsp, next->usersp); this_cpu_write(current_task, next_p); /* @@ -413,6 +406,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p) task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count); this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count); + /* Reload esp0 and ss1. This changes current_thread_info(). */ + load_sp0(tss, next); + this_cpu_write(kernel_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE - KERNEL_STACK_OFFSET); @@ -602,6 +598,5 @@ long sys_arch_prctl(int code, unsigned long addr) unsigned long KSTK_ESP(struct task_struct *task) { - return (test_tsk_thread_flag(task, TIF_IA32)) ? - (task_pt_regs(task)->sp) : ((task)->thread.usersp); + return task_pt_regs(task)->sp; } diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c index e510618b2e91..1e125817cf9f 100644 --- a/arch/x86/kernel/ptrace.c +++ b/arch/x86/kernel/ptrace.c @@ -364,18 +364,12 @@ static int set_segment_reg(struct task_struct *task, case offsetof(struct user_regs_struct,cs): if (unlikely(value == 0)) return -EIO; -#ifdef CONFIG_IA32_EMULATION - if (test_tsk_thread_flag(task, TIF_IA32)) - task_pt_regs(task)->cs = value; -#endif + task_pt_regs(task)->cs = value; break; case offsetof(struct user_regs_struct,ss): if (unlikely(value == 0)) return -EIO; -#ifdef CONFIG_IA32_EMULATION - if (test_tsk_thread_flag(task, TIF_IA32)) - task_pt_regs(task)->ss = value; -#endif + task_pt_regs(task)->ss = value; break; } diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S index e13f8e7c22a6..77630d57e7bf 100644 --- a/arch/x86/kernel/relocate_kernel_32.S +++ b/arch/x86/kernel/relocate_kernel_32.S @@ -226,23 +226,23 @@ swap_pages: movl (%ebx), %ecx addl $4, %ebx 1: - testl $0x1, %ecx /* is it a destination page */ + testb $0x1, %cl /* is it a destination page */ jz 2f movl %ecx, %edi andl $0xfffff000, %edi jmp 0b 2: - testl $0x2, %ecx /* is it an indirection page */ + testb $0x2, %cl /* is it an indirection page */ jz 2f movl %ecx, %ebx andl $0xfffff000, %ebx jmp 0b 2: - testl $0x4, %ecx /* is it the done indicator */ + testb $0x4, %cl /* is it the done indicator */ jz 2f jmp 3f 2: - testl $0x8, %ecx /* is it the source indicator */ + testb $0x8, %cl /* is it the source indicator */ jz 0b /* Ignore it otherwise */ movl %ecx, %esi /* For every source page do a copy */ andl $0xfffff000, %esi diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S index 3fd2c693e475..04cb1790a596 100644 --- a/arch/x86/kernel/relocate_kernel_64.S +++ b/arch/x86/kernel/relocate_kernel_64.S @@ -221,23 +221,23 @@ swap_pages: movq (%rbx), %rcx addq $8, %rbx 1: - testq $0x1, %rcx /* is it a destination page? */ + testb $0x1, %cl /* is it a destination page? */ jz 2f movq %rcx, %rdi andq $0xfffffffffffff000, %rdi jmp 0b 2: - testq $0x2, %rcx /* is it an indirection page? */ + testb $0x2, %cl /* is it an indirection page? */ jz 2f movq %rcx, %rbx andq $0xfffffffffffff000, %rbx jmp 0b 2: - testq $0x4, %rcx /* is it the done indicator? */ + testb $0x4, %cl /* is it the done indicator? */ jz 2f jmp 3f 2: - testq $0x8, %rcx /* is it the source indicator? */ + testb $0x8, %cl /* is it the source indicator? */ jz 0b /* Ignore it otherwise */ movq %rcx, %rsi /* For ever source page do a copy */ andq $0xfffffffffffff000, %rsi diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c index e5042463c1bc..eaa2c5e3f2cd 100644 --- a/arch/x86/kernel/signal.c +++ b/arch/x86/kernel/signal.c @@ -94,15 +94,8 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, COPY(r15); #endif /* CONFIG_X86_64 */ -#ifdef CONFIG_X86_32 COPY_SEG_CPL3(cs); COPY_SEG_CPL3(ss); -#else /* !CONFIG_X86_32 */ - /* Kernel saves and restores only the CS segment register on signals, - * which is the bare minimum needed to allow mixed 32/64-bit code. - * App's signal handler can save/restore other segments if needed. */ - COPY_SEG_CPL3(cs); -#endif /* CONFIG_X86_32 */ get_user_ex(tmpflags, &sc->flags); regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS); @@ -115,6 +108,8 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc, err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32)); + force_iret(); + return err; } @@ -162,8 +157,9 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate, #else /* !CONFIG_X86_32 */ put_user_ex(regs->flags, &sc->flags); put_user_ex(regs->cs, &sc->cs); - put_user_ex(0, &sc->gs); - put_user_ex(0, &sc->fs); + put_user_ex(0, &sc->__pad2); + put_user_ex(0, &sc->__pad1); + put_user_ex(regs->ss, &sc->ss); #endif /* CONFIG_X86_32 */ put_user_ex(fpstate, &sc->fpstate); @@ -457,9 +453,19 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig, regs->sp = (unsigned long)frame; - /* Set up the CS register to run signal handlers in 64-bit mode, - even if the handler happens to be interrupting 32-bit code. */ + /* + * Set up the CS and SS registers to run signal handlers in + * 64-bit mode, even if the handler happens to be interrupting + * 32-bit or 16-bit code. + * + * SS is subtle. In 64-bit mode, we don't need any particular + * SS descriptor, but we do need SS to be valid. It's possible + * that the old SS is entirely bogus -- this can happen if the + * signal we're trying to deliver is #GP or #SS caused by a bad + * SS value. + */ regs->cs = __USER_CS; + regs->ss = __USER_DS; return 0; } diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index febc6aabc72e..759388c538cf 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -806,6 +806,8 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle) #ifdef CONFIG_X86_32 /* Stack for startup_32 can be just as for start_secondary onwards */ irq_ctx_init(cpu); + per_cpu(cpu_current_top_of_stack, cpu) = + (unsigned long)task_stack_page(idle) + THREAD_SIZE; #else clear_tsk_thread_flag(idle, TIF_FORK); initial_gs = per_cpu_offset(cpu); diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c index e9bcd57d8a9e..3777189c4a19 100644 --- a/arch/x86/kernel/syscall_32.c +++ b/arch/x86/kernel/syscall_32.c @@ -5,21 +5,29 @@ #include <linux/cache.h> #include <asm/asm-offsets.h> -#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ; +#ifdef CONFIG_IA32_EMULATION +#define SYM(sym, compat) compat +#else +#define SYM(sym, compat) sym +#define ia32_sys_call_table sys_call_table +#define __NR_ia32_syscall_max __NR_syscall_max +#endif + +#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ; #include <asm/syscalls_32.h> #undef __SYSCALL_I386 -#define __SYSCALL_I386(nr, sym, compat) [nr] = sym, +#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat), typedef asmlinkage void (*sys_call_ptr_t)(void); extern asmlinkage void sys_ni_syscall(void); -__visible const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = { +__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = { /* * Smells like a compiler bug -- it doesn't work * when the & below is removed. */ - [0 ... __NR_syscall_max] = &sys_ni_syscall, + [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall, #include <asm/syscalls_32.h> }; diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c index 4ff5d162ff9f..277341128b47 100644 --- a/arch/x86/kernel/traps.c +++ b/arch/x86/kernel/traps.c @@ -174,8 +174,8 @@ void ist_begin_non_atomic(struct pt_regs *regs) * will catch asm bugs and any attempt to use ist_preempt_enable * from double_fault. */ - BUG_ON(((current_stack_pointer() ^ this_cpu_read_stable(kernel_stack)) - & ~(THREAD_SIZE - 1)) != 0); + BUG_ON((unsigned long)(current_top_of_stack() - + current_stack_pointer()) >= THREAD_SIZE); preempt_count_sub(HARDIRQ_OFFSET); } @@ -925,9 +925,21 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code) /* Set of traps needed for early debugging. */ void __init early_trap_init(void) { - set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); + /* + * Don't use IST to set DEBUG_STACK as it doesn't work until TSS + * is ready in cpu_init() <-- trap_init(). Before trap_init(), + * CPU runs at ring 0 so it is impossible to hit an invalid + * stack. Using the original stack works well enough at this + * early stage. DEBUG_STACK will be equipped after cpu_init() in + * trap_init(). + * + * We don't need to set trace_idt_table like set_intr_gate(), + * since we don't have trace_debug and it will be reset to + * 'debug' in trap_init() by set_intr_gate_ist(). + */ + set_intr_gate_notrace(X86_TRAP_DB, debug); /* int3 can be called from all */ - set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); + set_system_intr_gate(X86_TRAP_BP, &int3); #ifdef CONFIG_X86_32 set_intr_gate(X86_TRAP_PF, page_fault); #endif @@ -1005,6 +1017,15 @@ void __init trap_init(void) */ cpu_init(); + /* + * X86_TRAP_DB and X86_TRAP_BP have been set + * in early_trap_init(). However, ITS works only after + * cpu_init() loads TSS. See comments in early_trap_init(). + */ + set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK); + /* int3 can be called from all */ + set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK); + x86_init.irqs.trap_init(); #ifdef CONFIG_X86_64 diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c index e8edcf52e069..fc9db6ef2a95 100644 --- a/arch/x86/kernel/vm86_32.c +++ b/arch/x86/kernel/vm86_32.c @@ -150,7 +150,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs) do_exit(SIGSEGV); } - tss = &per_cpu(init_tss, get_cpu()); + tss = &per_cpu(cpu_tss, get_cpu()); current->thread.sp0 = current->thread.saved_sp0; current->thread.sysenter_cs = __KERNEL_CS; load_sp0(tss, ¤t->thread); @@ -318,7 +318,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk tsk->thread.saved_fs = info->regs32->fs; tsk->thread.saved_gs = get_user_gs(info->regs32); - tss = &per_cpu(init_tss, get_cpu()); + tss = &per_cpu(cpu_tss, get_cpu()); tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0; if (cpu_has_sep) tsk->thread.sysenter_cs = 0; diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index ac4453d8520e..8561585ee2c6 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -1076,6 +1076,7 @@ static void lguest_load_sp0(struct tss_struct *tss, { lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0, THREAD_SIZE / PAGE_SIZE); + tss->x86_tss.sp0 = thread->sp0; } /* Let's just say, I wouldn't do debugging under a Guest. */ diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S index f5cc9eb1d51b..082a85167a5b 100644 --- a/arch/x86/lib/atomic64_cx8_32.S +++ b/arch/x86/lib/atomic64_cx8_32.S @@ -13,16 +13,6 @@ #include <asm/alternative-asm.h> #include <asm/dwarf2.h> -.macro SAVE reg - pushl_cfi %\reg - CFI_REL_OFFSET \reg, 0 -.endm - -.macro RESTORE reg - popl_cfi %\reg - CFI_RESTORE \reg -.endm - .macro read64 reg movl %ebx, %eax movl %ecx, %edx @@ -67,10 +57,10 @@ ENDPROC(atomic64_xchg_cx8) .macro addsub_return func ins insc ENTRY(atomic64_\func\()_return_cx8) CFI_STARTPROC - SAVE ebp - SAVE ebx - SAVE esi - SAVE edi + pushl_cfi_reg ebp + pushl_cfi_reg ebx + pushl_cfi_reg esi + pushl_cfi_reg edi movl %eax, %esi movl %edx, %edi @@ -89,10 +79,10 @@ ENTRY(atomic64_\func\()_return_cx8) 10: movl %ebx, %eax movl %ecx, %edx - RESTORE edi - RESTORE esi - RESTORE ebx - RESTORE ebp + popl_cfi_reg edi + popl_cfi_reg esi + popl_cfi_reg ebx + popl_cfi_reg ebp ret CFI_ENDPROC ENDPROC(atomic64_\func\()_return_cx8) @@ -104,7 +94,7 @@ addsub_return sub sub sbb .macro incdec_return func ins insc ENTRY(atomic64_\func\()_return_cx8) CFI_STARTPROC - SAVE ebx + pushl_cfi_reg ebx read64 %esi 1: @@ -119,7 +109,7 @@ ENTRY(atomic64_\func\()_return_cx8) 10: movl %ebx, %eax movl %ecx, %edx - RESTORE ebx + popl_cfi_reg ebx ret CFI_ENDPROC ENDPROC(atomic64_\func\()_return_cx8) @@ -130,7 +120,7 @@ incdec_return dec sub sbb ENTRY(atomic64_dec_if_positive_cx8) CFI_STARTPROC - SAVE ebx + pushl_cfi_reg ebx read64 %esi 1: @@ -146,18 +136,18 @@ ENTRY(atomic64_dec_if_positive_cx8) 2: movl %ebx, %eax movl %ecx, %edx - RESTORE ebx + popl_cfi_reg ebx ret CFI_ENDPROC ENDPROC(atomic64_dec_if_positive_cx8) ENTRY(atomic64_add_unless_cx8) CFI_STARTPROC - SAVE ebp - SAVE ebx + pushl_cfi_reg ebp + pushl_cfi_reg ebx /* these just push these two parameters on the stack */ - SAVE edi - SAVE ecx + pushl_cfi_reg edi + pushl_cfi_reg ecx movl %eax, %ebp movl %edx, %edi @@ -179,8 +169,8 @@ ENTRY(atomic64_add_unless_cx8) 3: addl $8, %esp CFI_ADJUST_CFA_OFFSET -8 - RESTORE ebx - RESTORE ebp + popl_cfi_reg ebx + popl_cfi_reg ebp ret 4: cmpl %edx, 4(%esp) @@ -192,7 +182,7 @@ ENDPROC(atomic64_add_unless_cx8) ENTRY(atomic64_inc_not_zero_cx8) CFI_STARTPROC - SAVE ebx + pushl_cfi_reg ebx read64 %esi 1: @@ -209,7 +199,7 @@ ENTRY(atomic64_inc_not_zero_cx8) movl $1, %eax 3: - RESTORE ebx + popl_cfi_reg ebx ret CFI_ENDPROC ENDPROC(atomic64_inc_not_zero_cx8) diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S index e78b8eee6615..9bc944a91274 100644 --- a/arch/x86/lib/checksum_32.S +++ b/arch/x86/lib/checksum_32.S @@ -51,10 +51,8 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum) */ ENTRY(csum_partial) CFI_STARTPROC - pushl_cfi %esi - CFI_REL_OFFSET esi, 0 - pushl_cfi %ebx - CFI_REL_OFFSET ebx, 0 + pushl_cfi_reg esi + pushl_cfi_reg ebx movl 20(%esp),%eax # Function arg: unsigned int sum movl 16(%esp),%ecx # Function arg: int len movl 12(%esp),%esi # Function arg: unsigned char *buff @@ -127,14 +125,12 @@ ENTRY(csum_partial) 6: addl %ecx,%eax adcl $0, %eax 7: - testl $1, 12(%esp) + testb $1, 12(%esp) jz 8f roll $8, %eax 8: - popl_cfi %ebx - CFI_RESTORE ebx - popl_cfi %esi - CFI_RESTORE esi + popl_cfi_reg ebx + popl_cfi_reg esi ret CFI_ENDPROC ENDPROC(csum_partial) @@ -145,10 +141,8 @@ ENDPROC(csum_partial) ENTRY(csum_partial) CFI_STARTPROC - pushl_cfi %esi - CFI_REL_OFFSET esi, 0 - pushl_cfi %ebx - CFI_REL_OFFSET ebx, 0 + pushl_cfi_reg esi + pushl_cfi_reg ebx movl 20(%esp),%eax # Function arg: unsigned int sum movl 16(%esp),%ecx # Function arg: int len movl 12(%esp),%esi # Function arg: const unsigned char *buf @@ -251,14 +245,12 @@ ENTRY(csum_partial) addl %ebx,%eax adcl $0,%eax 80: - testl $1, 12(%esp) + testb $1, 12(%esp) jz 90f roll $8, %eax 90: - popl_cfi %ebx - CFI_RESTORE ebx - popl_cfi %esi - CFI_RESTORE esi + popl_cfi_reg ebx + popl_cfi_reg esi ret CFI_ENDPROC ENDPROC(csum_partial) @@ -298,12 +290,9 @@ ENTRY(csum_partial_copy_generic) CFI_STARTPROC subl $4,%esp CFI_ADJUST_CFA_OFFSET 4 - pushl_cfi %edi - CFI_REL_OFFSET edi, 0 - pushl_cfi %esi - CFI_REL_OFFSET esi, 0 - pushl_cfi %ebx - CFI_REL_OFFSET ebx, 0 + pushl_cfi_reg edi + pushl_cfi_reg esi + pushl_cfi_reg ebx movl ARGBASE+16(%esp),%eax # sum movl ARGBASE+12(%esp),%ecx # len movl ARGBASE+4(%esp),%esi # src @@ -412,12 +401,9 @@ DST( movb %cl, (%edi) ) .previous - popl_cfi %ebx - CFI_RESTORE ebx - popl_cfi %esi - CFI_RESTORE esi - popl_cfi %edi - CFI_RESTORE edi + popl_cfi_reg ebx + popl_cfi_reg esi + popl_cfi_reg edi popl_cfi %ecx # equivalent to addl $4,%esp ret CFI_ENDPROC @@ -441,12 +427,9 @@ ENDPROC(csum_partial_copy_generic) ENTRY(csum_partial_copy_generic) CFI_STARTPROC - pushl_cfi %ebx - CFI_REL_OFFSET ebx, 0 - pushl_cfi %edi - CFI_REL_OFFSET edi, 0 - pushl_cfi %esi - CFI_REL_OFFSET esi, 0 + pushl_cfi_reg ebx + pushl_cfi_reg edi + pushl_cfi_reg esi movl ARGBASE+4(%esp),%esi #src movl ARGBASE+8(%esp),%edi #dst movl ARGBASE+12(%esp),%ecx #len @@ -506,12 +489,9 @@ DST( movb %dl, (%edi) ) jmp 7b .previous - popl_cfi %esi - CFI_RESTORE esi - popl_cfi %edi - CFI_RESTORE edi - popl_cfi %ebx - CFI_RESTORE ebx + popl_cfi_reg esi + popl_cfi_reg edi + popl_cfi_reg ebx ret CFI_ENDPROC ENDPROC(csum_partial_copy_generic) diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S index f2145cfa12a6..e67e579c93bd 100644 --- a/arch/x86/lib/clear_page_64.S +++ b/arch/x86/lib/clear_page_64.S @@ -1,31 +1,35 @@ #include <linux/linkage.h> #include <asm/dwarf2.h> +#include <asm/cpufeature.h> #include <asm/alternative-asm.h> /* - * Zero a page. - * rdi page - */ -ENTRY(clear_page_c) + * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is + * recommended to use this when possible and we do use them by default. + * If enhanced REP MOVSB/STOSB is not available, try to use fast string. + * Otherwise, use original. + */ + +/* + * Zero a page. + * %rdi - page + */ +ENTRY(clear_page) CFI_STARTPROC + + ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \ + "jmp clear_page_c_e", X86_FEATURE_ERMS + movl $4096/8,%ecx xorl %eax,%eax rep stosq ret CFI_ENDPROC -ENDPROC(clear_page_c) +ENDPROC(clear_page) -ENTRY(clear_page_c_e) +ENTRY(clear_page_orig) CFI_STARTPROC - movl $4096,%ecx - xorl %eax,%eax - rep stosb - ret - CFI_ENDPROC -ENDPROC(clear_page_c_e) -ENTRY(clear_page) - CFI_STARTPROC xorl %eax,%eax movl $4096/64,%ecx .p2align 4 @@ -45,29 +49,13 @@ ENTRY(clear_page) nop ret CFI_ENDPROC -.Lclear_page_end: -ENDPROC(clear_page) - - /* - * Some CPUs support enhanced REP MOVSB/STOSB instructions. - * It is recommended to use this when possible. - * If enhanced REP MOVSB/STOSB is not available, try to use fast string. - * Otherwise, use original function. - * - */ +ENDPROC(clear_page_orig) -#include <asm/cpufeature.h> - - .section .altinstr_replacement,"ax" -1: .byte 0xeb /* jmp <disp8> */ - .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */ -2: .byte 0xeb /* jmp <disp8> */ - .byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */ -3: - .previous - .section .altinstructions,"a" - altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\ - .Lclear_page_end-clear_page, 2b-1b - altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \ - .Lclear_page_end-clear_page,3b-2b - .previous +ENTRY(clear_page_c_e) + CFI_STARTPROC + movl $4096,%ecx + xorl %eax,%eax + rep stosb + ret + CFI_ENDPROC +ENDPROC(clear_page_c_e) diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S index 176cca67212b..8239dbcbf984 100644 --- a/arch/x86/lib/copy_page_64.S +++ b/arch/x86/lib/copy_page_64.S @@ -2,23 +2,26 @@ #include <linux/linkage.h> #include <asm/dwarf2.h> +#include <asm/cpufeature.h> #include <asm/alternative-asm.h> +/* + * Some CPUs run faster using the string copy instructions (sane microcode). + * It is also a lot simpler. Use this when possible. But, don't use streaming + * copy unless the CPU indicates X86_FEATURE_REP_GOOD. Could vary the + * prefetch distance based on SMP/UP. + */ ALIGN -copy_page_rep: +ENTRY(copy_page) CFI_STARTPROC + ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD movl $4096/8, %ecx rep movsq ret CFI_ENDPROC -ENDPROC(copy_page_rep) - -/* - * Don't use streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD. - * Could vary the prefetch distance based on SMP/UP. -*/ +ENDPROC(copy_page) -ENTRY(copy_page) +ENTRY(copy_page_regs) CFI_STARTPROC subq $2*8, %rsp CFI_ADJUST_CFA_OFFSET 2*8 @@ -90,21 +93,5 @@ ENTRY(copy_page) addq $2*8, %rsp CFI_ADJUST_CFA_OFFSET -2*8 ret -.Lcopy_page_end: CFI_ENDPROC -ENDPROC(copy_page) - - /* Some CPUs run faster using the string copy instructions. - It is also a lot simpler. Use this when possible */ - -#include <asm/cpufeature.h> - - .section .altinstr_replacement,"ax" -1: .byte 0xeb /* jmp <disp8> */ - .byte (copy_page_rep - copy_page) - (2f - 1b) /* offset */ -2: - .previous - .section .altinstructions,"a" - altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD, \ - .Lcopy_page_end-copy_page, 2b-1b - .previous +ENDPROC(copy_page_regs) diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S index dee945d55594..fa997dfaef24 100644 --- a/arch/x86/lib/copy_user_64.S +++ b/arch/x86/lib/copy_user_64.S @@ -8,9 +8,6 @@ #include <linux/linkage.h> #include <asm/dwarf2.h> - -#define FIX_ALIGNMENT 1 - #include <asm/current.h> #include <asm/asm-offsets.h> #include <asm/thread_info.h> @@ -19,33 +16,7 @@ #include <asm/asm.h> #include <asm/smap.h> -/* - * By placing feature2 after feature1 in altinstructions section, we logically - * implement: - * If CPU has feature2, jmp to alt2 is used - * else if CPU has feature1, jmp to alt1 is used - * else jmp to orig is used. - */ - .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2 -0: - .byte 0xe9 /* 32bit jump */ - .long \orig-1f /* by default jump to orig */ -1: - .section .altinstr_replacement,"ax" -2: .byte 0xe9 /* near jump with 32bit immediate */ - .long \alt1-1b /* offset */ /* or alternatively to alt1 */ -3: .byte 0xe9 /* near jump with 32bit immediate */ - .long \alt2-1b /* offset */ /* or alternatively to alt2 */ - .previous - - .section .altinstructions,"a" - altinstruction_entry 0b,2b,\feature1,5,5 - altinstruction_entry 0b,3b,\feature2,5,5 - .previous - .endm - .macro ALIGN_DESTINATION -#ifdef FIX_ALIGNMENT /* check for bad alignment of destination */ movl %edi,%ecx andl $7,%ecx @@ -67,7 +38,6 @@ _ASM_EXTABLE(100b,103b) _ASM_EXTABLE(101b,103b) -#endif .endm /* Standard copy_to_user with segment limit checking */ @@ -79,9 +49,11 @@ ENTRY(_copy_to_user) jc bad_to_user cmpq TI_addr_limit(%rax),%rcx ja bad_to_user - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ - copy_user_generic_unrolled,copy_user_generic_string, \ - copy_user_enhanced_fast_string + ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \ + "jmp copy_user_generic_string", \ + X86_FEATURE_REP_GOOD, \ + "jmp copy_user_enhanced_fast_string", \ + X86_FEATURE_ERMS CFI_ENDPROC ENDPROC(_copy_to_user) @@ -94,9 +66,11 @@ ENTRY(_copy_from_user) jc bad_from_user cmpq TI_addr_limit(%rax),%rcx ja bad_from_user - ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \ - copy_user_generic_unrolled,copy_user_generic_string, \ - copy_user_enhanced_fast_string + ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \ + "jmp copy_user_generic_string", \ + X86_FEATURE_REP_GOOD, \ + "jmp copy_user_enhanced_fast_string", \ + X86_FEATURE_ERMS CFI_ENDPROC ENDPROC(_copy_from_user) diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S index 2419d5fefae3..9734182966f3 100644 --- a/arch/x86/lib/csum-copy_64.S +++ b/arch/x86/lib/csum-copy_64.S @@ -196,7 +196,7 @@ ENTRY(csum_partial_copy_generic) /* handle last odd byte */ .Lhandle_1: - testl $1, %r10d + testb $1, %r10b jz .Lende xorl %ebx, %ebx source diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c index 1313ae6b478b..8f72b334aea0 100644 --- a/arch/x86/lib/insn.c +++ b/arch/x86/lib/insn.c @@ -52,6 +52,13 @@ */ void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64) { + /* + * Instructions longer than MAX_INSN_SIZE (15 bytes) are invalid + * even if the input buffer is long enough to hold them. + */ + if (buf_len > MAX_INSN_SIZE) + buf_len = MAX_INSN_SIZE; + memset(insn, 0, sizeof(*insn)); insn->kaddr = kaddr; insn->end_kaddr = kaddr + buf_len; @@ -164,6 +171,12 @@ found: /* VEX.W overrides opnd_size */ insn->opnd_bytes = 8; } else { + /* + * For VEX2, fake VEX3-like byte#2. + * Makes it easier to decode vex.W, vex.vvvv, + * vex.L and vex.pp. Masking with 0x7f sets vex.W == 0. + */ + insn->vex_prefix.bytes[2] = b2 & 0x7f; insn->vex_prefix.nbytes = 2; insn->next_byte += 2; } diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S index 89b53c9968e7..b046664f5a1c 100644 --- a/arch/x86/lib/memcpy_64.S +++ b/arch/x86/lib/memcpy_64.S @@ -1,12 +1,20 @@ /* Copyright 2002 Andi Kleen */ #include <linux/linkage.h> - #include <asm/cpufeature.h> #include <asm/dwarf2.h> #include <asm/alternative-asm.h> /* + * We build a jump to memcpy_orig by default which gets NOPped out on + * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which + * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs + * to a jmp to memcpy_erms which does the REP; MOVSB mem copy. + */ + +.weak memcpy + +/* * memcpy - Copy a memory block. * * Input: @@ -17,15 +25,11 @@ * Output: * rax original destination */ +ENTRY(__memcpy) +ENTRY(memcpy) + ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \ + "jmp memcpy_erms", X86_FEATURE_ERMS -/* - * memcpy_c() - fast string ops (REP MOVSQ) based variant. - * - * This gets patched over the unrolled variant (below) via the - * alternative instructions framework: - */ - .section .altinstr_replacement, "ax", @progbits -.Lmemcpy_c: movq %rdi, %rax movq %rdx, %rcx shrq $3, %rcx @@ -34,29 +38,21 @@ movl %edx, %ecx rep movsb ret -.Lmemcpy_e: - .previous +ENDPROC(memcpy) +ENDPROC(__memcpy) /* - * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than - * memcpy_c. Use memcpy_c_e when possible. - * - * This gets patched over the unrolled variant (below) via the - * alternative instructions framework: + * memcpy_erms() - enhanced fast string memcpy. This is faster and + * simpler than memcpy. Use memcpy_erms when possible. */ - .section .altinstr_replacement, "ax", @progbits -.Lmemcpy_c_e: +ENTRY(memcpy_erms) movq %rdi, %rax movq %rdx, %rcx rep movsb ret -.Lmemcpy_e_e: - .previous - -.weak memcpy +ENDPROC(memcpy_erms) -ENTRY(__memcpy) -ENTRY(memcpy) +ENTRY(memcpy_orig) CFI_STARTPROC movq %rdi, %rax @@ -183,26 +179,4 @@ ENTRY(memcpy) .Lend: retq CFI_ENDPROC -ENDPROC(memcpy) -ENDPROC(__memcpy) - - /* - * Some CPUs are adding enhanced REP MOVSB/STOSB feature - * If the feature is supported, memcpy_c_e() is the first choice. - * If enhanced rep movsb copy is not available, use fast string copy - * memcpy_c() when possible. This is faster and code is simpler than - * original memcpy(). - * Otherwise, original memcpy() is used. - * In .altinstructions section, ERMS feature is placed after REG_GOOD - * feature to implement the right patch order. - * - * Replace only beginning, memcpy is used to apply alternatives, - * so it is silly to overwrite itself with nops - reboot is the - * only outcome... - */ - .section .altinstructions, "a" - altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\ - .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c - altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \ - .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e - .previous +ENDPROC(memcpy_orig) diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S index 9c4b530575da..0f8a0d0331b9 100644 --- a/arch/x86/lib/memmove_64.S +++ b/arch/x86/lib/memmove_64.S @@ -5,7 +5,6 @@ * This assembly file is re-written from memmove_64.c file. * - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com> */ -#define _STRING_C #include <linux/linkage.h> #include <asm/dwarf2.h> #include <asm/cpufeature.h> @@ -44,6 +43,8 @@ ENTRY(__memmove) jg 2f .Lmemmove_begin_forward: + ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS + /* * movsq instruction have many startup latency * so we handle small size by general register. @@ -207,21 +208,5 @@ ENTRY(__memmove) 13: retq CFI_ENDPROC - - .section .altinstr_replacement,"ax" -.Lmemmove_begin_forward_efs: - /* Forward moving data. */ - movq %rdx, %rcx - rep movsb - retq -.Lmemmove_end_forward_efs: - .previous - - .section .altinstructions,"a" - altinstruction_entry .Lmemmove_begin_forward, \ - .Lmemmove_begin_forward_efs,X86_FEATURE_ERMS, \ - .Lmemmove_end_forward-.Lmemmove_begin_forward, \ - .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs - .previous ENDPROC(__memmove) ENDPROC(memmove) diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S index 6f44935c6a60..93118fb23976 100644 --- a/arch/x86/lib/memset_64.S +++ b/arch/x86/lib/memset_64.S @@ -5,19 +5,30 @@ #include <asm/cpufeature.h> #include <asm/alternative-asm.h> +.weak memset + /* * ISO C memset - set a memory block to a byte value. This function uses fast * string to get better performance than the original function. The code is * simpler and shorter than the orignal function as well. - * + * * rdi destination - * rsi value (char) - * rdx count (bytes) - * + * rsi value (char) + * rdx count (bytes) + * * rax original destination - */ - .section .altinstr_replacement, "ax", @progbits -.Lmemset_c: + */ +ENTRY(memset) +ENTRY(__memset) + /* + * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended + * to use it when possible. If not available, use fast string instructions. + * + * Otherwise, use original memset function. + */ + ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \ + "jmp memset_erms", X86_FEATURE_ERMS + movq %rdi,%r9 movq %rdx,%rcx andl $7,%edx @@ -31,8 +42,8 @@ rep stosb movq %r9,%rax ret -.Lmemset_e: - .previous +ENDPROC(memset) +ENDPROC(__memset) /* * ISO C memset - set a memory block to a byte value. This function uses @@ -45,21 +56,16 @@ * * rax original destination */ - .section .altinstr_replacement, "ax", @progbits -.Lmemset_c_e: +ENTRY(memset_erms) movq %rdi,%r9 movb %sil,%al movq %rdx,%rcx rep stosb movq %r9,%rax ret -.Lmemset_e_e: - .previous - -.weak memset +ENDPROC(memset_erms) -ENTRY(memset) -ENTRY(__memset) +ENTRY(memset_orig) CFI_STARTPROC movq %rdi,%r10 @@ -134,23 +140,4 @@ ENTRY(__memset) jmp .Lafter_bad_alignment .Lfinal: CFI_ENDPROC -ENDPROC(memset) -ENDPROC(__memset) - - /* Some CPUs support enhanced REP MOVSB/STOSB feature. - * It is recommended to use this when possible. - * - * If enhanced REP MOVSB/STOSB feature is not available, use fast string - * instructions. - * - * Otherwise, use original memset function. - * - * In .altinstructions section, ERMS feature is placed after REG_GOOD - * feature to implement the right patch order. - */ - .section .altinstructions,"a" - altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\ - .Lfinal-__memset,.Lmemset_e-.Lmemset_c - altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \ - .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e - .previous +ENDPROC(memset_orig) diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S index f6d13eefad10..3ca5218fbece 100644 --- a/arch/x86/lib/msr-reg.S +++ b/arch/x86/lib/msr-reg.S @@ -14,8 +14,8 @@ .macro op_safe_regs op ENTRY(\op\()_safe_regs) CFI_STARTPROC - pushq_cfi %rbx - pushq_cfi %rbp + pushq_cfi_reg rbx + pushq_cfi_reg rbp movq %rdi, %r10 /* Save pointer */ xorl %r11d, %r11d /* Return value */ movl (%rdi), %eax @@ -35,8 +35,8 @@ ENTRY(\op\()_safe_regs) movl %ebp, 20(%r10) movl %esi, 24(%r10) movl %edi, 28(%r10) - popq_cfi %rbp - popq_cfi %rbx + popq_cfi_reg rbp + popq_cfi_reg rbx ret 3: CFI_RESTORE_STATE @@ -53,10 +53,10 @@ ENDPROC(\op\()_safe_regs) .macro op_safe_regs op ENTRY(\op\()_safe_regs) CFI_STARTPROC - pushl_cfi %ebx - pushl_cfi %ebp - pushl_cfi %esi - pushl_cfi %edi + pushl_cfi_reg ebx + pushl_cfi_reg ebp + pushl_cfi_reg esi + pushl_cfi_reg edi pushl_cfi $0 /* Return value */ pushl_cfi %eax movl 4(%eax), %ecx @@ -80,10 +80,10 @@ ENTRY(\op\()_safe_regs) movl %esi, 24(%eax) movl %edi, 28(%eax) popl_cfi %eax - popl_cfi %edi - popl_cfi %esi - popl_cfi %ebp - popl_cfi %ebx + popl_cfi_reg edi + popl_cfi_reg esi + popl_cfi_reg ebp + popl_cfi_reg ebx ret 3: CFI_RESTORE_STATE diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S index 5dff5f042468..2322abe4da3b 100644 --- a/arch/x86/lib/rwsem.S +++ b/arch/x86/lib/rwsem.S @@ -34,10 +34,10 @@ */ #define save_common_regs \ - pushl_cfi %ecx; CFI_REL_OFFSET ecx, 0 + pushl_cfi_reg ecx #define restore_common_regs \ - popl_cfi %ecx; CFI_RESTORE ecx + popl_cfi_reg ecx /* Avoid uglifying the argument copying x86-64 needs to do. */ .macro movq src, dst @@ -64,22 +64,22 @@ */ #define save_common_regs \ - pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \ - pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \ - pushq_cfi %rcx; CFI_REL_OFFSET rcx, 0; \ - pushq_cfi %r8; CFI_REL_OFFSET r8, 0; \ - pushq_cfi %r9; CFI_REL_OFFSET r9, 0; \ - pushq_cfi %r10; CFI_REL_OFFSET r10, 0; \ - pushq_cfi %r11; CFI_REL_OFFSET r11, 0 + pushq_cfi_reg rdi; \ + pushq_cfi_reg rsi; \ + pushq_cfi_reg rcx; \ + pushq_cfi_reg r8; \ + pushq_cfi_reg r9; \ + pushq_cfi_reg r10; \ + pushq_cfi_reg r11 #define restore_common_regs \ - popq_cfi %r11; CFI_RESTORE r11; \ - popq_cfi %r10; CFI_RESTORE r10; \ - popq_cfi %r9; CFI_RESTORE r9; \ - popq_cfi %r8; CFI_RESTORE r8; \ - popq_cfi %rcx; CFI_RESTORE rcx; \ - popq_cfi %rsi; CFI_RESTORE rsi; \ - popq_cfi %rdi; CFI_RESTORE rdi + popq_cfi_reg r11; \ + popq_cfi_reg r10; \ + popq_cfi_reg r9; \ + popq_cfi_reg r8; \ + popq_cfi_reg rcx; \ + popq_cfi_reg rsi; \ + popq_cfi_reg rdi #endif @@ -87,12 +87,10 @@ ENTRY(call_rwsem_down_read_failed) CFI_STARTPROC save_common_regs - __ASM_SIZE(push,_cfi) %__ASM_REG(dx) - CFI_REL_OFFSET __ASM_REG(dx), 0 + __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx) movq %rax,%rdi call rwsem_down_read_failed - __ASM_SIZE(pop,_cfi) %__ASM_REG(dx) - CFI_RESTORE __ASM_REG(dx) + __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx) restore_common_regs ret CFI_ENDPROC @@ -124,12 +122,10 @@ ENDPROC(call_rwsem_wake) ENTRY(call_rwsem_downgrade_wake) CFI_STARTPROC save_common_regs - __ASM_SIZE(push,_cfi) %__ASM_REG(dx) - CFI_REL_OFFSET __ASM_REG(dx), 0 + __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx) movq %rax,%rdi call rwsem_downgrade_wake - __ASM_SIZE(pop,_cfi) %__ASM_REG(dx) - CFI_RESTORE __ASM_REG(dx) + __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx) restore_common_regs ret CFI_ENDPROC diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S index e28cdaf5ac2c..5eb715087b80 100644 --- a/arch/x86/lib/thunk_32.S +++ b/arch/x86/lib/thunk_32.S @@ -13,12 +13,9 @@ .globl \name \name: CFI_STARTPROC - pushl_cfi %eax - CFI_REL_OFFSET eax, 0 - pushl_cfi %ecx - CFI_REL_OFFSET ecx, 0 - pushl_cfi %edx - CFI_REL_OFFSET edx, 0 + pushl_cfi_reg eax + pushl_cfi_reg ecx + pushl_cfi_reg edx .if \put_ret_addr_in_eax /* Place EIP in the arg1 */ @@ -26,12 +23,9 @@ .endif call \func - popl_cfi %edx - CFI_RESTORE edx - popl_cfi %ecx - CFI_RESTORE ecx - popl_cfi %eax - CFI_RESTORE eax + popl_cfi_reg edx + popl_cfi_reg ecx + popl_cfi_reg eax ret CFI_ENDPROC _ASM_NOKPROBE(\name) diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S index b30b5ebd614a..f89ba4e93025 100644 --- a/arch/x86/lib/thunk_64.S +++ b/arch/x86/lib/thunk_64.S @@ -17,9 +17,18 @@ CFI_STARTPROC /* this one pushes 9 elems, the next one would be %rIP */ - SAVE_ARGS + pushq_cfi_reg rdi + pushq_cfi_reg rsi + pushq_cfi_reg rdx + pushq_cfi_reg rcx + pushq_cfi_reg rax + pushq_cfi_reg r8 + pushq_cfi_reg r9 + pushq_cfi_reg r10 + pushq_cfi_reg r11 .if \put_ret_addr_in_rdi + /* 9*8(%rsp) is return addr on stack */ movq_cfi_restore 9*8, rdi .endif @@ -45,11 +54,22 @@ #endif #endif - /* SAVE_ARGS below is used only for the .cfi directives it contains. */ +#if defined(CONFIG_TRACE_IRQFLAGS) \ + || defined(CONFIG_DEBUG_LOCK_ALLOC) \ + || defined(CONFIG_PREEMPT) CFI_STARTPROC - SAVE_ARGS + CFI_ADJUST_CFA_OFFSET 9*8 restore: - RESTORE_ARGS + popq_cfi_reg r11 + popq_cfi_reg r10 + popq_cfi_reg r9 + popq_cfi_reg r8 + popq_cfi_reg rax + popq_cfi_reg rcx + popq_cfi_reg rdx + popq_cfi_reg rsi + popq_cfi_reg rdi ret CFI_ENDPROC _ASM_NOKPROBE(restore) +#endif diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt index 1a2be7c6895d..816488c0b97e 100644 --- a/arch/x86/lib/x86-opcode-map.txt +++ b/arch/x86/lib/x86-opcode-map.txt @@ -273,6 +273,9 @@ dd: ESC de: ESC df: ESC # 0xe0 - 0xef +# Note: "forced64" is Intel CPU behavior: they ignore 0x66 prefix +# in 64-bit mode. AMD CPUs accept 0x66 prefix, it causes RIP truncation +# to 16 bits. In 32-bit mode, 0x66 is accepted by both Intel and AMD. e0: LOOPNE/LOOPNZ Jb (f64) e1: LOOPE/LOOPZ Jb (f64) e2: LOOP Jb (f64) @@ -281,6 +284,10 @@ e4: IN AL,Ib e5: IN eAX,Ib e6: OUT Ib,AL e7: OUT Ib,eAX +# With 0x66 prefix in 64-bit mode, for AMD CPUs immediate offset +# in "near" jumps and calls is 16-bit. For CALL, +# push of return address is 16-bit wide, RSP is decremented by 2 +# but is not truncated to 16 bits, unlike RIP. e8: CALL Jz (f64) e9: JMP-near Jz (f64) ea: JMP-far Ap (i64) @@ -456,6 +463,7 @@ AVXcode: 1 7e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1) 7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqu Wx,Vx (F3) # 0x0f 0x80-0x8f +# Note: "forced64" is Intel CPU behavior (see comment about CALL insn). 80: JO Jz (f64) 81: JNO Jz (f64) 82: JB/JC/JNAE Jz (f64) @@ -842,6 +850,7 @@ EndTable GrpTable: Grp5 0: INC Ev 1: DEC Ev +# Note: "forced64" is Intel CPU behavior (see comment about CALL insn). 2: CALLN Ev (f64) 3: CALLF Ep 4: JMPN Ev (f64) diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c index ede025fb46f1..ae340d3761ca 100644 --- a/arch/x86/mm/fault.c +++ b/arch/x86/mm/fault.c @@ -148,7 +148,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr) instr = (void *)convert_ip_to_linear(current, regs); max_instr = instr + 15; - if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE) + if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX) return 0; while (instr < max_instr) { diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c index a110efca6d06..52417e771af9 100644 --- a/arch/x86/mm/init.c +++ b/arch/x86/mm/init.c @@ -179,7 +179,8 @@ static void __init probe_page_size_mask(void) if (cpu_has_pge) { cr4_set_bits_and_update_boot(X86_CR4_PGE); __supported_pte_mask |= _PAGE_GLOBAL; - } + } else + __supported_pte_mask &= ~_PAGE_GLOBAL; } #ifdef CONFIG_X86_32 diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c index 3e32ed5648a0..757678fb26e1 100644 --- a/arch/x86/power/cpu.c +++ b/arch/x86/power/cpu.c @@ -134,7 +134,7 @@ static void do_fpu_end(void) static void fix_processor_context(void) { int cpu = smp_processor_id(); - struct tss_struct *t = &per_cpu(init_tss, cpu); + struct tss_struct *t = &per_cpu(cpu_tss, cpu); #ifdef CONFIG_X86_64 struct desc_struct *desc = get_cpu_gdt_table(cpu); tss_desc tss; diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl index b3560ece1c9f..ef8187f9d28d 100644 --- a/arch/x86/syscalls/syscall_32.tbl +++ b/arch/x86/syscalls/syscall_32.tbl @@ -119,7 +119,7 @@ 110 i386 iopl sys_iopl 111 i386 vhangup sys_vhangup 112 i386 idle -113 i386 vm86old sys_vm86old sys32_vm86_warning +113 i386 vm86old sys_vm86old sys_ni_syscall 114 i386 wait4 sys_wait4 compat_sys_wait4 115 i386 swapoff sys_swapoff 116 i386 sysinfo sys_sysinfo compat_sys_sysinfo @@ -172,7 +172,7 @@ 163 i386 mremap sys_mremap 164 i386 setresuid sys_setresuid16 165 i386 getresuid sys_getresuid16 -166 i386 vm86 sys_vm86 sys32_vm86_warning +166 i386 vm86 sys_vm86 sys_ni_syscall 167 i386 query_module 168 i386 poll sys_poll 169 i386 nfsservctl diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl index 8d656fbb57aa..9ef32d5f1b19 100644 --- a/arch/x86/syscalls/syscall_64.tbl +++ b/arch/x86/syscalls/syscall_64.tbl @@ -178,7 +178,7 @@ 169 common reboot sys_reboot 170 common sethostname sys_sethostname 171 common setdomainname sys_setdomainname -172 common iopl stub_iopl +172 common iopl sys_iopl 173 common ioperm sys_ioperm 174 64 create_module 175 common init_module sys_init_module diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h index 2d7d9a1f5b53..8ffd2146fa6a 100644 --- a/arch/x86/um/asm/barrier.h +++ b/arch/x86/um/asm/barrier.h @@ -64,8 +64,8 @@ */ static inline void rdtsc_barrier(void) { - alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC); - alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC); + alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, + "lfence", X86_FEATURE_LFENCE_RDTSC); } #endif diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c index 5cdfa9db2217..a75d8700472a 100644 --- a/arch/x86/um/sys_call_table_64.c +++ b/arch/x86/um/sys_call_table_64.c @@ -16,7 +16,7 @@ */ /* Not going to be implemented by UML, since we have no hardware. */ -#define stub_iopl sys_ni_syscall +#define sys_iopl sys_ni_syscall #define sys_ioperm sys_ni_syscall /* diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c index 5240f563076d..81665c9f2132 100644 --- a/arch/x86/xen/enlighten.c +++ b/arch/x86/xen/enlighten.c @@ -912,6 +912,7 @@ static void xen_load_sp0(struct tss_struct *tss, mcs = xen_mc_entry(0); MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0); xen_mc_issue(PARAVIRT_LAZY_CPU); + tss->x86_tss.sp0 = thread->sp0; } static void xen_set_iopl_mask(unsigned mask) diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S index 53adefda4275..985fc3ee0973 100644 --- a/arch/x86/xen/xen-asm_64.S +++ b/arch/x86/xen/xen-asm_64.S @@ -68,11 +68,11 @@ ENTRY(xen_sysret64) * We're already on the usermode stack at this point, but * still with the kernel gs, so we can easily switch back */ - movq %rsp, PER_CPU_VAR(old_rsp) + movq %rsp, PER_CPU_VAR(rsp_scratch) movq PER_CPU_VAR(kernel_stack), %rsp pushq $__USER_DS - pushq PER_CPU_VAR(old_rsp) + pushq PER_CPU_VAR(rsp_scratch) pushq %r11 pushq $__USER_CS pushq %rcx @@ -87,11 +87,11 @@ ENTRY(xen_sysret32) * We're already on the usermode stack at this point, but * still with the kernel gs, so we can easily switch back */ - movq %rsp, PER_CPU_VAR(old_rsp) + movq %rsp, PER_CPU_VAR(rsp_scratch) movq PER_CPU_VAR(kernel_stack), %rsp pushq $__USER32_DS - pushq PER_CPU_VAR(old_rsp) + pushq PER_CPU_VAR(rsp_scratch) pushq %r11 pushq $__USER32_CS pushq %rcx diff --git a/include/linux/stddef.h b/include/linux/stddef.h index f4aec0e75c3a..076af437284d 100644 --- a/include/linux/stddef.h +++ b/include/linux/stddef.h @@ -19,3 +19,12 @@ enum { #define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER) #endif #endif + +/** + * offsetofend(TYPE, MEMBER) + * + * @TYPE: The type of the structure + * @MEMBER: The member within the structure to get the end offset of + */ +#define offsetofend(TYPE, MEMBER) \ + (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER)) diff --git a/include/linux/vfio.h b/include/linux/vfio.h index 2d67b8998fd8..049b2f497bc7 100644 --- a/include/linux/vfio.h +++ b/include/linux/vfio.h @@ -78,19 +78,6 @@ extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops); extern void vfio_unregister_iommu_driver( const struct vfio_iommu_driver_ops *ops); -/** - * offsetofend(TYPE, MEMBER) - * - * @TYPE: The type of the structure - * @MEMBER: The member within the structure to get the end offset of - * - * Simple helper macro for dealing with variable sized structures passed - * from user space. This allows us to easily determine if the provided - * structure is sized to include various fields. - */ -#define offsetofend(TYPE, MEMBER) \ - (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER)) - /* * External user API */ diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h index d66ab799b35f..8c0c1a2770c8 100644 --- a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h +++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h @@ -1,12 +1,12 @@ -MEMCPY_FN(__memcpy, +MEMCPY_FN(memcpy_orig, "x86-64-unrolled", "unrolled memcpy() in arch/x86/lib/memcpy_64.S") -MEMCPY_FN(memcpy_c, +MEMCPY_FN(__memcpy, "x86-64-movsq", "movsq-based memcpy() in arch/x86/lib/memcpy_64.S") -MEMCPY_FN(memcpy_c_e, +MEMCPY_FN(memcpy_erms, "x86-64-movsb", "movsb-based memcpy() in arch/x86/lib/memcpy_64.S") diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm.S b/tools/perf/bench/mem-memcpy-x86-64-asm.S index fcd9cf00600a..e4c2c30143b9 100644 --- a/tools/perf/bench/mem-memcpy-x86-64-asm.S +++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S @@ -1,8 +1,6 @@ #define memcpy MEMCPY /* don't hide glibc's memcpy() */ #define altinstr_replacement text #define globl p2align 4; .globl -#define Lmemcpy_c globl memcpy_c; memcpy_c -#define Lmemcpy_c_e globl memcpy_c_e; memcpy_c_e #include "../../../arch/x86/lib/memcpy_64.S" /* * We need to provide note.GNU-stack section, saying that we want diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c index db1d3a29d97f..d3dfb7936dcd 100644 --- a/tools/perf/bench/mem-memcpy.c +++ b/tools/perf/bench/mem-memcpy.c @@ -36,7 +36,7 @@ static const struct option options[] = { "Specify length of memory to copy. " "Available units: B, KB, MB, GB and TB (upper and lower)"), OPT_STRING('r', "routine", &routine, "default", - "Specify routine to copy"), + "Specify routine to copy, \"all\" runs all available routines"), OPT_INTEGER('i', "iterations", &iterations, "repeat memcpy() invocation this number of times"), OPT_BOOLEAN('c', "cycle", &use_cycle, @@ -135,55 +135,16 @@ struct bench_mem_info { const char *const *usage; }; -static int bench_mem_common(int argc, const char **argv, - const char *prefix __maybe_unused, - struct bench_mem_info *info) +static void __bench_mem_routine(struct bench_mem_info *info, int r_idx, size_t len, double totallen) { - int i; - size_t len; - double totallen; + const struct routine *r = &info->routines[r_idx]; double result_bps[2]; u64 result_cycle[2]; - argc = parse_options(argc, argv, options, - info->usage, 0); - - if (no_prefault && only_prefault) { - fprintf(stderr, "Invalid options: -o and -n are mutually exclusive\n"); - return 1; - } - - if (use_cycle) - init_cycle(); - - len = (size_t)perf_atoll((char *)length_str); - totallen = (double)len * iterations; - result_cycle[0] = result_cycle[1] = 0ULL; result_bps[0] = result_bps[1] = 0.0; - if ((s64)len <= 0) { - fprintf(stderr, "Invalid length:%s\n", length_str); - return 1; - } - - /* same to without specifying either of prefault and no-prefault */ - if (only_prefault && no_prefault) - only_prefault = no_prefault = false; - - for (i = 0; info->routines[i].name; i++) { - if (!strcmp(info->routines[i].name, routine)) - break; - } - if (!info->routines[i].name) { - printf("Unknown routine:%s\n", routine); - printf("Available routines...\n"); - for (i = 0; info->routines[i].name; i++) { - printf("\t%s ... %s\n", - info->routines[i].name, info->routines[i].desc); - } - return 1; - } + printf("Routine %s (%s)\n", r->name, r->desc); if (bench_format == BENCH_FORMAT_DEFAULT) printf("# Copying %s Bytes ...\n\n", length_str); @@ -191,28 +152,17 @@ static int bench_mem_common(int argc, const char **argv, if (!only_prefault && !no_prefault) { /* show both of results */ if (use_cycle) { - result_cycle[0] = - info->do_cycle(&info->routines[i], len, false); - result_cycle[1] = - info->do_cycle(&info->routines[i], len, true); + result_cycle[0] = info->do_cycle(r, len, false); + result_cycle[1] = info->do_cycle(r, len, true); } else { - result_bps[0] = - info->do_gettimeofday(&info->routines[i], - len, false); - result_bps[1] = - info->do_gettimeofday(&info->routines[i], - len, true); + result_bps[0] = info->do_gettimeofday(r, len, false); + result_bps[1] = info->do_gettimeofday(r, len, true); } } else { - if (use_cycle) { - result_cycle[pf] = - info->do_cycle(&info->routines[i], - len, only_prefault); - } else { - result_bps[pf] = - info->do_gettimeofday(&info->routines[i], - len, only_prefault); - } + if (use_cycle) + result_cycle[pf] = info->do_cycle(r, len, only_prefault); + else + result_bps[pf] = info->do_gettimeofday(r, len, only_prefault); } switch (bench_format) { @@ -265,6 +215,60 @@ static int bench_mem_common(int argc, const char **argv, die("unknown format: %d\n", bench_format); break; } +} + +static int bench_mem_common(int argc, const char **argv, + const char *prefix __maybe_unused, + struct bench_mem_info *info) +{ + int i; + size_t len; + double totallen; + + argc = parse_options(argc, argv, options, + info->usage, 0); + + if (no_prefault && only_prefault) { + fprintf(stderr, "Invalid options: -o and -n are mutually exclusive\n"); + return 1; + } + + if (use_cycle) + init_cycle(); + + len = (size_t)perf_atoll((char *)length_str); + totallen = (double)len * iterations; + + if ((s64)len <= 0) { + fprintf(stderr, "Invalid length:%s\n", length_str); + return 1; + } + + /* same to without specifying either of prefault and no-prefault */ + if (only_prefault && no_prefault) + only_prefault = no_prefault = false; + + if (!strncmp(routine, "all", 3)) { + for (i = 0; info->routines[i].name; i++) + __bench_mem_routine(info, i, len, totallen); + return 0; + } + + for (i = 0; info->routines[i].name; i++) { + if (!strcmp(info->routines[i].name, routine)) + break; + } + if (!info->routines[i].name) { + printf("Unknown routine:%s\n", routine); + printf("Available routines...\n"); + for (i = 0; info->routines[i].name; i++) { + printf("\t%s ... %s\n", + info->routines[i].name, info->routines[i].desc); + } + return 1; + } + + __bench_mem_routine(info, i, len, totallen); return 0; } diff --git a/tools/perf/bench/mem-memset-x86-64-asm-def.h b/tools/perf/bench/mem-memset-x86-64-asm-def.h index a71dff97c1f5..f02d028771d9 100644 --- a/tools/perf/bench/mem-memset-x86-64-asm-def.h +++ b/tools/perf/bench/mem-memset-x86-64-asm-def.h @@ -1,12 +1,12 @@ -MEMSET_FN(__memset, +MEMSET_FN(memset_orig, "x86-64-unrolled", "unrolled memset() in arch/x86/lib/memset_64.S") -MEMSET_FN(memset_c, +MEMSET_FN(__memset, "x86-64-stosq", "movsq-based memset() in arch/x86/lib/memset_64.S") -MEMSET_FN(memset_c_e, +MEMSET_FN(memset_erms, "x86-64-stosb", "movsb-based memset() in arch/x86/lib/memset_64.S") diff --git a/tools/perf/bench/mem-memset-x86-64-asm.S b/tools/perf/bench/mem-memset-x86-64-asm.S index 9e5af89ed13a..de278784c866 100644 --- a/tools/perf/bench/mem-memset-x86-64-asm.S +++ b/tools/perf/bench/mem-memset-x86-64-asm.S @@ -1,8 +1,6 @@ #define memset MEMSET /* don't hide glibc's memset() */ #define altinstr_replacement text #define globl p2align 4; .globl -#define Lmemset_c globl memset_c; memset_c -#define Lmemset_c_e globl memset_c_e; memset_c_e #include "../../../arch/x86/lib/memset_64.S" /* diff --git a/tools/perf/util/include/asm/alternative-asm.h b/tools/perf/util/include/asm/alternative-asm.h index 6789d788d494..3a3a0f16456a 100644 --- a/tools/perf/util/include/asm/alternative-asm.h +++ b/tools/perf/util/include/asm/alternative-asm.h @@ -4,5 +4,6 @@ /* Just disable it so we can build arch/x86/lib/memcpy_64.S for perf bench: */ #define altinstruction_entry # +#define ALTERNATIVE_2 # #endif |