summaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--arch/x86/boot/compressed/head_32.S3
-rw-r--r--arch/x86/boot/compressed/head_64.S3
-rw-r--r--arch/x86/ia32/Makefile1
-rw-r--r--arch/x86/ia32/ia32_signal.c2
-rw-r--r--arch/x86/ia32/ia32entry.S345
-rw-r--r--arch/x86/ia32/nosyscall.c7
-rw-r--r--arch/x86/ia32/sys_ia32.c14
-rw-r--r--arch/x86/ia32/syscall_ia32.c25
-rw-r--r--arch/x86/include/asm/alternative-asm.h43
-rw-r--r--arch/x86/include/asm/alternative.h65
-rw-r--r--arch/x86/include/asm/apic.h2
-rw-r--r--arch/x86/include/asm/barrier.h6
-rw-r--r--arch/x86/include/asm/calling.h275
-rw-r--r--arch/x86/include/asm/compat.h2
-rw-r--r--arch/x86/include/asm/cpufeature.h31
-rw-r--r--arch/x86/include/asm/desc.h7
-rw-r--r--arch/x86/include/asm/dwarf2.h24
-rw-r--r--arch/x86/include/asm/insn.h2
-rw-r--r--arch/x86/include/asm/irqflags.h4
-rw-r--r--arch/x86/include/asm/processor.h109
-rw-r--r--arch/x86/include/asm/ptrace.h23
-rw-r--r--arch/x86/include/asm/sigcontext.h6
-rw-r--r--arch/x86/include/asm/smap.h30
-rw-r--r--arch/x86/include/asm/special_insns.h10
-rw-r--r--arch/x86/include/asm/thread_info.h42
-rw-r--r--arch/x86/include/uapi/asm/ptrace-abi.h16
-rw-r--r--arch/x86/include/uapi/asm/ptrace.h13
-rw-r--r--arch/x86/include/uapi/asm/sigcontext.h21
-rw-r--r--arch/x86/kernel/Makefile1
-rw-r--r--arch/x86/kernel/alternative.c158
-rw-r--r--arch/x86/kernel/asm-offsets_32.c2
-rw-r--r--arch/x86/kernel/asm-offsets_64.c1
-rw-r--r--arch/x86/kernel/cpu/amd.c5
-rw-r--r--arch/x86/kernel/cpu/common.c61
-rw-r--r--arch/x86/kernel/cpu/perf_event.c7
-rw-r--r--arch/x86/kernel/entry_32.S35
-rw-r--r--arch/x86/kernel/entry_64.S577
-rw-r--r--arch/x86/kernel/head_32.S3
-rw-r--r--arch/x86/kernel/head_64.S6
-rw-r--r--arch/x86/kernel/ioport.c2
-rw-r--r--arch/x86/kernel/perf_regs.c2
-rw-r--r--arch/x86/kernel/process.c23
-rw-r--r--arch/x86/kernel/process_32.c25
-rw-r--r--arch/x86/kernel/process_64.c21
-rw-r--r--arch/x86/kernel/ptrace.c10
-rw-r--r--arch/x86/kernel/relocate_kernel_32.S8
-rw-r--r--arch/x86/kernel/relocate_kernel_64.S8
-rw-r--r--arch/x86/kernel/signal.c28
-rw-r--r--arch/x86/kernel/smpboot.c2
-rw-r--r--arch/x86/kernel/syscall_32.c16
-rw-r--r--arch/x86/kernel/traps.c29
-rw-r--r--arch/x86/kernel/vm86_32.c4
-rw-r--r--arch/x86/lguest/boot.c1
-rw-r--r--arch/x86/lib/atomic64_cx8_32.S50
-rw-r--r--arch/x86/lib/checksum_32.S64
-rw-r--r--arch/x86/lib/clear_page_64.S66
-rw-r--r--arch/x86/lib/copy_page_64.S37
-rw-r--r--arch/x86/lib/copy_user_64.S46
-rw-r--r--arch/x86/lib/csum-copy_64.S2
-rw-r--r--arch/x86/lib/insn.c13
-rw-r--r--arch/x86/lib/memcpy_64.S68
-rw-r--r--arch/x86/lib/memmove_64.S19
-rw-r--r--arch/x86/lib/memset_64.S61
-rw-r--r--arch/x86/lib/msr-reg.S24
-rw-r--r--arch/x86/lib/rwsem.S44
-rw-r--r--arch/x86/lib/thunk_32.S18
-rw-r--r--arch/x86/lib/thunk_64.S28
-rw-r--r--arch/x86/lib/x86-opcode-map.txt9
-rw-r--r--arch/x86/mm/fault.c2
-rw-r--r--arch/x86/mm/init.c3
-rw-r--r--arch/x86/power/cpu.c2
-rw-r--r--arch/x86/syscalls/syscall_32.tbl4
-rw-r--r--arch/x86/syscalls/syscall_64.tbl2
-rw-r--r--arch/x86/um/asm/barrier.h4
-rw-r--r--arch/x86/um/sys_call_table_64.c2
-rw-r--r--arch/x86/xen/enlighten.c1
-rw-r--r--arch/x86/xen/xen-asm_64.S8
-rw-r--r--include/linux/stddef.h9
-rw-r--r--include/linux/vfio.h13
-rw-r--r--tools/perf/bench/mem-memcpy-x86-64-asm-def.h6
-rw-r--r--tools/perf/bench/mem-memcpy-x86-64-asm.S2
-rw-r--r--tools/perf/bench/mem-memcpy.c128
-rw-r--r--tools/perf/bench/mem-memset-x86-64-asm-def.h6
-rw-r--r--tools/perf/bench/mem-memset-x86-64-asm.S2
-rw-r--r--tools/perf/util/include/asm/alternative-asm.h1
85 files changed, 1537 insertions, 1373 deletions
diff --git a/arch/x86/boot/compressed/head_32.S b/arch/x86/boot/compressed/head_32.S
index 1d7fbbcc196d..8ef964ddc18e 100644
--- a/arch/x86/boot/compressed/head_32.S
+++ b/arch/x86/boot/compressed/head_32.S
@@ -29,6 +29,7 @@
#include <asm/page_types.h>
#include <asm/boot.h>
#include <asm/asm-offsets.h>
+#include <asm/bootparam.h>
__HEAD
ENTRY(startup_32)
@@ -102,7 +103,7 @@ preferred_addr:
* Test KEEP_SEGMENTS flag to see if the bootloader is asking
* us to not reload segments
*/
- testb $(1<<6), BP_loadflags(%esi)
+ testb $KEEP_SEGMENTS, BP_loadflags(%esi)
jnz 1f
cli
diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S
index 6b1766c6c082..90c152135e92 100644
--- a/arch/x86/boot/compressed/head_64.S
+++ b/arch/x86/boot/compressed/head_64.S
@@ -31,6 +31,7 @@
#include <asm/msr.h>
#include <asm/processor-flags.h>
#include <asm/asm-offsets.h>
+#include <asm/bootparam.h>
__HEAD
.code32
@@ -46,7 +47,7 @@ ENTRY(startup_32)
* Test KEEP_SEGMENTS flag to see if the bootloader is asking
* us to not reload segments
*/
- testb $(1<<6), BP_loadflags(%esi)
+ testb $KEEP_SEGMENTS, BP_loadflags(%esi)
jnz 1f
cli
diff --git a/arch/x86/ia32/Makefile b/arch/x86/ia32/Makefile
index e785b422b766..bb635c641869 100644
--- a/arch/x86/ia32/Makefile
+++ b/arch/x86/ia32/Makefile
@@ -3,7 +3,6 @@
#
obj-$(CONFIG_IA32_EMULATION) := ia32entry.o sys_ia32.o ia32_signal.o
-obj-$(CONFIG_IA32_EMULATION) += nosyscall.o syscall_ia32.o
obj-$(CONFIG_IA32_AOUT) += ia32_aout.o
diff --git a/arch/x86/ia32/ia32_signal.c b/arch/x86/ia32/ia32_signal.c
index d0165c9a2932..1f5e2b0e09ff 100644
--- a/arch/x86/ia32/ia32_signal.c
+++ b/arch/x86/ia32/ia32_signal.c
@@ -203,6 +203,8 @@ static int ia32_restore_sigcontext(struct pt_regs *regs,
err |= restore_xstate_sig(buf, 1);
+ force_iret();
+
return err;
}
diff --git a/arch/x86/ia32/ia32entry.S b/arch/x86/ia32/ia32entry.S
index 156ebcab4ada..ad9efef65a6b 100644
--- a/arch/x86/ia32/ia32entry.S
+++ b/arch/x86/ia32/ia32entry.S
@@ -30,24 +30,13 @@
.section .entry.text, "ax"
- .macro IA32_ARG_FIXUP noebp=0
- movl %edi,%r8d
- .if \noebp
- .else
- movl %ebp,%r9d
- .endif
- xchg %ecx,%esi
- movl %ebx,%edi
- movl %edx,%edx /* zero extension */
- .endm
-
- /* clobbers %eax */
- .macro CLEAR_RREGS offset=0, _r9=rax
+ /* clobbers %rax */
+ .macro CLEAR_RREGS _r9=rax
xorl %eax,%eax
- movq %rax,\offset+R11(%rsp)
- movq %rax,\offset+R10(%rsp)
- movq %\_r9,\offset+R9(%rsp)
- movq %rax,\offset+R8(%rsp)
+ movq %rax,R11(%rsp)
+ movq %rax,R10(%rsp)
+ movq %\_r9,R9(%rsp)
+ movq %rax,R8(%rsp)
.endm
/*
@@ -60,14 +49,14 @@
* If it's -1 to make us punt the syscall, then (u32)-1 is still
* an appropriately invalid value.
*/
- .macro LOAD_ARGS32 offset, _r9=0
+ .macro LOAD_ARGS32 _r9=0
.if \_r9
- movl \offset+16(%rsp),%r9d
+ movl R9(%rsp),%r9d
.endif
- movl \offset+40(%rsp),%ecx
- movl \offset+48(%rsp),%edx
- movl \offset+56(%rsp),%esi
- movl \offset+64(%rsp),%edi
+ movl RCX(%rsp),%ecx
+ movl RDX(%rsp),%edx
+ movl RSI(%rsp),%esi
+ movl RDI(%rsp),%edi
movl %eax,%eax /* zero extension */
.endm
@@ -99,35 +88,38 @@ ENDPROC(native_irq_enable_sysexit)
/*
* 32bit SYSENTER instruction entry.
*
+ * SYSENTER loads ss, rsp, cs, and rip from previously programmed MSRs.
+ * IF and VM in rflags are cleared (IOW: interrupts are off).
+ * SYSENTER does not save anything on the stack,
+ * and does not save old rip (!!!) and rflags.
+ *
* Arguments:
- * %eax System call number.
- * %ebx Arg1
- * %ecx Arg2
- * %edx Arg3
- * %esi Arg4
- * %edi Arg5
- * %ebp user stack
- * 0(%ebp) Arg6
- *
- * Interrupts off.
- *
+ * eax system call number
+ * ebx arg1
+ * ecx arg2
+ * edx arg3
+ * esi arg4
+ * edi arg5
+ * ebp user stack
+ * 0(%ebp) arg6
+ *
* This is purely a fast path. For anything complicated we use the int 0x80
- * path below. Set up a complete hardware stack frame to share code
+ * path below. We set up a complete hardware stack frame to share code
* with the int 0x80 path.
- */
+ */
ENTRY(ia32_sysenter_target)
CFI_STARTPROC32 simple
CFI_SIGNAL_FRAME
CFI_DEF_CFA rsp,0
CFI_REGISTER rsp,rbp
SWAPGS_UNSAFE_STACK
- movq PER_CPU_VAR(kernel_stack), %rsp
- addq $(KERNEL_STACK_OFFSET),%rsp
+ movq PER_CPU_VAR(cpu_tss + TSS_sp0), %rsp
/*
* No need to follow this irqs on/off section: the syscall
* disabled irqs, here we enable it straight after entry:
*/
ENABLE_INTERRUPTS(CLBR_NONE)
+ /* Construct iret frame (ss,rsp,rflags,cs,rip) */
movl %ebp,%ebp /* zero extension */
pushq_cfi $__USER32_DS
/*CFI_REL_OFFSET ss,0*/
@@ -140,13 +132,19 @@ ENTRY(ia32_sysenter_target)
pushq_cfi $__USER32_CS
/*CFI_REL_OFFSET cs,0*/
movl %eax, %eax
+ /* Store thread_info->sysenter_return in rip stack slot */
pushq_cfi %r10
CFI_REL_OFFSET rip,0
+ /* Store orig_ax */
pushq_cfi %rax
+ /* Construct the rest of "struct pt_regs" */
cld
- SAVE_ARGS 0,1,0
- /* no need to do an access_ok check here because rbp has been
- 32bit zero extended */
+ ALLOC_PT_GPREGS_ON_STACK
+ SAVE_C_REGS_EXCEPT_R891011
+ /*
+ * no need to do an access_ok check here because rbp has been
+ * 32bit zero extended
+ */
ASM_STAC
1: movl (%rbp),%ebp
_ASM_EXTABLE(1b,ia32_badarg)
@@ -157,32 +155,39 @@ ENTRY(ia32_sysenter_target)
* ourselves. To save a few cycles, we can check whether
* NT was set instead of doing an unconditional popfq.
*/
- testl $X86_EFLAGS_NT,EFLAGS-ARGOFFSET(%rsp)
+ testl $X86_EFLAGS_NT,EFLAGS(%rsp)
jnz sysenter_fix_flags
sysenter_flags_fixed:
- orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP)
CFI_REMEMBER_STATE
jnz sysenter_tracesys
cmpq $(IA32_NR_syscalls-1),%rax
ja ia32_badsys
sysenter_do_call:
- IA32_ARG_FIXUP
+ /* 32bit syscall -> 64bit C ABI argument conversion */
+ movl %edi,%r8d /* arg5 */
+ movl %ebp,%r9d /* arg6 */
+ xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */
+ movl %ebx,%edi /* arg1 */
+ movl %edx,%edx /* arg3 (zero extension) */
sysenter_dispatch:
call *ia32_sys_call_table(,%rax,8)
- movq %rax,RAX-ARGOFFSET(%rsp)
+ movq %rax,RAX(%rsp)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP)
jnz sysexit_audit
sysexit_from_sys_call:
- andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP)
/* clear IF, that popfq doesn't enable interrupts early */
- andl $~0x200,EFLAGS-ARGOFFSET(%rsp)
- movl RIP-ARGOFFSET(%rsp),%edx /* User %eip */
+ andl $~0x200,EFLAGS(%rsp)
+ movl RIP(%rsp),%edx /* User %eip */
CFI_REGISTER rip,rdx
- RESTORE_ARGS 0,24,0,0,0,0
+ RESTORE_RSI_RDI
+ /* pop everything except ss,rsp,rflags slots */
+ REMOVE_PT_GPREGS_FROM_STACK 3*8
xorq %r8,%r8
xorq %r9,%r9
xorq %r10,%r10
@@ -192,6 +197,10 @@ sysexit_from_sys_call:
popq_cfi %rcx /* User %esp */
CFI_REGISTER rsp,rcx
TRACE_IRQS_ON
+ /*
+ * 32bit SYSEXIT restores eip from edx, esp from ecx.
+ * cs and ss are loaded from MSRs.
+ */
ENABLE_INTERRUPTS_SYSEXIT32
CFI_RESTORE_STATE
@@ -205,18 +214,18 @@ sysexit_from_sys_call:
movl %ebx,%esi /* 2nd arg: 1st syscall arg */
movl %eax,%edi /* 1st arg: syscall number */
call __audit_syscall_entry
- movl RAX-ARGOFFSET(%rsp),%eax /* reload syscall number */
+ movl RAX(%rsp),%eax /* reload syscall number */
cmpq $(IA32_NR_syscalls-1),%rax
ja ia32_badsys
movl %ebx,%edi /* reload 1st syscall arg */
- movl RCX-ARGOFFSET(%rsp),%esi /* reload 2nd syscall arg */
- movl RDX-ARGOFFSET(%rsp),%edx /* reload 3rd syscall arg */
- movl RSI-ARGOFFSET(%rsp),%ecx /* reload 4th syscall arg */
- movl RDI-ARGOFFSET(%rsp),%r8d /* reload 5th syscall arg */
+ movl RCX(%rsp),%esi /* reload 2nd syscall arg */
+ movl RDX(%rsp),%edx /* reload 3rd syscall arg */
+ movl RSI(%rsp),%ecx /* reload 4th syscall arg */
+ movl RDI(%rsp),%r8d /* reload 5th syscall arg */
.endm
.macro auditsys_exit exit
- testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP)
jnz ia32_ret_from_sys_call
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
@@ -227,13 +236,13 @@ sysexit_from_sys_call:
1: setbe %al /* 1 if error, 0 if not */
movzbl %al,%edi /* zero-extend that into %edi */
call __audit_syscall_exit
- movq RAX-ARGOFFSET(%rsp),%rax /* reload syscall return value */
+ movq RAX(%rsp),%rax /* reload syscall return value */
movl $(_TIF_ALLWORK_MASK & ~_TIF_SYSCALL_AUDIT),%edi
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- testl %edi,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl %edi,TI_flags+THREAD_INFO(%rsp,RIP)
jz \exit
- CLEAR_RREGS -ARGOFFSET
+ CLEAR_RREGS
jmp int_with_check
.endm
@@ -253,16 +262,16 @@ sysenter_fix_flags:
sysenter_tracesys:
#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP)
jz sysenter_auditsys
#endif
- SAVE_REST
+ SAVE_EXTRA_REGS
CLEAR_RREGS
movq $-ENOSYS,RAX(%rsp)/* ptrace can change this for a bad syscall */
movq %rsp,%rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
- LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
- RESTORE_REST
+ LOAD_ARGS32 /* reload args from stack in case ptrace changed it */
+ RESTORE_EXTRA_REGS
cmpq $(IA32_NR_syscalls-1),%rax
ja int_ret_from_sys_call /* sysenter_tracesys has set RAX(%rsp) */
jmp sysenter_do_call
@@ -272,23 +281,33 @@ ENDPROC(ia32_sysenter_target)
/*
* 32bit SYSCALL instruction entry.
*
+ * 32bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
+ * then loads new ss, cs, and rip from previously programmed MSRs.
+ * rflags gets masked by a value from another MSR (so CLD and CLAC
+ * are not needed). SYSCALL does not save anything on the stack
+ * and does not change rsp.
+ *
+ * Note: rflags saving+masking-with-MSR happens only in Long mode
+ * (in legacy 32bit mode, IF, RF and VM bits are cleared and that's it).
+ * Don't get confused: rflags saving+masking depends on Long Mode Active bit
+ * (EFER.LMA=1), NOT on bitness of userspace where SYSCALL executes
+ * or target CS descriptor's L bit (SYSCALL does not read segment descriptors).
+ *
* Arguments:
- * %eax System call number.
- * %ebx Arg1
- * %ecx return EIP
- * %edx Arg3
- * %esi Arg4
- * %edi Arg5
- * %ebp Arg2 [note: not saved in the stack frame, should not be touched]
- * %esp user stack
- * 0(%esp) Arg6
- *
- * Interrupts off.
- *
+ * eax system call number
+ * ecx return address
+ * ebx arg1
+ * ebp arg2 (note: not saved in the stack frame, should not be touched)
+ * edx arg3
+ * esi arg4
+ * edi arg5
+ * esp user stack
+ * 0(%esp) arg6
+ *
* This is purely a fast path. For anything complicated we use the int 0x80
- * path below. Set up a complete hardware stack frame to share code
- * with the int 0x80 path.
- */
+ * path below. We set up a complete hardware stack frame to share code
+ * with the int 0x80 path.
+ */
ENTRY(ia32_cstar_target)
CFI_STARTPROC32 simple
CFI_SIGNAL_FRAME
@@ -304,62 +323,77 @@ ENTRY(ia32_cstar_target)
* disabled irqs and here we enable it straight after entry:
*/
ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_ARGS 8,0,0
+ ALLOC_PT_GPREGS_ON_STACK 8 /* +8: space for orig_ax */
+ SAVE_C_REGS_EXCEPT_RCX_R891011
movl %eax,%eax /* zero extension */
- movq %rax,ORIG_RAX-ARGOFFSET(%rsp)
- movq %rcx,RIP-ARGOFFSET(%rsp)
- CFI_REL_OFFSET rip,RIP-ARGOFFSET
- movq %rbp,RCX-ARGOFFSET(%rsp) /* this lies slightly to ptrace */
+ movq %rax,ORIG_RAX(%rsp)
+ movq %rcx,RIP(%rsp)
+ CFI_REL_OFFSET rip,RIP
+ movq %rbp,RCX(%rsp) /* this lies slightly to ptrace */
movl %ebp,%ecx
- movq $__USER32_CS,CS-ARGOFFSET(%rsp)
- movq $__USER32_DS,SS-ARGOFFSET(%rsp)
- movq %r11,EFLAGS-ARGOFFSET(%rsp)
- /*CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
- movq %r8,RSP-ARGOFFSET(%rsp)
- CFI_REL_OFFSET rsp,RSP-ARGOFFSET
- /* no need to do an access_ok check here because r8 has been
- 32bit zero extended */
- /* hardware stack frame is complete now */
+ movq $__USER32_CS,CS(%rsp)
+ movq $__USER32_DS,SS(%rsp)
+ movq %r11,EFLAGS(%rsp)
+ /*CFI_REL_OFFSET rflags,EFLAGS*/
+ movq %r8,RSP(%rsp)
+ CFI_REL_OFFSET rsp,RSP
+ /* iret stack frame is complete now */
+ /*
+ * no need to do an access_ok check here because r8 has been
+ * 32bit zero extended
+ */
ASM_STAC
1: movl (%r8),%r9d
_ASM_EXTABLE(1b,ia32_badarg)
ASM_CLAC
- orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP)
CFI_REMEMBER_STATE
jnz cstar_tracesys
cmpq $IA32_NR_syscalls-1,%rax
ja ia32_badsys
cstar_do_call:
- IA32_ARG_FIXUP 1
+ /* 32bit syscall -> 64bit C ABI argument conversion */
+ movl %edi,%r8d /* arg5 */
+ /* r9 already loaded */ /* arg6 */
+ xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */
+ movl %ebx,%edi /* arg1 */
+ movl %edx,%edx /* arg3 (zero extension) */
cstar_dispatch:
call *ia32_sys_call_table(,%rax,8)
- movq %rax,RAX-ARGOFFSET(%rsp)
+ movq %rax,RAX(%rsp)
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
- testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP)
jnz sysretl_audit
sysretl_from_sys_call:
- andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
- RESTORE_ARGS 0,-ARG_SKIP,0,0,0
- movl RIP-ARGOFFSET(%rsp),%ecx
+ andl $~TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP)
+ RESTORE_RSI_RDI_RDX
+ movl RIP(%rsp),%ecx
CFI_REGISTER rip,rcx
- movl EFLAGS-ARGOFFSET(%rsp),%r11d
+ movl EFLAGS(%rsp),%r11d
/*CFI_REGISTER rflags,r11*/
xorq %r10,%r10
xorq %r9,%r9
xorq %r8,%r8
TRACE_IRQS_ON
- movl RSP-ARGOFFSET(%rsp),%esp
+ movl RSP(%rsp),%esp
CFI_RESTORE rsp
+ /*
+ * 64bit->32bit SYSRET restores eip from ecx,
+ * eflags from r11 (but RF and VM bits are forced to 0),
+ * cs and ss are loaded from MSRs.
+ * (Note: 32bit->32bit SYSRET is different: since r11
+ * does not exist, it merely sets eflags.IF=1).
+ */
USERGS_SYSRET32
-
+
#ifdef CONFIG_AUDITSYSCALL
cstar_auditsys:
CFI_RESTORE_STATE
- movl %r9d,R9-ARGOFFSET(%rsp) /* register to be clobbered by call */
+ movl %r9d,R9(%rsp) /* register to be clobbered by call */
auditsys_entry_common
- movl R9-ARGOFFSET(%rsp),%r9d /* reload 6th syscall arg */
+ movl R9(%rsp),%r9d /* reload 6th syscall arg */
jmp cstar_dispatch
sysretl_audit:
@@ -368,17 +402,17 @@ sysretl_audit:
cstar_tracesys:
#ifdef CONFIG_AUDITSYSCALL
- testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $(_TIF_WORK_SYSCALL_ENTRY & ~_TIF_SYSCALL_AUDIT),TI_flags+THREAD_INFO(%rsp,RIP)
jz cstar_auditsys
#endif
xchgl %r9d,%ebp
- SAVE_REST
- CLEAR_RREGS 0, r9
+ SAVE_EXTRA_REGS
+ CLEAR_RREGS r9
movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
movq %rsp,%rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
- LOAD_ARGS32 ARGOFFSET, 1 /* reload args from stack in case ptrace changed it */
- RESTORE_REST
+ LOAD_ARGS32 1 /* reload args from stack in case ptrace changed it */
+ RESTORE_EXTRA_REGS
xchgl %ebp,%r9d
cmpq $(IA32_NR_syscalls-1),%rax
ja int_ret_from_sys_call /* cstar_tracesys has set RAX(%rsp) */
@@ -391,26 +425,26 @@ ia32_badarg:
jmp ia32_sysret
CFI_ENDPROC
-/*
- * Emulated IA32 system calls via int 0x80.
+/*
+ * Emulated IA32 system calls via int 0x80.
*
- * Arguments:
- * %eax System call number.
- * %ebx Arg1
- * %ecx Arg2
- * %edx Arg3
- * %esi Arg4
- * %edi Arg5
- * %ebp Arg6 [note: not saved in the stack frame, should not be touched]
+ * Arguments:
+ * eax system call number
+ * ebx arg1
+ * ecx arg2
+ * edx arg3
+ * esi arg4
+ * edi arg5
+ * ebp arg6 (note: not saved in the stack frame, should not be touched)
*
* Notes:
- * Uses the same stack frame as the x86-64 version.
- * All registers except %eax must be saved (but ptrace may violate that)
+ * Uses the same stack frame as the x86-64 version.
+ * All registers except eax must be saved (but ptrace may violate that).
* Arguments are zero extended. For system calls that want sign extension and
* take long arguments a wrapper is needed. Most calls can just be called
* directly.
- * Assumes it is only called from user space and entered with interrupts off.
- */
+ * Assumes it is only called from user space and entered with interrupts off.
+ */
ENTRY(ia32_syscall)
CFI_STARTPROC32 simple
@@ -429,40 +463,46 @@ ENTRY(ia32_syscall)
*/
ENABLE_INTERRUPTS(CLBR_NONE)
movl %eax,%eax
- pushq_cfi %rax
+ pushq_cfi %rax /* store orig_ax */
cld
/* note the registers are not zero extended to the sf.
this could be a problem. */
- SAVE_ARGS 0,1,0
- orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP-ARGOFFSET)
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ ALLOC_PT_GPREGS_ON_STACK
+ SAVE_C_REGS_EXCEPT_R891011
+ orl $TS_COMPAT,TI_status+THREAD_INFO(%rsp,RIP)
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP)
jnz ia32_tracesys
cmpq $(IA32_NR_syscalls-1),%rax
ja ia32_badsys
ia32_do_call:
- IA32_ARG_FIXUP
+ /* 32bit syscall -> 64bit C ABI argument conversion */
+ movl %edi,%r8d /* arg5 */
+ movl %ebp,%r9d /* arg6 */
+ xchg %ecx,%esi /* rsi:arg2, rcx:arg4 */
+ movl %ebx,%edi /* arg1 */
+ movl %edx,%edx /* arg3 (zero extension) */
call *ia32_sys_call_table(,%rax,8) # xxx: rip relative
ia32_sysret:
- movq %rax,RAX-ARGOFFSET(%rsp)
+ movq %rax,RAX(%rsp)
ia32_ret_from_sys_call:
- CLEAR_RREGS -ARGOFFSET
- jmp int_ret_from_sys_call
+ CLEAR_RREGS
+ jmp int_ret_from_sys_call
-ia32_tracesys:
- SAVE_REST
+ia32_tracesys:
+ SAVE_EXTRA_REGS
CLEAR_RREGS
movq $-ENOSYS,RAX(%rsp) /* ptrace can change this for a bad syscall */
movq %rsp,%rdi /* &pt_regs -> arg1 */
call syscall_trace_enter
- LOAD_ARGS32 ARGOFFSET /* reload args from stack in case ptrace changed it */
- RESTORE_REST
+ LOAD_ARGS32 /* reload args from stack in case ptrace changed it */
+ RESTORE_EXTRA_REGS
cmpq $(IA32_NR_syscalls-1),%rax
ja int_ret_from_sys_call /* ia32_tracesys has set RAX(%rsp) */
jmp ia32_do_call
END(ia32_syscall)
ia32_badsys:
- movq $0,ORIG_RAX-ARGOFFSET(%rsp)
+ movq $0,ORIG_RAX(%rsp)
movq $-ENOSYS,%rax
jmp ia32_sysret
@@ -492,24 +532,23 @@ GLOBAL(stub32_clone)
ALIGN
ia32_ptregs_common:
- popq %r11
CFI_ENDPROC
CFI_STARTPROC32 simple
CFI_SIGNAL_FRAME
- CFI_DEF_CFA rsp,SS+8-ARGOFFSET
- CFI_REL_OFFSET rax,RAX-ARGOFFSET
- CFI_REL_OFFSET rcx,RCX-ARGOFFSET
- CFI_REL_OFFSET rdx,RDX-ARGOFFSET
- CFI_REL_OFFSET rsi,RSI-ARGOFFSET
- CFI_REL_OFFSET rdi,RDI-ARGOFFSET
- CFI_REL_OFFSET rip,RIP-ARGOFFSET
-/* CFI_REL_OFFSET cs,CS-ARGOFFSET*/
-/* CFI_REL_OFFSET rflags,EFLAGS-ARGOFFSET*/
- CFI_REL_OFFSET rsp,RSP-ARGOFFSET
-/* CFI_REL_OFFSET ss,SS-ARGOFFSET*/
- SAVE_REST
+ CFI_DEF_CFA rsp,SIZEOF_PTREGS
+ CFI_REL_OFFSET rax,RAX
+ CFI_REL_OFFSET rcx,RCX
+ CFI_REL_OFFSET rdx,RDX
+ CFI_REL_OFFSET rsi,RSI
+ CFI_REL_OFFSET rdi,RDI
+ CFI_REL_OFFSET rip,RIP
+/* CFI_REL_OFFSET cs,CS*/
+/* CFI_REL_OFFSET rflags,EFLAGS*/
+ CFI_REL_OFFSET rsp,RSP
+/* CFI_REL_OFFSET ss,SS*/
+ SAVE_EXTRA_REGS 8
call *%rax
- RESTORE_REST
- jmp ia32_sysret /* misbalances the return cache */
+ RESTORE_EXTRA_REGS 8
+ ret
CFI_ENDPROC
END(ia32_ptregs_common)
diff --git a/arch/x86/ia32/nosyscall.c b/arch/x86/ia32/nosyscall.c
deleted file mode 100644
index 51ecd5b4e787..000000000000
--- a/arch/x86/ia32/nosyscall.c
+++ /dev/null
@@ -1,7 +0,0 @@
-#include <linux/kernel.h>
-#include <linux/errno.h>
-
-long compat_ni_syscall(void)
-{
- return -ENOSYS;
-}
diff --git a/arch/x86/ia32/sys_ia32.c b/arch/x86/ia32/sys_ia32.c
index 8e0ceecdc957..719cd702b0a4 100644
--- a/arch/x86/ia32/sys_ia32.c
+++ b/arch/x86/ia32/sys_ia32.c
@@ -201,20 +201,6 @@ long sys32_fadvise64_64(int fd, __u32 offset_low, __u32 offset_high,
advice);
}
-long sys32_vm86_warning(void)
-{
- struct task_struct *me = current;
- static char lastcomm[sizeof(me->comm)];
-
- if (strncmp(lastcomm, me->comm, sizeof(lastcomm))) {
- compat_printk(KERN_INFO
- "%s: vm86 mode not supported on 64 bit kernel\n",
- me->comm);
- strncpy(lastcomm, me->comm, sizeof(lastcomm));
- }
- return -ENOSYS;
-}
-
asmlinkage ssize_t sys32_readahead(int fd, unsigned off_lo, unsigned off_hi,
size_t count)
{
diff --git a/arch/x86/ia32/syscall_ia32.c b/arch/x86/ia32/syscall_ia32.c
deleted file mode 100644
index 4754ba0f5d9f..000000000000
--- a/arch/x86/ia32/syscall_ia32.c
+++ /dev/null
@@ -1,25 +0,0 @@
-/* System call table for ia32 emulation. */
-
-#include <linux/linkage.h>
-#include <linux/sys.h>
-#include <linux/cache.h>
-#include <asm/asm-offsets.h>
-
-#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void compat(void) ;
-#include <asm/syscalls_32.h>
-#undef __SYSCALL_I386
-
-#define __SYSCALL_I386(nr, sym, compat) [nr] = compat,
-
-typedef void (*sys_call_ptr_t)(void);
-
-extern void compat_ni_syscall(void);
-
-const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
- /*
- * Smells like a compiler bug -- it doesn't work
- * when the & below is removed.
- */
- [0 ... __NR_ia32_syscall_max] = &compat_ni_syscall,
-#include <asm/syscalls_32.h>
-};
diff --git a/arch/x86/include/asm/alternative-asm.h b/arch/x86/include/asm/alternative-asm.h
index 372231c22a47..524bddce0b76 100644
--- a/arch/x86/include/asm/alternative-asm.h
+++ b/arch/x86/include/asm/alternative-asm.h
@@ -18,12 +18,53 @@
.endm
#endif
-.macro altinstruction_entry orig alt feature orig_len alt_len
+.macro altinstruction_entry orig alt feature orig_len alt_len pad_len
.long \orig - .
.long \alt - .
.word \feature
.byte \orig_len
.byte \alt_len
+ .byte \pad_len
+.endm
+
+.macro ALTERNATIVE oldinstr, newinstr, feature
+140:
+ \oldinstr
+141:
+ .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
+142:
+
+ .pushsection .altinstructions,"a"
+ altinstruction_entry 140b,143f,\feature,142b-140b,144f-143f,142b-141b
+ .popsection
+
+ .pushsection .altinstr_replacement,"ax"
+143:
+ \newinstr
+144:
+ .popsection
+.endm
+
+.macro ALTERNATIVE_2 oldinstr, newinstr1, feature1, newinstr2, feature2
+140:
+ \oldinstr
+141:
+ .skip -(((144f-143f)-(141b-140b)) > 0) * ((144f-143f)-(141b-140b)),0x90
+ .skip -(((145f-144f)-(144f-143f)-(141b-140b)) > 0) * ((145f-144f)-(144f-143f)-(141b-140b)),0x90
+142:
+
+ .pushsection .altinstructions,"a"
+ altinstruction_entry 140b,143f,\feature1,142b-140b,144f-143f,142b-141b
+ altinstruction_entry 140b,144f,\feature2,142b-140b,145f-144f,142b-141b
+ .popsection
+
+ .pushsection .altinstr_replacement,"ax"
+143:
+ \newinstr1
+144:
+ \newinstr2
+145:
+ .popsection
.endm
#endif /* __ASSEMBLY__ */
diff --git a/arch/x86/include/asm/alternative.h b/arch/x86/include/asm/alternative.h
index 473bdbee378a..5aef6a97d80e 100644
--- a/arch/x86/include/asm/alternative.h
+++ b/arch/x86/include/asm/alternative.h
@@ -48,8 +48,9 @@ struct alt_instr {
s32 repl_offset; /* offset to replacement instruction */
u16 cpuid; /* cpuid bit set for replacement */
u8 instrlen; /* length of original instruction */
- u8 replacementlen; /* length of new instruction, <= instrlen */
-};
+ u8 replacementlen; /* length of new instruction */
+ u8 padlen; /* length of build-time padding */
+} __packed;
extern void alternative_instructions(void);
extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end);
@@ -76,50 +77,61 @@ static inline int alternatives_text_reserved(void *start, void *end)
}
#endif /* CONFIG_SMP */
-#define OLDINSTR(oldinstr) "661:\n\t" oldinstr "\n662:\n"
+#define b_replacement(num) "664"#num
+#define e_replacement(num) "665"#num
-#define b_replacement(number) "663"#number
-#define e_replacement(number) "664"#number
+#define alt_end_marker "663"
+#define alt_slen "662b-661b"
+#define alt_pad_len alt_end_marker"b-662b"
+#define alt_total_slen alt_end_marker"b-661b"
+#define alt_rlen(num) e_replacement(num)"f-"b_replacement(num)"f"
-#define alt_slen "662b-661b"
-#define alt_rlen(number) e_replacement(number)"f-"b_replacement(number)"f"
+#define __OLDINSTR(oldinstr, num) \
+ "661:\n\t" oldinstr "\n662:\n" \
+ ".skip -(((" alt_rlen(num) ")-(" alt_slen ")) > 0) * " \
+ "((" alt_rlen(num) ")-(" alt_slen ")),0x90\n"
-#define ALTINSTR_ENTRY(feature, number) \
+#define OLDINSTR(oldinstr, num) \
+ __OLDINSTR(oldinstr, num) \
+ alt_end_marker ":\n"
+
+/*
+ * Pad the second replacement alternative with additional NOPs if it is
+ * additionally longer than the first replacement alternative.
+ */
+#define OLDINSTR_2(oldinstr, num1, num2) \
+ __OLDINSTR(oldinstr, num1) \
+ ".skip -(((" alt_rlen(num2) ")-(" alt_rlen(num1) ")-(662b-661b)) > 0) * " \
+ "((" alt_rlen(num2) ")-(" alt_rlen(num1) ")-(662b-661b)),0x90\n" \
+ alt_end_marker ":\n"
+
+#define ALTINSTR_ENTRY(feature, num) \
" .long 661b - .\n" /* label */ \
- " .long " b_replacement(number)"f - .\n" /* new instruction */ \
+ " .long " b_replacement(num)"f - .\n" /* new instruction */ \
" .word " __stringify(feature) "\n" /* feature bit */ \
- " .byte " alt_slen "\n" /* source len */ \
- " .byte " alt_rlen(number) "\n" /* replacement len */
+ " .byte " alt_total_slen "\n" /* source len */ \
+ " .byte " alt_rlen(num) "\n" /* replacement len */ \
+ " .byte " alt_pad_len "\n" /* pad len */
-#define DISCARD_ENTRY(number) /* rlen <= slen */ \
- " .byte 0xff + (" alt_rlen(number) ") - (" alt_slen ")\n"
-
-#define ALTINSTR_REPLACEMENT(newinstr, feature, number) /* replacement */ \
- b_replacement(number)":\n\t" newinstr "\n" e_replacement(number) ":\n\t"
+#define ALTINSTR_REPLACEMENT(newinstr, feature, num) /* replacement */ \
+ b_replacement(num)":\n\t" newinstr "\n" e_replacement(num) ":\n\t"
/* alternative assembly primitive: */
#define ALTERNATIVE(oldinstr, newinstr, feature) \
- OLDINSTR(oldinstr) \
+ OLDINSTR(oldinstr, 1) \
".pushsection .altinstructions,\"a\"\n" \
ALTINSTR_ENTRY(feature, 1) \
".popsection\n" \
- ".pushsection .discard,\"aw\",@progbits\n" \
- DISCARD_ENTRY(1) \
- ".popsection\n" \
".pushsection .altinstr_replacement, \"ax\"\n" \
ALTINSTR_REPLACEMENT(newinstr, feature, 1) \
".popsection"
#define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\
- OLDINSTR(oldinstr) \
+ OLDINSTR_2(oldinstr, 1, 2) \
".pushsection .altinstructions,\"a\"\n" \
ALTINSTR_ENTRY(feature1, 1) \
ALTINSTR_ENTRY(feature2, 2) \
".popsection\n" \
- ".pushsection .discard,\"aw\",@progbits\n" \
- DISCARD_ENTRY(1) \
- DISCARD_ENTRY(2) \
- ".popsection\n" \
".pushsection .altinstr_replacement, \"ax\"\n" \
ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \
ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \
@@ -146,6 +158,9 @@ static inline int alternatives_text_reserved(void *start, void *end)
#define alternative(oldinstr, newinstr, feature) \
asm volatile (ALTERNATIVE(oldinstr, newinstr, feature) : : : "memory")
+#define alternative_2(oldinstr, newinstr1, feature1, newinstr2, feature2) \
+ asm volatile(ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2) ::: "memory")
+
/*
* Alternative inline assembly with input.
*
diff --git a/arch/x86/include/asm/apic.h b/arch/x86/include/asm/apic.h
index efc3b22d896e..8118e94d50ab 100644
--- a/arch/x86/include/asm/apic.h
+++ b/arch/x86/include/asm/apic.h
@@ -91,7 +91,7 @@ static inline void native_apic_mem_write(u32 reg, u32 v)
{
volatile u32 *addr = (volatile u32 *)(APIC_BASE + reg);
- alternative_io("movl %0, %1", "xchgl %0, %1", X86_BUG_11AP,
+ alternative_io("movl %0, %P1", "xchgl %0, %P1", X86_BUG_11AP,
ASM_OUTPUT2("=r" (v), "=m" (*addr)),
ASM_OUTPUT2("0" (v), "m" (*addr)));
}
diff --git a/arch/x86/include/asm/barrier.h b/arch/x86/include/asm/barrier.h
index 2ab1eb33106e..959e45b81fe2 100644
--- a/arch/x86/include/asm/barrier.h
+++ b/arch/x86/include/asm/barrier.h
@@ -95,13 +95,11 @@ do { \
* Stop RDTSC speculation. This is needed when you need to use RDTSC
* (or get_cycles or vread that possibly accesses the TSC) in a defined
* code region.
- *
- * (Could use an alternative three way for this if there was one.)
*/
static __always_inline void rdtsc_barrier(void)
{
- alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
- alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
+ alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
+ "lfence", X86_FEATURE_LFENCE_RDTSC);
}
#endif /* _ASM_X86_BARRIER_H */
diff --git a/arch/x86/include/asm/calling.h b/arch/x86/include/asm/calling.h
index 1f1297b46f83..4b5f7bf2b780 100644
--- a/arch/x86/include/asm/calling.h
+++ b/arch/x86/include/asm/calling.h
@@ -55,143 +55,148 @@ For 32-bit we have the following conventions - kernel is built with
* for assembly code:
*/
-#define R15 0
-#define R14 8
-#define R13 16
-#define R12 24
-#define RBP 32
-#define RBX 40
-
-/* arguments: interrupts/non tracing syscalls only save up to here: */
-#define R11 48
-#define R10 56
-#define R9 64
-#define R8 72
-#define RAX 80
-#define RCX 88
-#define RDX 96
-#define RSI 104
-#define RDI 112
-#define ORIG_RAX 120 /* + error_code */
-/* end of arguments */
-
-/* cpu exception frame or undefined in case of fast syscall: */
-#define RIP 128
-#define CS 136
-#define EFLAGS 144
-#define RSP 152
-#define SS 160
-
-#define ARGOFFSET R11
-
- .macro SAVE_ARGS addskip=0, save_rcx=1, save_r891011=1, rax_enosys=0
- subq $9*8+\addskip, %rsp
- CFI_ADJUST_CFA_OFFSET 9*8+\addskip
- movq_cfi rdi, 8*8
- movq_cfi rsi, 7*8
- movq_cfi rdx, 6*8
-
- .if \save_rcx
- movq_cfi rcx, 5*8
- .endif
+/* The layout forms the "struct pt_regs" on the stack: */
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
+#define R15 0*8
+#define R14 1*8
+#define R13 2*8
+#define R12 3*8
+#define RBP 4*8
+#define RBX 5*8
+/* These regs are callee-clobbered. Always saved on kernel entry. */
+#define R11 6*8
+#define R10 7*8
+#define R9 8*8
+#define R8 9*8
+#define RAX 10*8
+#define RCX 11*8
+#define RDX 12*8
+#define RSI 13*8
+#define RDI 14*8
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
+#define ORIG_RAX 15*8
+/* Return frame for iretq */
+#define RIP 16*8
+#define CS 17*8
+#define EFLAGS 18*8
+#define RSP 19*8
+#define SS 20*8
+
+#define SIZEOF_PTREGS 21*8
+
+ .macro ALLOC_PT_GPREGS_ON_STACK addskip=0
+ subq $15*8+\addskip, %rsp
+ CFI_ADJUST_CFA_OFFSET 15*8+\addskip
+ .endm
- .if \rax_enosys
- movq $-ENOSYS, 4*8(%rsp)
- .else
- movq_cfi rax, 4*8
+ .macro SAVE_C_REGS_HELPER offset=0 rax=1 rcx=1 r8910=1 r11=1
+ .if \r11
+ movq_cfi r11, 6*8+\offset
.endif
-
- .if \save_r891011
- movq_cfi r8, 3*8
- movq_cfi r9, 2*8
- movq_cfi r10, 1*8
- movq_cfi r11, 0*8
+ .if \r8910
+ movq_cfi r10, 7*8+\offset
+ movq_cfi r9, 8*8+\offset
+ movq_cfi r8, 9*8+\offset
+ .endif
+ .if \rax
+ movq_cfi rax, 10*8+\offset
+ .endif
+ .if \rcx
+ movq_cfi rcx, 11*8+\offset
.endif
+ movq_cfi rdx, 12*8+\offset
+ movq_cfi rsi, 13*8+\offset
+ movq_cfi rdi, 14*8+\offset
+ .endm
+ .macro SAVE_C_REGS offset=0
+ SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1
+ .endm
+ .macro SAVE_C_REGS_EXCEPT_RAX_RCX offset=0
+ SAVE_C_REGS_HELPER \offset, 0, 0, 1, 1
+ .endm
+ .macro SAVE_C_REGS_EXCEPT_R891011
+ SAVE_C_REGS_HELPER 0, 1, 1, 0, 0
+ .endm
+ .macro SAVE_C_REGS_EXCEPT_RCX_R891011
+ SAVE_C_REGS_HELPER 0, 1, 0, 0, 0
+ .endm
+ .macro SAVE_C_REGS_EXCEPT_RAX_RCX_R11
+ SAVE_C_REGS_HELPER 0, 0, 0, 1, 0
+ .endm
+ .macro SAVE_EXTRA_REGS offset=0
+ movq_cfi r15, 0*8+\offset
+ movq_cfi r14, 1*8+\offset
+ movq_cfi r13, 2*8+\offset
+ movq_cfi r12, 3*8+\offset
+ movq_cfi rbp, 4*8+\offset
+ movq_cfi rbx, 5*8+\offset
+ .endm
+ .macro SAVE_EXTRA_REGS_RBP offset=0
+ movq_cfi rbp, 4*8+\offset
.endm
-#define ARG_SKIP (9*8)
+ .macro RESTORE_EXTRA_REGS offset=0
+ movq_cfi_restore 0*8+\offset, r15
+ movq_cfi_restore 1*8+\offset, r14
+ movq_cfi_restore 2*8+\offset, r13
+ movq_cfi_restore 3*8+\offset, r12
+ movq_cfi_restore 4*8+\offset, rbp
+ movq_cfi_restore 5*8+\offset, rbx
+ .endm
- .macro RESTORE_ARGS rstor_rax=1, addskip=0, rstor_rcx=1, rstor_r11=1, \
- rstor_r8910=1, rstor_rdx=1
+ .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1
.if \rstor_r11
- movq_cfi_restore 0*8, r11
+ movq_cfi_restore 6*8, r11
.endif
-
.if \rstor_r8910
- movq_cfi_restore 1*8, r10
- movq_cfi_restore 2*8, r9
- movq_cfi_restore 3*8, r8
+ movq_cfi_restore 7*8, r10
+ movq_cfi_restore 8*8, r9
+ movq_cfi_restore 9*8, r8
.endif
-
.if \rstor_rax
- movq_cfi_restore 4*8, rax
+ movq_cfi_restore 10*8, rax
.endif
-
.if \rstor_rcx
- movq_cfi_restore 5*8, rcx
+ movq_cfi_restore 11*8, rcx
.endif
-
.if \rstor_rdx
- movq_cfi_restore 6*8, rdx
- .endif
-
- movq_cfi_restore 7*8, rsi
- movq_cfi_restore 8*8, rdi
-
- .if ARG_SKIP+\addskip > 0
- addq $ARG_SKIP+\addskip, %rsp
- CFI_ADJUST_CFA_OFFSET -(ARG_SKIP+\addskip)
+ movq_cfi_restore 12*8, rdx
.endif
+ movq_cfi_restore 13*8, rsi
+ movq_cfi_restore 14*8, rdi
.endm
-
- .macro LOAD_ARGS offset, skiprax=0
- movq \offset(%rsp), %r11
- movq \offset+8(%rsp), %r10
- movq \offset+16(%rsp), %r9
- movq \offset+24(%rsp), %r8
- movq \offset+40(%rsp), %rcx
- movq \offset+48(%rsp), %rdx
- movq \offset+56(%rsp), %rsi
- movq \offset+64(%rsp), %rdi
- .if \skiprax
- .else
- movq \offset+72(%rsp), %rax
- .endif
+ .macro RESTORE_C_REGS
+ RESTORE_C_REGS_HELPER 1,1,1,1,1
.endm
-
-#define REST_SKIP (6*8)
-
- .macro SAVE_REST
- subq $REST_SKIP, %rsp
- CFI_ADJUST_CFA_OFFSET REST_SKIP
- movq_cfi rbx, 5*8
- movq_cfi rbp, 4*8
- movq_cfi r12, 3*8
- movq_cfi r13, 2*8
- movq_cfi r14, 1*8
- movq_cfi r15, 0*8
+ .macro RESTORE_C_REGS_EXCEPT_RAX
+ RESTORE_C_REGS_HELPER 0,1,1,1,1
.endm
-
- .macro RESTORE_REST
- movq_cfi_restore 0*8, r15
- movq_cfi_restore 1*8, r14
- movq_cfi_restore 2*8, r13
- movq_cfi_restore 3*8, r12
- movq_cfi_restore 4*8, rbp
- movq_cfi_restore 5*8, rbx
- addq $REST_SKIP, %rsp
- CFI_ADJUST_CFA_OFFSET -(REST_SKIP)
+ .macro RESTORE_C_REGS_EXCEPT_RCX
+ RESTORE_C_REGS_HELPER 1,0,1,1,1
.endm
-
- .macro SAVE_ALL
- SAVE_ARGS
- SAVE_REST
+ .macro RESTORE_C_REGS_EXCEPT_R11
+ RESTORE_C_REGS_HELPER 1,1,0,1,1
+ .endm
+ .macro RESTORE_C_REGS_EXCEPT_RCX_R11
+ RESTORE_C_REGS_HELPER 1,0,0,1,1
+ .endm
+ .macro RESTORE_RSI_RDI
+ RESTORE_C_REGS_HELPER 0,0,0,0,0
+ .endm
+ .macro RESTORE_RSI_RDI_RDX
+ RESTORE_C_REGS_HELPER 0,0,0,0,1
.endm
- .macro RESTORE_ALL addskip=0
- RESTORE_REST
- RESTORE_ARGS 1, \addskip
+ .macro REMOVE_PT_GPREGS_FROM_STACK addskip=0
+ addq $15*8+\addskip, %rsp
+ CFI_ADJUST_CFA_OFFSET -(15*8+\addskip)
.endm
.macro icebp
@@ -210,37 +215,23 @@ For 32-bit we have the following conventions - kernel is built with
*/
.macro SAVE_ALL
- pushl_cfi %eax
- CFI_REL_OFFSET eax, 0
- pushl_cfi %ebp
- CFI_REL_OFFSET ebp, 0
- pushl_cfi %edi
- CFI_REL_OFFSET edi, 0
- pushl_cfi %esi
- CFI_REL_OFFSET esi, 0
- pushl_cfi %edx
- CFI_REL_OFFSET edx, 0
- pushl_cfi %ecx
- CFI_REL_OFFSET ecx, 0
- pushl_cfi %ebx
- CFI_REL_OFFSET ebx, 0
+ pushl_cfi_reg eax
+ pushl_cfi_reg ebp
+ pushl_cfi_reg edi
+ pushl_cfi_reg esi
+ pushl_cfi_reg edx
+ pushl_cfi_reg ecx
+ pushl_cfi_reg ebx
.endm
.macro RESTORE_ALL
- popl_cfi %ebx
- CFI_RESTORE ebx
- popl_cfi %ecx
- CFI_RESTORE ecx
- popl_cfi %edx
- CFI_RESTORE edx
- popl_cfi %esi
- CFI_RESTORE esi
- popl_cfi %edi
- CFI_RESTORE edi
- popl_cfi %ebp
- CFI_RESTORE ebp
- popl_cfi %eax
- CFI_RESTORE eax
+ popl_cfi_reg ebx
+ popl_cfi_reg ecx
+ popl_cfi_reg edx
+ popl_cfi_reg esi
+ popl_cfi_reg edi
+ popl_cfi_reg ebp
+ popl_cfi_reg eax
.endm
#endif /* CONFIG_X86_64 */
diff --git a/arch/x86/include/asm/compat.h b/arch/x86/include/asm/compat.h
index 59c6c401f79f..acdee09228b3 100644
--- a/arch/x86/include/asm/compat.h
+++ b/arch/x86/include/asm/compat.h
@@ -301,7 +301,7 @@ static inline void __user *arch_compat_alloc_user_space(long len)
sp = task_pt_regs(current)->sp;
} else {
/* -128 for the x32 ABI redzone */
- sp = this_cpu_read(old_rsp) - 128;
+ sp = task_pt_regs(current)->sp - 128;
}
return (void __user *)round_down(sp - len, 16);
diff --git a/arch/x86/include/asm/cpufeature.h b/arch/x86/include/asm/cpufeature.h
index 90a54851aedc..0f7a5a1a8db2 100644
--- a/arch/x86/include/asm/cpufeature.h
+++ b/arch/x86/include/asm/cpufeature.h
@@ -231,6 +231,7 @@
#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */
#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */
#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */
+#define X86_FEATURE_PCOMMIT ( 9*32+22) /* PCOMMIT instruction */
#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */
#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */
#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */
@@ -418,6 +419,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
" .word %P0\n" /* 1: do replace */
" .byte 2b - 1b\n" /* source len */
" .byte 0\n" /* replacement len */
+ " .byte 0\n" /* pad len */
".previous\n"
/* skipping size check since replacement size = 0 */
: : "i" (X86_FEATURE_ALWAYS) : : t_warn);
@@ -432,6 +434,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
" .word %P0\n" /* feature bit */
" .byte 2b - 1b\n" /* source len */
" .byte 0\n" /* replacement len */
+ " .byte 0\n" /* pad len */
".previous\n"
/* skipping size check since replacement size = 0 */
: : "i" (bit) : : t_no);
@@ -457,6 +460,7 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
" .word %P1\n" /* feature bit */
" .byte 2b - 1b\n" /* source len */
" .byte 4f - 3f\n" /* replacement len */
+ " .byte 0\n" /* pad len */
".previous\n"
".section .discard,\"aw\",@progbits\n"
" .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
@@ -483,31 +487,30 @@ static __always_inline __pure bool __static_cpu_has(u16 bit)
static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
{
#ifdef CC_HAVE_ASM_GOTO
-/*
- * We need to spell the jumps to the compiler because, depending on the offset,
- * the replacement jump can be bigger than the original jump, and this we cannot
- * have. Thus, we force the jump to the widest, 4-byte, signed relative
- * offset even though the last would often fit in less bytes.
- */
- asm_volatile_goto("1: .byte 0xe9\n .long %l[t_dynamic] - 2f\n"
+ asm_volatile_goto("1: jmp %l[t_dynamic]\n"
"2:\n"
+ ".skip -(((5f-4f) - (2b-1b)) > 0) * "
+ "((5f-4f) - (2b-1b)),0x90\n"
+ "3:\n"
".section .altinstructions,\"a\"\n"
" .long 1b - .\n" /* src offset */
- " .long 3f - .\n" /* repl offset */
+ " .long 4f - .\n" /* repl offset */
" .word %P1\n" /* always replace */
- " .byte 2b - 1b\n" /* src len */
- " .byte 4f - 3f\n" /* repl len */
+ " .byte 3b - 1b\n" /* src len */
+ " .byte 5f - 4f\n" /* repl len */
+ " .byte 3b - 2b\n" /* pad len */
".previous\n"
".section .altinstr_replacement,\"ax\"\n"
- "3: .byte 0xe9\n .long %l[t_no] - 2b\n"
- "4:\n"
+ "4: jmp %l[t_no]\n"
+ "5:\n"
".previous\n"
".section .altinstructions,\"a\"\n"
" .long 1b - .\n" /* src offset */
" .long 0\n" /* no replacement */
" .word %P0\n" /* feature bit */
- " .byte 2b - 1b\n" /* src len */
+ " .byte 3b - 1b\n" /* src len */
" .byte 0\n" /* repl len */
+ " .byte 0\n" /* pad len */
".previous\n"
: : "i" (bit), "i" (X86_FEATURE_ALWAYS)
: : t_dynamic, t_no);
@@ -527,6 +530,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
" .word %P2\n" /* always replace */
" .byte 2b - 1b\n" /* source len */
" .byte 4f - 3f\n" /* replacement len */
+ " .byte 0\n" /* pad len */
".previous\n"
".section .discard,\"aw\",@progbits\n"
" .byte 0xff + (4f-3f) - (2b-1b)\n" /* size check */
@@ -541,6 +545,7 @@ static __always_inline __pure bool _static_cpu_has_safe(u16 bit)
" .word %P1\n" /* feature bit */
" .byte 4b - 3b\n" /* src len */
" .byte 6f - 5f\n" /* repl len */
+ " .byte 0\n" /* pad len */
".previous\n"
".section .discard,\"aw\",@progbits\n"
" .byte 0xff + (6f-5f) - (4b-3b)\n" /* size check */
diff --git a/arch/x86/include/asm/desc.h b/arch/x86/include/asm/desc.h
index a94b82e8f156..a0bf89fd2647 100644
--- a/arch/x86/include/asm/desc.h
+++ b/arch/x86/include/asm/desc.h
@@ -376,11 +376,16 @@ static inline void _set_gate(int gate, unsigned type, void *addr,
* Pentium F0 0F bugfix can have resulted in the mapped
* IDT being write-protected.
*/
-#define set_intr_gate(n, addr) \
+#define set_intr_gate_notrace(n, addr) \
do { \
BUG_ON((unsigned)n > 0xFF); \
_set_gate(n, GATE_INTERRUPT, (void *)addr, 0, 0, \
__KERNEL_CS); \
+ } while (0)
+
+#define set_intr_gate(n, addr) \
+ do { \
+ set_intr_gate_notrace(n, addr); \
_trace_set_gate(n, GATE_INTERRUPT, (void *)trace_##addr,\
0, 0, __KERNEL_CS); \
} while (0)
diff --git a/arch/x86/include/asm/dwarf2.h b/arch/x86/include/asm/dwarf2.h
index f6f15986df6c..de1cdaf4d743 100644
--- a/arch/x86/include/asm/dwarf2.h
+++ b/arch/x86/include/asm/dwarf2.h
@@ -86,11 +86,23 @@
CFI_ADJUST_CFA_OFFSET 8
.endm
+ .macro pushq_cfi_reg reg
+ pushq %\reg
+ CFI_ADJUST_CFA_OFFSET 8
+ CFI_REL_OFFSET \reg, 0
+ .endm
+
.macro popq_cfi reg
popq \reg
CFI_ADJUST_CFA_OFFSET -8
.endm
+ .macro popq_cfi_reg reg
+ popq %\reg
+ CFI_ADJUST_CFA_OFFSET -8
+ CFI_RESTORE \reg
+ .endm
+
.macro pushfq_cfi
pushfq
CFI_ADJUST_CFA_OFFSET 8
@@ -116,11 +128,23 @@
CFI_ADJUST_CFA_OFFSET 4
.endm
+ .macro pushl_cfi_reg reg
+ pushl %\reg
+ CFI_ADJUST_CFA_OFFSET 4
+ CFI_REL_OFFSET \reg, 0
+ .endm
+
.macro popl_cfi reg
popl \reg
CFI_ADJUST_CFA_OFFSET -4
.endm
+ .macro popl_cfi_reg reg
+ popl %\reg
+ CFI_ADJUST_CFA_OFFSET -4
+ CFI_RESTORE \reg
+ .endm
+
.macro pushfl_cfi
pushfl
CFI_ADJUST_CFA_OFFSET 4
diff --git a/arch/x86/include/asm/insn.h b/arch/x86/include/asm/insn.h
index 47f29b1d1846..e7814b74caf8 100644
--- a/arch/x86/include/asm/insn.h
+++ b/arch/x86/include/asm/insn.h
@@ -69,7 +69,7 @@ struct insn {
const insn_byte_t *next_byte;
};
-#define MAX_INSN_SIZE 16
+#define MAX_INSN_SIZE 15
#define X86_MODRM_MOD(modrm) (((modrm) & 0xc0) >> 6)
#define X86_MODRM_REG(modrm) (((modrm) & 0x38) >> 3)
diff --git a/arch/x86/include/asm/irqflags.h b/arch/x86/include/asm/irqflags.h
index 0a8b519226b8..021bee9b86b6 100644
--- a/arch/x86/include/asm/irqflags.h
+++ b/arch/x86/include/asm/irqflags.h
@@ -171,9 +171,9 @@ static inline int arch_irqs_disabled(void)
#define ARCH_LOCKDEP_SYS_EXIT_IRQ \
TRACE_IRQS_ON; \
sti; \
- SAVE_REST; \
+ SAVE_EXTRA_REGS; \
LOCKDEP_SYS_EXIT; \
- RESTORE_REST; \
+ RESTORE_EXTRA_REGS; \
cli; \
TRACE_IRQS_OFF;
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index ec1c93588cef..572099710ba2 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -209,9 +209,24 @@ struct x86_hw_tss {
unsigned short back_link, __blh;
unsigned long sp0;
unsigned short ss0, __ss0h;
- unsigned long sp1;
- /* ss1 caches MSR_IA32_SYSENTER_CS: */
- unsigned short ss1, __ss1h;
+
+ /*
+ * We don't use ring 1, so sp1 and ss1 are convenient scratch
+ * spaces in the same cacheline as sp0. We use them to cache
+ * some MSR values to avoid unnecessary wrmsr instructions.
+ *
+ * We use SYSENTER_ESP to find sp0 and for the NMI emergency
+ * stack, but we need to context switch it because we do
+ * horrible things to the kernel stack in vm86 mode.
+ *
+ * We use SYSENTER_CS to disable sysenter in vm86 mode to avoid
+ * corrupting the stack if we went through the sysenter path
+ * from vm86 mode.
+ */
+ unsigned long sp1; /* MSR_IA32_SYSENTER_ESP */
+ unsigned short ss1; /* MSR_IA32_SYSENTER_CS */
+
+ unsigned short __ss1h;
unsigned long sp2;
unsigned short ss2, __ss2h;
unsigned long __cr3;
@@ -276,13 +291,17 @@ struct tss_struct {
unsigned long io_bitmap[IO_BITMAP_LONGS + 1];
/*
- * .. and then another 0x100 bytes for the emergency kernel stack:
+ * Space for the temporary SYSENTER stack:
*/
- unsigned long stack[64];
+ unsigned long SYSENTER_stack[64];
} ____cacheline_aligned;
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss);
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss);
+
+#ifdef CONFIG_X86_32
+DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
+#endif
/*
* Save the original ist values for checking stack pointers during debugging
@@ -474,7 +493,6 @@ struct thread_struct {
#ifdef CONFIG_X86_32
unsigned long sysenter_cs;
#else
- unsigned long usersp; /* Copy from PDA */
unsigned short es;
unsigned short ds;
unsigned short fsindex;
@@ -564,6 +582,16 @@ static inline void native_swapgs(void)
#endif
}
+static inline unsigned long current_top_of_stack(void)
+{
+#ifdef CONFIG_X86_64
+ return this_cpu_read_stable(cpu_tss.x86_tss.sp0);
+#else
+ /* sp0 on x86_32 is special in and around vm86 mode. */
+ return this_cpu_read_stable(cpu_current_top_of_stack);
+#endif
+}
+
#ifdef CONFIG_PARAVIRT
#include <asm/paravirt.h>
#else
@@ -761,10 +789,10 @@ extern char ignore_fpu_irq;
#define ARCH_HAS_SPINLOCK_PREFETCH
#ifdef CONFIG_X86_32
-# define BASE_PREFETCH ASM_NOP4
+# define BASE_PREFETCH ""
# define ARCH_HAS_PREFETCH
#else
-# define BASE_PREFETCH "prefetcht0 (%1)"
+# define BASE_PREFETCH "prefetcht0 %P1"
#endif
/*
@@ -775,10 +803,9 @@ extern char ignore_fpu_irq;
*/
static inline void prefetch(const void *x)
{
- alternative_input(BASE_PREFETCH,
- "prefetchnta (%1)",
+ alternative_input(BASE_PREFETCH, "prefetchnta %P1",
X86_FEATURE_XMM,
- "r" (x));
+ "m" (*(const char *)x));
}
/*
@@ -788,10 +815,9 @@ static inline void prefetch(const void *x)
*/
static inline void prefetchw(const void *x)
{
- alternative_input(BASE_PREFETCH,
- "prefetchw (%1)",
- X86_FEATURE_3DNOW,
- "r" (x));
+ alternative_input(BASE_PREFETCH, "prefetchw %P1",
+ X86_FEATURE_3DNOWPREFETCH,
+ "m" (*(const char *)x));
}
static inline void spin_lock_prefetch(const void *x)
@@ -799,6 +825,9 @@ static inline void spin_lock_prefetch(const void *x)
prefetchw(x);
}
+#define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \
+ TOP_OF_KERNEL_STACK_PADDING)
+
#ifdef CONFIG_X86_32
/*
* User space process size: 3GB (default).
@@ -809,39 +838,16 @@ static inline void spin_lock_prefetch(const void *x)
#define STACK_TOP_MAX STACK_TOP
#define INIT_THREAD { \
- .sp0 = sizeof(init_stack) + (long)&init_stack, \
+ .sp0 = TOP_OF_INIT_STACK, \
.vm86_info = NULL, \
.sysenter_cs = __KERNEL_CS, \
.io_bitmap_ptr = NULL, \
}
-/*
- * Note that the .io_bitmap member must be extra-big. This is because
- * the CPU will access an additional byte beyond the end of the IO
- * permission bitmap. The extra byte must be all 1 bits, and must
- * be within the limit.
- */
-#define INIT_TSS { \
- .x86_tss = { \
- .sp0 = sizeof(init_stack) + (long)&init_stack, \
- .ss0 = __KERNEL_DS, \
- .ss1 = __KERNEL_CS, \
- .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, \
- }, \
- .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, \
-}
-
extern unsigned long thread_saved_pc(struct task_struct *tsk);
-#define THREAD_SIZE_LONGS (THREAD_SIZE/sizeof(unsigned long))
-#define KSTK_TOP(info) \
-({ \
- unsigned long *__ptr = (unsigned long *)(info); \
- (unsigned long)(&__ptr[THREAD_SIZE_LONGS]); \
-})
-
/*
- * The below -8 is to reserve 8 bytes on top of the ring0 stack.
+ * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack.
* This is necessary to guarantee that the entire "struct pt_regs"
* is accessible even if the CPU haven't stored the SS/ESP registers
* on the stack (interrupt gate does not save these registers
@@ -850,11 +856,11 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
* "struct pt_regs" is possible, but they may contain the
* completely wrong values.
*/
-#define task_pt_regs(task) \
-({ \
- struct pt_regs *__regs__; \
- __regs__ = (struct pt_regs *)(KSTK_TOP(task_stack_page(task))-8); \
- __regs__ - 1; \
+#define task_pt_regs(task) \
+({ \
+ unsigned long __ptr = (unsigned long)task_stack_page(task); \
+ __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \
+ ((struct pt_regs *)__ptr) - 1; \
})
#define KSTK_ESP(task) (task_pt_regs(task)->sp)
@@ -886,11 +892,7 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
#define STACK_TOP_MAX TASK_SIZE_MAX
#define INIT_THREAD { \
- .sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
-}
-
-#define INIT_TSS { \
- .x86_tss.sp0 = (unsigned long)&init_stack + sizeof(init_stack) \
+ .sp0 = TOP_OF_INIT_STACK \
}
/*
@@ -902,11 +904,6 @@ extern unsigned long thread_saved_pc(struct task_struct *tsk);
#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1)
extern unsigned long KSTK_ESP(struct task_struct *task);
-/*
- * User space RSP while inside the SYSCALL fast path
- */
-DECLARE_PER_CPU(unsigned long, old_rsp);
-
#endif /* CONFIG_X86_64 */
extern void start_thread(struct pt_regs *regs, unsigned long new_ip,
diff --git a/arch/x86/include/asm/ptrace.h b/arch/x86/include/asm/ptrace.h
index 86fc2bb82287..83b874da2762 100644
--- a/arch/x86/include/asm/ptrace.h
+++ b/arch/x86/include/asm/ptrace.h
@@ -31,13 +31,17 @@ struct pt_regs {
#else /* __i386__ */
struct pt_regs {
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
unsigned long r15;
unsigned long r14;
unsigned long r13;
unsigned long r12;
unsigned long bp;
unsigned long bx;
-/* arguments: non interrupts/non tracing syscalls only save up to here*/
+/* These regs are callee-clobbered. Always saved on kernel entry. */
unsigned long r11;
unsigned long r10;
unsigned long r9;
@@ -47,9 +51,12 @@ struct pt_regs {
unsigned long dx;
unsigned long si;
unsigned long di;
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
unsigned long orig_ax;
-/* end of arguments */
-/* cpu exception frame or undefined */
+/* Return frame for iretq */
unsigned long ip;
unsigned long cs;
unsigned long flags;
@@ -138,12 +145,8 @@ static inline bool user_64bit_mode(struct pt_regs *regs)
#endif
}
-#define current_user_stack_pointer() this_cpu_read(old_rsp)
-/* ia32 vs. x32 difference */
-#define compat_user_stack_pointer() \
- (test_thread_flag(TIF_IA32) \
- ? current_pt_regs()->sp \
- : this_cpu_read(old_rsp))
+#define current_user_stack_pointer() current_pt_regs()->sp
+#define compat_user_stack_pointer() current_pt_regs()->sp
#endif
#ifdef CONFIG_X86_32
@@ -248,7 +251,7 @@ static inline unsigned long regs_get_kernel_stack_nth(struct pt_regs *regs,
*/
#define arch_ptrace_stop_needed(code, info) \
({ \
- set_thread_flag(TIF_NOTIFY_RESUME); \
+ force_iret(); \
false; \
})
diff --git a/arch/x86/include/asm/sigcontext.h b/arch/x86/include/asm/sigcontext.h
index 9dfce4e0417d..6fe6b182c998 100644
--- a/arch/x86/include/asm/sigcontext.h
+++ b/arch/x86/include/asm/sigcontext.h
@@ -57,9 +57,9 @@ struct sigcontext {
unsigned long ip;
unsigned long flags;
unsigned short cs;
- unsigned short gs;
- unsigned short fs;
- unsigned short __pad0;
+ unsigned short __pad2; /* Was called gs, but was always zero. */
+ unsigned short __pad1; /* Was called fs, but was always zero. */
+ unsigned short ss;
unsigned long err;
unsigned long trapno;
unsigned long oldmask;
diff --git a/arch/x86/include/asm/smap.h b/arch/x86/include/asm/smap.h
index 8d3120f4e270..ba665ebd17bb 100644
--- a/arch/x86/include/asm/smap.h
+++ b/arch/x86/include/asm/smap.h
@@ -27,23 +27,11 @@
#ifdef CONFIG_X86_SMAP
-#define ASM_CLAC \
- 661: ASM_NOP3 ; \
- .pushsection .altinstr_replacement, "ax" ; \
- 662: __ASM_CLAC ; \
- .popsection ; \
- .pushsection .altinstructions, "a" ; \
- altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \
- .popsection
-
-#define ASM_STAC \
- 661: ASM_NOP3 ; \
- .pushsection .altinstr_replacement, "ax" ; \
- 662: __ASM_STAC ; \
- .popsection ; \
- .pushsection .altinstructions, "a" ; \
- altinstruction_entry 661b, 662b, X86_FEATURE_SMAP, 3, 3 ; \
- .popsection
+#define ASM_CLAC \
+ ALTERNATIVE "", __stringify(__ASM_CLAC), X86_FEATURE_SMAP
+
+#define ASM_STAC \
+ ALTERNATIVE "", __stringify(__ASM_STAC), X86_FEATURE_SMAP
#else /* CONFIG_X86_SMAP */
@@ -61,20 +49,20 @@
static __always_inline void clac(void)
{
/* Note: a barrier is implicit in alternative() */
- alternative(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP);
+ alternative("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP);
}
static __always_inline void stac(void)
{
/* Note: a barrier is implicit in alternative() */
- alternative(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP);
+ alternative("", __stringify(__ASM_STAC), X86_FEATURE_SMAP);
}
/* These macros can be used in asm() statements */
#define ASM_CLAC \
- ALTERNATIVE(ASM_NOP3, __stringify(__ASM_CLAC), X86_FEATURE_SMAP)
+ ALTERNATIVE("", __stringify(__ASM_CLAC), X86_FEATURE_SMAP)
#define ASM_STAC \
- ALTERNATIVE(ASM_NOP3, __stringify(__ASM_STAC), X86_FEATURE_SMAP)
+ ALTERNATIVE("", __stringify(__ASM_STAC), X86_FEATURE_SMAP)
#else /* CONFIG_X86_SMAP */
diff --git a/arch/x86/include/asm/special_insns.h b/arch/x86/include/asm/special_insns.h
index 6a4b00fafb00..2ec1a5392542 100644
--- a/arch/x86/include/asm/special_insns.h
+++ b/arch/x86/include/asm/special_insns.h
@@ -4,6 +4,8 @@
#ifdef __KERNEL__
+#include <asm/nops.h>
+
static inline void native_clts(void)
{
asm volatile("clts");
@@ -199,6 +201,14 @@ static inline void clflushopt(volatile void *__p)
"+m" (*(volatile char __force *)__p));
}
+static inline void pcommit_sfence(void)
+{
+ alternative(ASM_NOP7,
+ ".byte 0x66, 0x0f, 0xae, 0xf8\n\t" /* pcommit */
+ "sfence",
+ X86_FEATURE_PCOMMIT);
+}
+
#define nop() asm volatile ("nop")
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 1d4e4f279a32..0abf7ab20ce2 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -13,6 +13,33 @@
#include <asm/types.h>
/*
+ * TOP_OF_KERNEL_STACK_PADDING is a number of unused bytes that we
+ * reserve at the top of the kernel stack. We do it because of a nasty
+ * 32-bit corner case. On x86_32, the hardware stack frame is
+ * variable-length. Except for vm86 mode, struct pt_regs assumes a
+ * maximum-length frame. If we enter from CPL 0, the top 8 bytes of
+ * pt_regs don't actually exist. Ordinarily this doesn't matter, but it
+ * does in at least one case:
+ *
+ * If we take an NMI early enough in SYSENTER, then we can end up with
+ * pt_regs that extends above sp0. On the way out, in the espfix code,
+ * we can read the saved SS value, but that value will be above sp0.
+ * Without this offset, that can result in a page fault. (We are
+ * careful that, in this case, the value we read doesn't matter.)
+ *
+ * In vm86 mode, the hardware frame is much longer still, but we neither
+ * access the extra members from NMI context, nor do we write such a
+ * frame at sp0 at all.
+ *
+ * x86_64 has a fixed-length stack frame.
+ */
+#ifdef CONFIG_X86_32
+# define TOP_OF_KERNEL_STACK_PADDING 8
+#else
+# define TOP_OF_KERNEL_STACK_PADDING 0
+#endif
+
+/*
* low level task data that entry.S needs immediate access to
* - this struct should fit entirely inside of one cache line
* - this struct shares the supervisor stack pages
@@ -158,10 +185,7 @@ DECLARE_PER_CPU(unsigned long, kernel_stack);
static inline struct thread_info *current_thread_info(void)
{
- struct thread_info *ti;
- ti = (void *)(this_cpu_read_stable(kernel_stack) +
- KERNEL_STACK_OFFSET - THREAD_SIZE);
- return ti;
+ return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE);
}
static inline unsigned long current_stack_pointer(void)
@@ -236,6 +260,16 @@ static inline bool is_ia32_task(void)
#endif
return false;
}
+
+/*
+ * Force syscall return via IRET by making it look as if there was
+ * some work pending. IRET is our most capable (but slowest) syscall
+ * return path, which is able to restore modified SS, CS and certain
+ * EFLAGS values that other (fast) syscall return instructions
+ * are not able to restore properly.
+ */
+#define force_iret() set_thread_flag(TIF_NOTIFY_RESUME)
+
#endif /* !__ASSEMBLY__ */
#ifndef __ASSEMBLY__
diff --git a/arch/x86/include/uapi/asm/ptrace-abi.h b/arch/x86/include/uapi/asm/ptrace-abi.h
index 7b0a55a88851..580aee3072e0 100644
--- a/arch/x86/include/uapi/asm/ptrace-abi.h
+++ b/arch/x86/include/uapi/asm/ptrace-abi.h
@@ -25,13 +25,17 @@
#else /* __i386__ */
#if defined(__ASSEMBLY__) || defined(__FRAME_OFFSETS)
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
#define R15 0
#define R14 8
#define R13 16
#define R12 24
#define RBP 32
#define RBX 40
-/* arguments: interrupts/non tracing syscalls only save up to here*/
+/* These regs are callee-clobbered. Always saved on kernel entry. */
#define R11 48
#define R10 56
#define R9 64
@@ -41,15 +45,17 @@
#define RDX 96
#define RSI 104
#define RDI 112
-#define ORIG_RAX 120 /* = ERROR */
-/* end of arguments */
-/* cpu exception frame or undefined in case of fast syscall. */
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
+#define ORIG_RAX 120
+/* Return frame for iretq */
#define RIP 128
#define CS 136
#define EFLAGS 144
#define RSP 152
#define SS 160
-#define ARGOFFSET R11
#endif /* __ASSEMBLY__ */
/* top of stack page */
diff --git a/arch/x86/include/uapi/asm/ptrace.h b/arch/x86/include/uapi/asm/ptrace.h
index ac4b9aa4d999..bc16115af39b 100644
--- a/arch/x86/include/uapi/asm/ptrace.h
+++ b/arch/x86/include/uapi/asm/ptrace.h
@@ -41,13 +41,17 @@ struct pt_regs {
#ifndef __KERNEL__
struct pt_regs {
+/*
+ * C ABI says these regs are callee-preserved. They aren't saved on kernel entry
+ * unless syscall needs a complete, fully filled "struct pt_regs".
+ */
unsigned long r15;
unsigned long r14;
unsigned long r13;
unsigned long r12;
unsigned long rbp;
unsigned long rbx;
-/* arguments: non interrupts/non tracing syscalls only save up to here*/
+/* These regs are callee-clobbered. Always saved on kernel entry. */
unsigned long r11;
unsigned long r10;
unsigned long r9;
@@ -57,9 +61,12 @@ struct pt_regs {
unsigned long rdx;
unsigned long rsi;
unsigned long rdi;
+/*
+ * On syscall entry, this is syscall#. On CPU exception, this is error code.
+ * On hw interrupt, it's IRQ number:
+ */
unsigned long orig_rax;
-/* end of arguments */
-/* cpu exception frame or undefined */
+/* Return frame for iretq */
unsigned long rip;
unsigned long cs;
unsigned long eflags;
diff --git a/arch/x86/include/uapi/asm/sigcontext.h b/arch/x86/include/uapi/asm/sigcontext.h
index d8b9f9081e86..16dc4e8a2cd3 100644
--- a/arch/x86/include/uapi/asm/sigcontext.h
+++ b/arch/x86/include/uapi/asm/sigcontext.h
@@ -177,9 +177,24 @@ struct sigcontext {
__u64 rip;
__u64 eflags; /* RFLAGS */
__u16 cs;
- __u16 gs;
- __u16 fs;
- __u16 __pad0;
+
+ /*
+ * Prior to 2.5.64 ("[PATCH] x86-64 updates for 2.5.64-bk3"),
+ * Linux saved and restored fs and gs in these slots. This
+ * was counterproductive, as fsbase and gsbase were never
+ * saved, so arch_prctl was presumably unreliable.
+ *
+ * If these slots are ever needed for any other purpose, there
+ * is some risk that very old 64-bit binaries could get
+ * confused. I doubt that many such binaries still work,
+ * though, since the same patch in 2.5.64 also removed the
+ * 64-bit set_thread_area syscall, so it appears that there is
+ * no TLS API that works in both pre- and post-2.5.64 kernels.
+ */
+ __u16 __pad2; /* Was gs. */
+ __u16 __pad1; /* Was fs. */
+
+ __u16 ss;
__u64 err;
__u64 trapno;
__u64 oldmask;
diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile
index cdb1b70ddad0..c887cd944f0c 100644
--- a/arch/x86/kernel/Makefile
+++ b/arch/x86/kernel/Makefile
@@ -32,6 +32,7 @@ obj-$(CONFIG_X86_32) += i386_ksyms_32.o
obj-$(CONFIG_X86_64) += sys_x86_64.o x8664_ksyms_64.o
obj-$(CONFIG_X86_64) += mcount_64.o
obj-y += syscall_$(BITS).o vsyscall_gtod.o
+obj-$(CONFIG_IA32_EMULATION) += syscall_32.o
obj-$(CONFIG_X86_VSYSCALL_EMULATION) += vsyscall_64.o vsyscall_emu_64.o
obj-$(CONFIG_X86_ESPFIX64) += espfix_64.o
obj-$(CONFIG_SYSFS) += ksysfs.o
diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 703130f469ec..af397cc98d05 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -52,10 +52,25 @@ static int __init setup_noreplace_paravirt(char *str)
__setup("noreplace-paravirt", setup_noreplace_paravirt);
#endif
-#define DPRINTK(fmt, ...) \
-do { \
- if (debug_alternative) \
- printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
+#define DPRINTK(fmt, args...) \
+do { \
+ if (debug_alternative) \
+ printk(KERN_DEBUG "%s: " fmt "\n", __func__, ##args); \
+} while (0)
+
+#define DUMP_BYTES(buf, len, fmt, args...) \
+do { \
+ if (unlikely(debug_alternative)) { \
+ int j; \
+ \
+ if (!(len)) \
+ break; \
+ \
+ printk(KERN_DEBUG fmt, ##args); \
+ for (j = 0; j < (len) - 1; j++) \
+ printk(KERN_CONT "%02hhx ", buf[j]); \
+ printk(KERN_CONT "%02hhx\n", buf[j]); \
+ } \
} while (0)
/*
@@ -243,12 +258,86 @@ extern struct alt_instr __alt_instructions[], __alt_instructions_end[];
extern s32 __smp_locks[], __smp_locks_end[];
void *text_poke_early(void *addr, const void *opcode, size_t len);
-/* Replace instructions with better alternatives for this CPU type.
- This runs before SMP is initialized to avoid SMP problems with
- self modifying code. This implies that asymmetric systems where
- APs have less capabilities than the boot processor are not handled.
- Tough. Make sure you disable such features by hand. */
+/*
+ * Are we looking at a near JMP with a 1 or 4-byte displacement.
+ */
+static inline bool is_jmp(const u8 opcode)
+{
+ return opcode == 0xeb || opcode == 0xe9;
+}
+
+static void __init_or_module
+recompute_jump(struct alt_instr *a, u8 *orig_insn, u8 *repl_insn, u8 *insnbuf)
+{
+ u8 *next_rip, *tgt_rip;
+ s32 n_dspl, o_dspl;
+ int repl_len;
+
+ if (a->replacementlen != 5)
+ return;
+
+ o_dspl = *(s32 *)(insnbuf + 1);
+
+ /* next_rip of the replacement JMP */
+ next_rip = repl_insn + a->replacementlen;
+ /* target rip of the replacement JMP */
+ tgt_rip = next_rip + o_dspl;
+ n_dspl = tgt_rip - orig_insn;
+
+ DPRINTK("target RIP: %p, new_displ: 0x%x", tgt_rip, n_dspl);
+
+ if (tgt_rip - orig_insn >= 0) {
+ if (n_dspl - 2 <= 127)
+ goto two_byte_jmp;
+ else
+ goto five_byte_jmp;
+ /* negative offset */
+ } else {
+ if (((n_dspl - 2) & 0xff) == (n_dspl - 2))
+ goto two_byte_jmp;
+ else
+ goto five_byte_jmp;
+ }
+
+two_byte_jmp:
+ n_dspl -= 2;
+
+ insnbuf[0] = 0xeb;
+ insnbuf[1] = (s8)n_dspl;
+ add_nops(insnbuf + 2, 3);
+
+ repl_len = 2;
+ goto done;
+
+five_byte_jmp:
+ n_dspl -= 5;
+ insnbuf[0] = 0xe9;
+ *(s32 *)&insnbuf[1] = n_dspl;
+
+ repl_len = 5;
+
+done:
+
+ DPRINTK("final displ: 0x%08x, JMP 0x%lx",
+ n_dspl, (unsigned long)orig_insn + n_dspl + repl_len);
+}
+
+static void __init_or_module optimize_nops(struct alt_instr *a, u8 *instr)
+{
+ add_nops(instr + (a->instrlen - a->padlen), a->padlen);
+
+ DUMP_BYTES(instr, a->instrlen, "%p: [%d:%d) optimized NOPs: ",
+ instr, a->instrlen - a->padlen, a->padlen);
+}
+
+/*
+ * Replace instructions with better alternatives for this CPU type. This runs
+ * before SMP is initialized to avoid SMP problems with self modifying code.
+ * This implies that asymmetric systems where APs have less capabilities than
+ * the boot processor are not handled. Tough. Make sure you disable such
+ * features by hand.
+ */
void __init_or_module apply_alternatives(struct alt_instr *start,
struct alt_instr *end)
{
@@ -256,10 +345,10 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
u8 *instr, *replacement;
u8 insnbuf[MAX_PATCH_LEN];
- DPRINTK("%s: alt table %p -> %p\n", __func__, start, end);
+ DPRINTK("alt table %p -> %p", start, end);
/*
* The scan order should be from start to end. A later scanned
- * alternative code can overwrite a previous scanned alternative code.
+ * alternative code can overwrite previously scanned alternative code.
* Some kernel functions (e.g. memcpy, memset, etc) use this order to
* patch code.
*
@@ -267,29 +356,54 @@ void __init_or_module apply_alternatives(struct alt_instr *start,
* order.
*/
for (a = start; a < end; a++) {
+ int insnbuf_sz = 0;
+
instr = (u8 *)&a->instr_offset + a->instr_offset;
replacement = (u8 *)&a->repl_offset + a->repl_offset;
- BUG_ON(a->replacementlen > a->instrlen);
BUG_ON(a->instrlen > sizeof(insnbuf));
BUG_ON(a->cpuid >= (NCAPINTS + NBUGINTS) * 32);
- if (!boot_cpu_has(a->cpuid))
+ if (!boot_cpu_has(a->cpuid)) {
+ if (a->padlen > 1)
+ optimize_nops(a, instr);
+
continue;
+ }
+
+ DPRINTK("feat: %d*32+%d, old: (%p, len: %d), repl: (%p, len: %d)",
+ a->cpuid >> 5,
+ a->cpuid & 0x1f,
+ instr, a->instrlen,
+ replacement, a->replacementlen);
+
+ DUMP_BYTES(instr, a->instrlen, "%p: old_insn: ", instr);
+ DUMP_BYTES(replacement, a->replacementlen, "%p: rpl_insn: ", replacement);
memcpy(insnbuf, replacement, a->replacementlen);
+ insnbuf_sz = a->replacementlen;
/* 0xe8 is a relative jump; fix the offset. */
- if (*insnbuf == 0xe8 && a->replacementlen == 5)
- *(s32 *)(insnbuf + 1) += replacement - instr;
+ if (*insnbuf == 0xe8 && a->replacementlen == 5) {
+ *(s32 *)(insnbuf + 1) += replacement - instr;
+ DPRINTK("Fix CALL offset: 0x%x, CALL 0x%lx",
+ *(s32 *)(insnbuf + 1),
+ (unsigned long)instr + *(s32 *)(insnbuf + 1) + 5);
+ }
+
+ if (a->replacementlen && is_jmp(replacement[0]))
+ recompute_jump(a, instr, replacement, insnbuf);
- add_nops(insnbuf + a->replacementlen,
- a->instrlen - a->replacementlen);
+ if (a->instrlen > a->replacementlen) {
+ add_nops(insnbuf + a->replacementlen,
+ a->instrlen - a->replacementlen);
+ insnbuf_sz += a->instrlen - a->replacementlen;
+ }
+ DUMP_BYTES(insnbuf, insnbuf_sz, "%p: final_insn: ", instr);
- text_poke_early(instr, insnbuf, a->instrlen);
+ text_poke_early(instr, insnbuf, insnbuf_sz);
}
}
#ifdef CONFIG_SMP
-
static void alternatives_smp_lock(const s32 *start, const s32 *end,
u8 *text, u8 *text_end)
{
@@ -371,8 +485,8 @@ void __init_or_module alternatives_smp_module_add(struct module *mod,
smp->locks_end = locks_end;
smp->text = text;
smp->text_end = text_end;
- DPRINTK("%s: locks %p -> %p, text %p -> %p, name %s\n",
- __func__, smp->locks, smp->locks_end,
+ DPRINTK("locks %p -> %p, text %p -> %p, name %s\n",
+ smp->locks, smp->locks_end,
smp->text, smp->text_end, smp->name);
list_add_tail(&smp->next, &smp_alt_modules);
@@ -440,7 +554,7 @@ int alternatives_text_reserved(void *start, void *end)
return 0;
}
-#endif
+#endif /* CONFIG_SMP */
#ifdef CONFIG_PARAVIRT
void __init_or_module apply_paravirt(struct paravirt_patch_site *start,
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 3b3b9d33ac1d..47703aed74cf 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -68,7 +68,7 @@ void foo(void)
/* Offset from the sysenter stack to tss.sp0 */
DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) -
- sizeof(struct tss_struct));
+ offsetofend(struct tss_struct, SYSENTER_stack));
#if defined(CONFIG_LGUEST) || defined(CONFIG_LGUEST_GUEST) || defined(CONFIG_LGUEST_MODULE)
BLANK();
diff --git a/arch/x86/kernel/asm-offsets_64.c b/arch/x86/kernel/asm-offsets_64.c
index fdcbb4d27c9f..5ce6f2da8763 100644
--- a/arch/x86/kernel/asm-offsets_64.c
+++ b/arch/x86/kernel/asm-offsets_64.c
@@ -81,6 +81,7 @@ int main(void)
#undef ENTRY
OFFSET(TSS_ist, tss_struct, x86_tss.ist);
+ OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
BLANK();
DEFINE(__NR_syscall_max, sizeof(syscalls_64) - 1);
diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c
index a220239cea65..dd9e50500297 100644
--- a/arch/x86/kernel/cpu/amd.c
+++ b/arch/x86/kernel/cpu/amd.c
@@ -711,6 +711,11 @@ static void init_amd(struct cpuinfo_x86 *c)
set_cpu_bug(c, X86_BUG_AMD_APIC_C1E);
rdmsr_safe(MSR_AMD64_PATCH_LEVEL, &c->microcode, &dummy);
+
+ /* 3DNow or LM implies PREFETCHW */
+ if (!cpu_has(c, X86_FEATURE_3DNOWPREFETCH))
+ if (cpu_has(c, X86_FEATURE_3DNOW) || cpu_has(c, X86_FEATURE_LM))
+ set_cpu_cap(c, X86_FEATURE_3DNOWPREFETCH);
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 2346c95c6ab1..d79f139871e0 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -960,37 +960,53 @@ static void identify_cpu(struct cpuinfo_x86 *c)
}
#ifdef CONFIG_X86_64
-#ifdef CONFIG_IA32_EMULATION
+# ifdef CONFIG_IA32_EMULATION
/* May not be __init: called during resume */
static void syscall32_cpu_init(void)
{
- /* Load these always in case some future AMD CPU supports
- SYSENTER from compat mode too. */
+ /*
+ * Always load these, in case some future 64-bit CPU supports
+ * SYSENTER from compat mode too:
+ */
wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS);
wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL);
wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)ia32_sysenter_target);
wrmsrl(MSR_CSTAR, ia32_cstar_target);
}
-#endif /* CONFIG_IA32_EMULATION */
-#endif /* CONFIG_X86_64 */
+# endif
+#endif
+/*
+ * Set up the CPU state needed to execute SYSENTER/SYSEXIT instructions
+ * on 32-bit kernels:
+ */
#ifdef CONFIG_X86_32
void enable_sep_cpu(void)
{
- int cpu = get_cpu();
- struct tss_struct *tss = &per_cpu(init_tss, cpu);
+ struct tss_struct *tss;
+ int cpu;
- if (!boot_cpu_has(X86_FEATURE_SEP)) {
- put_cpu();
- return;
- }
+ cpu = get_cpu();
+ tss = &per_cpu(cpu_tss, cpu);
+
+ if (!boot_cpu_has(X86_FEATURE_SEP))
+ goto out;
+
+ /*
+ * The struct::SS1 and tss_struct::SP1 fields are not used by the hardware,
+ * we cache the SYSENTER CS and ESP values there for easy access:
+ */
tss->x86_tss.ss1 = __KERNEL_CS;
- tss->x86_tss.sp1 = sizeof(struct tss_struct) + (unsigned long) tss;
- wrmsr(MSR_IA32_SYSENTER_CS, __KERNEL_CS, 0);
+ wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0);
+
+ tss->x86_tss.sp1 = (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack);
wrmsr(MSR_IA32_SYSENTER_ESP, tss->x86_tss.sp1, 0);
- wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long) ia32_sysenter_target, 0);
+
+ wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)ia32_sysenter_target, 0);
+
+out:
put_cpu();
}
#endif
@@ -1130,8 +1146,8 @@ DEFINE_PER_CPU_FIRST(union irq_stack_union,
irq_stack_union) __aligned(PAGE_SIZE) __visible;
/*
- * The following four percpu variables are hot. Align current_task to
- * cacheline size such that all four fall in the same cacheline.
+ * The following percpu variables are hot. Align current_task to
+ * cacheline size such that they fall in the same cacheline.
*/
DEFINE_PER_CPU(struct task_struct *, current_task) ____cacheline_aligned =
&init_task;
@@ -1226,6 +1242,15 @@ DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
EXPORT_PER_CPU_SYMBOL(__preempt_count);
DEFINE_PER_CPU(struct task_struct *, fpu_owner_task);
+/*
+ * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find
+ * the top of the kernel stack. Use an extra percpu variable to track the
+ * top of the kernel stack directly.
+ */
+DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) =
+ (unsigned long)&init_thread_union + THREAD_SIZE;
+EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
+
#ifdef CONFIG_CC_STACKPROTECTOR
DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
#endif
@@ -1307,7 +1332,7 @@ void cpu_init(void)
*/
load_ucode_ap();
- t = &per_cpu(init_tss, cpu);
+ t = &per_cpu(cpu_tss, cpu);
oist = &per_cpu(orig_ist, cpu);
#ifdef CONFIG_NUMA
@@ -1391,7 +1416,7 @@ void cpu_init(void)
{
int cpu = smp_processor_id();
struct task_struct *curr = current;
- struct tss_struct *t = &per_cpu(init_tss, cpu);
+ struct tss_struct *t = &per_cpu(cpu_tss, cpu);
struct thread_struct *thread = &curr->thread;
wait_for_master_cpu(cpu);
diff --git a/arch/x86/kernel/cpu/perf_event.c b/arch/x86/kernel/cpu/perf_event.c
index b71a7f86d68a..979963bb3977 100644
--- a/arch/x86/kernel/cpu/perf_event.c
+++ b/arch/x86/kernel/cpu/perf_event.c
@@ -2161,10 +2161,9 @@ static unsigned long code_segment_base(struct pt_regs *regs)
if (user_mode(regs) && regs->cs != __USER_CS)
return get_segment_base(regs->cs);
#else
- if (test_thread_flag(TIF_IA32)) {
- if (user_mode(regs) && regs->cs != __USER32_CS)
- return get_segment_base(regs->cs);
- }
+ if (user_mode(regs) && !user_64bit_mode(regs) &&
+ regs->cs != __USER32_CS)
+ return get_segment_base(regs->cs);
#endif
return 0;
}
diff --git a/arch/x86/kernel/entry_32.S b/arch/x86/kernel/entry_32.S
index 31e2d5bf3e38..4c8cc34e6d68 100644
--- a/arch/x86/kernel/entry_32.S
+++ b/arch/x86/kernel/entry_32.S
@@ -398,7 +398,7 @@ sysenter_past_esp:
* A tiny bit of offset fixup is necessary - 4*4 means the 4 words
* pushed above; +8 corresponds to copy_thread's esp0 setting.
*/
- pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+8+4*4)(%esp)
+ pushl_cfi ((TI_sysenter_return)-THREAD_SIZE+TOP_OF_KERNEL_STACK_PADDING+4*4)(%esp)
CFI_REL_OFFSET eip, 0
pushl_cfi %eax
@@ -816,15 +816,9 @@ ENTRY(simd_coprocessor_error)
pushl_cfi $0
#ifdef CONFIG_X86_INVD_BUG
/* AMD 486 bug: invd from userspace calls exception 19 instead of #GP */
-661: pushl_cfi $do_general_protection
-662:
-.section .altinstructions,"a"
- altinstruction_entry 661b, 663f, X86_FEATURE_XMM, 662b-661b, 664f-663f
-.previous
-.section .altinstr_replacement,"ax"
-663: pushl $do_simd_coprocessor_error
-664:
-.previous
+ ALTERNATIVE "pushl_cfi $do_general_protection", \
+ "pushl $do_simd_coprocessor_error", \
+ X86_FEATURE_XMM
#else
pushl_cfi $do_simd_coprocessor_error
#endif
@@ -1240,20 +1234,13 @@ error_code:
/*CFI_REL_OFFSET es, 0*/
pushl_cfi %ds
/*CFI_REL_OFFSET ds, 0*/
- pushl_cfi %eax
- CFI_REL_OFFSET eax, 0
- pushl_cfi %ebp
- CFI_REL_OFFSET ebp, 0
- pushl_cfi %edi
- CFI_REL_OFFSET edi, 0
- pushl_cfi %esi
- CFI_REL_OFFSET esi, 0
- pushl_cfi %edx
- CFI_REL_OFFSET edx, 0
- pushl_cfi %ecx
- CFI_REL_OFFSET ecx, 0
- pushl_cfi %ebx
- CFI_REL_OFFSET ebx, 0
+ pushl_cfi_reg eax
+ pushl_cfi_reg ebp
+ pushl_cfi_reg edi
+ pushl_cfi_reg esi
+ pushl_cfi_reg edx
+ pushl_cfi_reg ecx
+ pushl_cfi_reg ebx
cld
movl $(__KERNEL_PERCPU), %ecx
movl %ecx, %fs
diff --git a/arch/x86/kernel/entry_64.S b/arch/x86/kernel/entry_64.S
index 1d74d161687c..0c91256d73df 100644
--- a/arch/x86/kernel/entry_64.S
+++ b/arch/x86/kernel/entry_64.S
@@ -14,24 +14,13 @@
* NOTE: This code handles signal-recognition, which happens every time
* after an interrupt and after each system call.
*
- * Normal syscalls and interrupts don't save a full stack frame, this is
- * only done for syscall tracing, signals or fork/exec et.al.
- *
* A note on terminology:
- * - top of stack: Architecture defined interrupt frame from SS to RIP
+ * - iret frame: Architecture defined interrupt frame from SS to RIP
* at the top of the kernel process stack.
- * - partial stack frame: partially saved registers up to R11.
- * - full stack frame: Like partial stack frame, but all register saved.
*
* Some macro usage:
* - CFI macros are used to generate dwarf2 unwind information for better
* backtraces. They don't change any code.
- * - SAVE_ALL/RESTORE_ALL - Save/restore all registers
- * - SAVE_ARGS/RESTORE_ARGS - Save/restore registers that C functions modify.
- * There are unfortunately lots of special cases where some registers
- * not touched. The macro is a big mess that should be cleaned up.
- * - SAVE_REST/RESTORE_REST - Handle the registers not saved by SAVE_ARGS.
- * Gives a full stack frame.
* - ENTRY/END Define functions in the symbol table.
* - FIXUP_TOP_OF_STACK/RESTORE_TOP_OF_STACK - Fix up the hardware stack
* frame that is otherwise undefined after a SYSCALL
@@ -82,9 +71,9 @@ ENDPROC(native_usergs_sysret64)
#endif /* CONFIG_PARAVIRT */
-.macro TRACE_IRQS_IRETQ offset=ARGOFFSET
+.macro TRACE_IRQS_IRETQ
#ifdef CONFIG_TRACE_IRQFLAGS
- bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
+ bt $9,EFLAGS(%rsp) /* interrupts off? */
jnc 1f
TRACE_IRQS_ON
1:
@@ -116,8 +105,8 @@ ENDPROC(native_usergs_sysret64)
call debug_stack_reset
.endm
-.macro TRACE_IRQS_IRETQ_DEBUG offset=ARGOFFSET
- bt $9,EFLAGS-\offset(%rsp) /* interrupts off? */
+.macro TRACE_IRQS_IRETQ_DEBUG
+ bt $9,EFLAGS(%rsp) /* interrupts off? */
jnc 1f
TRACE_IRQS_ON_DEBUG
1:
@@ -130,34 +119,27 @@ ENDPROC(native_usergs_sysret64)
#endif
/*
- * C code is not supposed to know about undefined top of stack. Every time
- * a C function with an pt_regs argument is called from the SYSCALL based
- * fast path FIXUP_TOP_OF_STACK is needed.
+ * C code is not supposed to know that the iret frame is not populated.
+ * Every time a C function with an pt_regs argument is called from
+ * the SYSCALL based fast path FIXUP_TOP_OF_STACK is needed.
* RESTORE_TOP_OF_STACK syncs the syscall state after any possible ptregs
* manipulation.
*/
-
- /* %rsp:at FRAMEEND */
.macro FIXUP_TOP_OF_STACK tmp offset=0
- movq PER_CPU_VAR(old_rsp),\tmp
- movq \tmp,RSP+\offset(%rsp)
movq $__USER_DS,SS+\offset(%rsp)
movq $__USER_CS,CS+\offset(%rsp)
movq RIP+\offset(%rsp),\tmp /* get rip */
movq \tmp,RCX+\offset(%rsp) /* copy it to rcx as sysret would do */
- movq R11+\offset(%rsp),\tmp /* get eflags */
- movq \tmp,EFLAGS+\offset(%rsp)
+ movq EFLAGS+\offset(%rsp),\tmp /* ditto for rflags->r11 */
+ movq \tmp,R11+\offset(%rsp)
.endm
.macro RESTORE_TOP_OF_STACK tmp offset=0
- movq RSP+\offset(%rsp),\tmp
- movq \tmp,PER_CPU_VAR(old_rsp)
- movq EFLAGS+\offset(%rsp),\tmp
- movq \tmp,R11+\offset(%rsp)
+ /* nothing to do */
.endm
/*
- * initial frame state for interrupts (and exceptions without error code)
+ * empty frame
*/
.macro EMPTY_FRAME start=1 offset=0
.if \start
@@ -173,12 +155,12 @@ ENDPROC(native_usergs_sysret64)
* initial frame state for interrupts (and exceptions without error code)
*/
.macro INTR_FRAME start=1 offset=0
- EMPTY_FRAME \start, SS+8+\offset-RIP
- /*CFI_REL_OFFSET ss, SS+\offset-RIP*/
- CFI_REL_OFFSET rsp, RSP+\offset-RIP
- /*CFI_REL_OFFSET rflags, EFLAGS+\offset-RIP*/
- /*CFI_REL_OFFSET cs, CS+\offset-RIP*/
- CFI_REL_OFFSET rip, RIP+\offset-RIP
+ EMPTY_FRAME \start, 5*8+\offset
+ /*CFI_REL_OFFSET ss, 4*8+\offset*/
+ CFI_REL_OFFSET rsp, 3*8+\offset
+ /*CFI_REL_OFFSET rflags, 2*8+\offset*/
+ /*CFI_REL_OFFSET cs, 1*8+\offset*/
+ CFI_REL_OFFSET rip, 0*8+\offset
.endm
/*
@@ -186,30 +168,23 @@ ENDPROC(native_usergs_sysret64)
* with vector already pushed)
*/
.macro XCPT_FRAME start=1 offset=0
- INTR_FRAME \start, RIP+\offset-ORIG_RAX
- .endm
-
-/*
- * frame that enables calling into C.
- */
- .macro PARTIAL_FRAME start=1 offset=0
- XCPT_FRAME \start, ORIG_RAX+\offset-ARGOFFSET
- CFI_REL_OFFSET rdi, RDI+\offset-ARGOFFSET
- CFI_REL_OFFSET rsi, RSI+\offset-ARGOFFSET
- CFI_REL_OFFSET rdx, RDX+\offset-ARGOFFSET
- CFI_REL_OFFSET rcx, RCX+\offset-ARGOFFSET
- CFI_REL_OFFSET rax, RAX+\offset-ARGOFFSET
- CFI_REL_OFFSET r8, R8+\offset-ARGOFFSET
- CFI_REL_OFFSET r9, R9+\offset-ARGOFFSET
- CFI_REL_OFFSET r10, R10+\offset-ARGOFFSET
- CFI_REL_OFFSET r11, R11+\offset-ARGOFFSET
+ INTR_FRAME \start, 1*8+\offset
.endm
/*
* frame that enables passing a complete pt_regs to a C function.
*/
.macro DEFAULT_FRAME start=1 offset=0
- PARTIAL_FRAME \start, R11+\offset-R15
+ XCPT_FRAME \start, ORIG_RAX+\offset
+ CFI_REL_OFFSET rdi, RDI+\offset
+ CFI_REL_OFFSET rsi, RSI+\offset
+ CFI_REL_OFFSET rdx, RDX+\offset
+ CFI_REL_OFFSET rcx, RCX+\offset
+ CFI_REL_OFFSET rax, RAX+\offset
+ CFI_REL_OFFSET r8, R8+\offset
+ CFI_REL_OFFSET r9, R9+\offset
+ CFI_REL_OFFSET r10, R10+\offset
+ CFI_REL_OFFSET r11, R11+\offset
CFI_REL_OFFSET rbx, RBX+\offset
CFI_REL_OFFSET rbp, RBP+\offset
CFI_REL_OFFSET r12, R12+\offset
@@ -218,105 +193,31 @@ ENDPROC(native_usergs_sysret64)
CFI_REL_OFFSET r15, R15+\offset
.endm
-ENTRY(save_paranoid)
- XCPT_FRAME 1 RDI+8
- cld
- movq %rdi, RDI+8(%rsp)
- movq %rsi, RSI+8(%rsp)
- movq_cfi rdx, RDX+8
- movq_cfi rcx, RCX+8
- movq_cfi rax, RAX+8
- movq %r8, R8+8(%rsp)
- movq %r9, R9+8(%rsp)
- movq %r10, R10+8(%rsp)
- movq %r11, R11+8(%rsp)
- movq_cfi rbx, RBX+8
- movq %rbp, RBP+8(%rsp)
- movq %r12, R12+8(%rsp)
- movq %r13, R13+8(%rsp)
- movq %r14, R14+8(%rsp)
- movq %r15, R15+8(%rsp)
- movl $1,%ebx
- movl $MSR_GS_BASE,%ecx
- rdmsr
- testl %edx,%edx
- js 1f /* negative -> in kernel */
- SWAPGS
- xorl %ebx,%ebx
-1: ret
- CFI_ENDPROC
-END(save_paranoid)
-
/*
- * A newly forked process directly context switches into this address.
+ * 64bit SYSCALL instruction entry. Up to 6 arguments in registers.
*
- * rdi: prev task we switched from
- */
-ENTRY(ret_from_fork)
- DEFAULT_FRAME
-
- LOCK ; btr $TIF_FORK,TI_flags(%r8)
-
- pushq_cfi $0x0002
- popfq_cfi # reset kernel eflags
-
- call schedule_tail # rdi: 'prev' task parameter
-
- GET_THREAD_INFO(%rcx)
-
- RESTORE_REST
-
- testl $3, CS-ARGOFFSET(%rsp) # from kernel_thread?
- jz 1f
-
- /*
- * By the time we get here, we have no idea whether our pt_regs,
- * ti flags, and ti status came from the 64-bit SYSCALL fast path,
- * the slow path, or one of the ia32entry paths.
- * Use int_ret_from_sys_call to return, since it can safely handle
- * all of the above.
- */
- jmp int_ret_from_sys_call
-
-1:
- subq $REST_SKIP, %rsp # leave space for volatiles
- CFI_ADJUST_CFA_OFFSET REST_SKIP
- movq %rbp, %rdi
- call *%rbx
- movl $0, RAX(%rsp)
- RESTORE_REST
- jmp int_ret_from_sys_call
- CFI_ENDPROC
-END(ret_from_fork)
-
-/*
- * System call entry. Up to 6 arguments in registers are supported.
+ * 64bit SYSCALL saves rip to rcx, clears rflags.RF, then saves rflags to r11,
+ * then loads new ss, cs, and rip from previously programmed MSRs.
+ * rflags gets masked by a value from another MSR (so CLD and CLAC
+ * are not needed). SYSCALL does not save anything on the stack
+ * and does not change rsp.
*
- * SYSCALL does not save anything on the stack and does not change the
- * stack pointer. However, it does mask the flags register for us, so
- * CLD and CLAC are not needed.
- */
-
-/*
- * Register setup:
+ * Registers on entry:
* rax system call number
+ * rcx return address
+ * r11 saved rflags (note: r11 is callee-clobbered register in C ABI)
* rdi arg0
- * rcx return address for syscall/sysret, C arg3
* rsi arg1
* rdx arg2
- * r10 arg3 (--> moved to rcx for C)
+ * r10 arg3 (needs to be moved to rcx to conform to C ABI)
* r8 arg4
* r9 arg5
- * r11 eflags for syscall/sysret, temporary for C
- * r12-r15,rbp,rbx saved by C code, not touched.
+ * (note: r12-r15,rbp,rbx are callee-preserved in C ABI)
*
* Interrupts are off on entry.
* Only called from user space.
*
- * XXX if we had a free scratch register we could save the RSP into the stack frame
- * and report it properly in ps. Unfortunately we haven't.
- *
- * When user can change the frames always force IRET. That is because
+ * When user can change pt_regs->foo always force IRET. That is because
* it deals with uncanonical addresses better. SYSRET has trouble
* with them due to bugs in both AMD and Intel CPUs.
*/
@@ -335,18 +236,29 @@ ENTRY(system_call)
*/
GLOBAL(system_call_after_swapgs)
- movq %rsp,PER_CPU_VAR(old_rsp)
+ /*
+ * We use 'rsp_scratch' as a scratch register, hence this block must execute
+ * atomically in the face of possible interrupt-driven task preemption,
+ * so we can enable interrupts only after we're done with using rsp_scratch:
+ */
+ movq %rsp,PER_CPU_VAR(rsp_scratch)
+ /* kernel_stack is set so that 5 slots (iret frame) are preallocated */
movq PER_CPU_VAR(kernel_stack),%rsp
+ ALLOC_PT_GPREGS_ON_STACK 8 /* +8: space for orig_ax */
+ movq %rcx,RIP(%rsp)
+ movq PER_CPU_VAR(rsp_scratch),%rcx
+ movq %r11,EFLAGS(%rsp)
+ movq %rcx,RSP(%rsp)
/*
* No need to follow this irqs off/on section - it's straight
* and short:
*/
ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_ARGS 8, 0, rax_enosys=1
- movq_cfi rax,(ORIG_RAX-ARGOFFSET)
- movq %rcx,RIP-ARGOFFSET(%rsp)
- CFI_REL_OFFSET rip,RIP-ARGOFFSET
- testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ movq_cfi rax,ORIG_RAX
+ SAVE_C_REGS_EXCEPT_RAX_RCX_R11
+ movq $-ENOSYS,RAX(%rsp)
+ CFI_REL_OFFSET rip,RIP
+ testl $_TIF_WORK_SYSCALL_ENTRY,TI_flags+THREAD_INFO(%rsp,RIP)
jnz tracesys
system_call_fastpath:
#if __SYSCALL_MASK == ~0
@@ -358,13 +270,13 @@ system_call_fastpath:
ja ret_from_sys_call /* and return regs->ax */
movq %r10,%rcx
call *sys_call_table(,%rax,8) # XXX: rip relative
- movq %rax,RAX-ARGOFFSET(%rsp)
+ movq %rax,RAX(%rsp)
/*
* Syscall return path ending with SYSRET (fast path)
- * Has incomplete stack frame and undefined top of stack.
+ * Has incompletely filled pt_regs, iret frame is also incomplete.
*/
ret_from_sys_call:
- testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP-ARGOFFSET)
+ testl $_TIF_ALLWORK_MASK,TI_flags+THREAD_INFO(%rsp,RIP)
jnz int_ret_from_sys_call_fixup /* Go the the slow path */
LOCKDEP_SYS_EXIT
@@ -375,31 +287,38 @@ ret_from_sys_call:
* sysretq will re-enable interrupts:
*/
TRACE_IRQS_ON
- movq RIP-ARGOFFSET(%rsp),%rcx
+ RESTORE_C_REGS_EXCEPT_RCX_R11
+ movq RIP(%rsp),%rcx
CFI_REGISTER rip,rcx
- RESTORE_ARGS 1,-ARG_SKIP,0
+ movq EFLAGS(%rsp),%r11
/*CFI_REGISTER rflags,r11*/
- movq PER_CPU_VAR(old_rsp), %rsp
+ movq RSP(%rsp),%rsp
+ /*
+ * 64bit SYSRET restores rip from rcx,
+ * rflags from r11 (but RF and VM bits are forced to 0),
+ * cs and ss are loaded from MSRs.
+ */
USERGS_SYSRET64
CFI_RESTORE_STATE
int_ret_from_sys_call_fixup:
- FIXUP_TOP_OF_STACK %r11, -ARGOFFSET
+ FIXUP_TOP_OF_STACK %r11
jmp int_ret_from_sys_call
- /* Do syscall tracing */
+ /* Do syscall entry tracing */
tracesys:
- leaq -REST_SKIP(%rsp), %rdi
+ movq %rsp, %rdi
movq $AUDIT_ARCH_X86_64, %rsi
call syscall_trace_enter_phase1
test %rax, %rax
jnz tracesys_phase2 /* if needed, run the slow path */
- LOAD_ARGS 0 /* else restore clobbered regs */
+ RESTORE_C_REGS_EXCEPT_RAX /* else restore clobbered regs */
+ movq ORIG_RAX(%rsp), %rax
jmp system_call_fastpath /* and return to the fast path */
tracesys_phase2:
- SAVE_REST
+ SAVE_EXTRA_REGS
FIXUP_TOP_OF_STACK %rdi
movq %rsp, %rdi
movq $AUDIT_ARCH_X86_64, %rsi
@@ -407,12 +326,12 @@ tracesys_phase2:
call syscall_trace_enter_phase2
/*
- * Reload arg registers from stack in case ptrace changed them.
+ * Reload registers from stack in case ptrace changed them.
* We don't reload %rax because syscall_trace_entry_phase2() returned
* the value it wants us to use in the table lookup.
*/
- LOAD_ARGS ARGOFFSET, 1
- RESTORE_REST
+ RESTORE_C_REGS_EXCEPT_RAX
+ RESTORE_EXTRA_REGS
#if __SYSCALL_MASK == ~0
cmpq $__NR_syscall_max,%rax
#else
@@ -422,12 +341,12 @@ tracesys_phase2:
ja int_ret_from_sys_call /* RAX(%rsp) is already set */
movq %r10,%rcx /* fixup for C */
call *sys_call_table(,%rax,8)
- movq %rax,RAX-ARGOFFSET(%rsp)
- /* Use IRET because user could have changed frame */
+ movq %rax,RAX(%rsp)
+ /* Use IRET because user could have changed pt_regs->foo */
/*
* Syscall return path ending with IRET.
- * Has correct top of stack, but partial stack frame.
+ * Has correct iret frame.
*/
GLOBAL(int_ret_from_sys_call)
DISABLE_INTERRUPTS(CLBR_NONE)
@@ -458,12 +377,11 @@ int_careful:
TRACE_IRQS_OFF
jmp int_with_check
- /* handle signals and tracing -- both require a full stack frame */
+ /* handle signals and tracing -- both require a full pt_regs */
int_very_careful:
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
-int_check_syscall_exit_work:
- SAVE_REST
+ SAVE_EXTRA_REGS
/* Check for syscall exit trace */
testl $_TIF_WORK_SYSCALL_EXIT,%edx
jz int_signal
@@ -482,7 +400,7 @@ int_signal:
call do_notify_resume
1: movl $_TIF_WORK_MASK,%edi
int_restore_rest:
- RESTORE_REST
+ RESTORE_EXTRA_REGS
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
jmp int_with_check
@@ -492,45 +410,29 @@ END(system_call)
.macro FORK_LIKE func
ENTRY(stub_\func)
CFI_STARTPROC
- popq %r11 /* save return address */
- PARTIAL_FRAME 0
- SAVE_REST
- pushq %r11 /* put it back on stack */
+ DEFAULT_FRAME 0, 8 /* offset 8: return address */
+ SAVE_EXTRA_REGS 8
FIXUP_TOP_OF_STACK %r11, 8
- DEFAULT_FRAME 0 8 /* offset 8: return address */
call sys_\func
RESTORE_TOP_OF_STACK %r11, 8
- ret $REST_SKIP /* pop extended registers */
- CFI_ENDPROC
-END(stub_\func)
- .endm
-
- .macro FIXED_FRAME label,func
-ENTRY(\label)
- CFI_STARTPROC
- PARTIAL_FRAME 0 8 /* offset 8: return address */
- FIXUP_TOP_OF_STACK %r11, 8-ARGOFFSET
- call \func
- RESTORE_TOP_OF_STACK %r11, 8-ARGOFFSET
ret
CFI_ENDPROC
-END(\label)
+END(stub_\func)
.endm
FORK_LIKE clone
FORK_LIKE fork
FORK_LIKE vfork
- FIXED_FRAME stub_iopl, sys_iopl
ENTRY(stub_execve)
CFI_STARTPROC
addq $8, %rsp
- PARTIAL_FRAME 0
- SAVE_REST
+ DEFAULT_FRAME 0
+ SAVE_EXTRA_REGS
FIXUP_TOP_OF_STACK %r11
call sys_execve
movq %rax,RAX(%rsp)
- RESTORE_REST
+ RESTORE_EXTRA_REGS
jmp int_ret_from_sys_call
CFI_ENDPROC
END(stub_execve)
@@ -538,13 +440,13 @@ END(stub_execve)
ENTRY(stub_execveat)
CFI_STARTPROC
addq $8, %rsp
- PARTIAL_FRAME 0
- SAVE_REST
+ DEFAULT_FRAME 0
+ SAVE_EXTRA_REGS
FIXUP_TOP_OF_STACK %r11
call sys_execveat
RESTORE_TOP_OF_STACK %r11
movq %rax,RAX(%rsp)
- RESTORE_REST
+ RESTORE_EXTRA_REGS
jmp int_ret_from_sys_call
CFI_ENDPROC
END(stub_execveat)
@@ -556,12 +458,12 @@ END(stub_execveat)
ENTRY(stub_rt_sigreturn)
CFI_STARTPROC
addq $8, %rsp
- PARTIAL_FRAME 0
- SAVE_REST
+ DEFAULT_FRAME 0
+ SAVE_EXTRA_REGS
FIXUP_TOP_OF_STACK %r11
call sys_rt_sigreturn
movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
- RESTORE_REST
+ RESTORE_EXTRA_REGS
jmp int_ret_from_sys_call
CFI_ENDPROC
END(stub_rt_sigreturn)
@@ -570,12 +472,12 @@ END(stub_rt_sigreturn)
ENTRY(stub_x32_rt_sigreturn)
CFI_STARTPROC
addq $8, %rsp
- PARTIAL_FRAME 0
- SAVE_REST
+ DEFAULT_FRAME 0
+ SAVE_EXTRA_REGS
FIXUP_TOP_OF_STACK %r11
call sys32_x32_rt_sigreturn
movq %rax,RAX(%rsp) # fixme, this could be done at the higher layer
- RESTORE_REST
+ RESTORE_EXTRA_REGS
jmp int_ret_from_sys_call
CFI_ENDPROC
END(stub_x32_rt_sigreturn)
@@ -583,13 +485,13 @@ END(stub_x32_rt_sigreturn)
ENTRY(stub_x32_execve)
CFI_STARTPROC
addq $8, %rsp
- PARTIAL_FRAME 0
- SAVE_REST
+ DEFAULT_FRAME 0
+ SAVE_EXTRA_REGS
FIXUP_TOP_OF_STACK %r11
call compat_sys_execve
RESTORE_TOP_OF_STACK %r11
movq %rax,RAX(%rsp)
- RESTORE_REST
+ RESTORE_EXTRA_REGS
jmp int_ret_from_sys_call
CFI_ENDPROC
END(stub_x32_execve)
@@ -597,13 +499,13 @@ END(stub_x32_execve)
ENTRY(stub_x32_execveat)
CFI_STARTPROC
addq $8, %rsp
- PARTIAL_FRAME 0
- SAVE_REST
+ DEFAULT_FRAME 0
+ SAVE_EXTRA_REGS
FIXUP_TOP_OF_STACK %r11
call compat_sys_execveat
RESTORE_TOP_OF_STACK %r11
movq %rax,RAX(%rsp)
- RESTORE_REST
+ RESTORE_EXTRA_REGS
jmp int_ret_from_sys_call
CFI_ENDPROC
END(stub_x32_execveat)
@@ -611,6 +513,46 @@ END(stub_x32_execveat)
#endif
/*
+ * A newly forked process directly context switches into this address.
+ *
+ * rdi: prev task we switched from
+ */
+ENTRY(ret_from_fork)
+ DEFAULT_FRAME
+
+ LOCK ; btr $TIF_FORK,TI_flags(%r8)
+
+ pushq_cfi $0x0002
+ popfq_cfi # reset kernel eflags
+
+ call schedule_tail # rdi: 'prev' task parameter
+
+ GET_THREAD_INFO(%rcx)
+
+ RESTORE_EXTRA_REGS
+
+ testl $3,CS(%rsp) # from kernel_thread?
+ jz 1f
+
+ /*
+ * By the time we get here, we have no idea whether our pt_regs,
+ * ti flags, and ti status came from the 64-bit SYSCALL fast path,
+ * the slow path, or one of the ia32entry paths.
+ * Use int_ret_from_sys_call to return, since it can safely handle
+ * all of the above.
+ */
+ jmp int_ret_from_sys_call
+
+1:
+ movq %rbp, %rdi
+ call *%rbx
+ movl $0, RAX(%rsp)
+ RESTORE_EXTRA_REGS
+ jmp int_ret_from_sys_call
+ CFI_ENDPROC
+END(ret_from_fork)
+
+/*
* Build the entry stubs and pointer table with some assembler magic.
* We pack 7 stubs into a single 32-byte chunk, which will fit in a
* single cache line on all modern x86 implementations.
@@ -659,47 +601,45 @@ END(interrupt)
/* 0(%rsp): ~(interrupt number) */
.macro interrupt func
- /* reserve pt_regs for scratch regs and rbp */
- subq $ORIG_RAX-RBP, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-RBP
cld
- /* start from rbp in pt_regs and jump over */
- movq_cfi rdi, (RDI-RBP)
- movq_cfi rsi, (RSI-RBP)
- movq_cfi rdx, (RDX-RBP)
- movq_cfi rcx, (RCX-RBP)
- movq_cfi rax, (RAX-RBP)
- movq_cfi r8, (R8-RBP)
- movq_cfi r9, (R9-RBP)
- movq_cfi r10, (R10-RBP)
- movq_cfi r11, (R11-RBP)
-
- /* Save rbp so that we can unwind from get_irq_regs() */
- movq_cfi rbp, 0
-
- /* Save previous stack value */
- movq %rsp, %rsi
+ /*
+ * Since nothing in interrupt handling code touches r12...r15 members
+ * of "struct pt_regs", and since interrupts can nest, we can save
+ * four stack slots and simultaneously provide
+ * an unwind-friendly stack layout by saving "truncated" pt_regs
+ * exactly up to rbp slot, without these members.
+ */
+ ALLOC_PT_GPREGS_ON_STACK -RBP
+ SAVE_C_REGS -RBP
+ /* this goes to 0(%rsp) for unwinder, not for saving the value: */
+ SAVE_EXTRA_REGS_RBP -RBP
+
+ leaq -RBP(%rsp),%rdi /* arg1 for \func (pointer to pt_regs) */
- leaq -RBP(%rsp),%rdi /* arg1 for handler */
- testl $3, CS-RBP(%rsi)
+ testl $3, CS-RBP(%rsp)
je 1f
SWAPGS
+1:
/*
+ * Save previous stack pointer, optionally switch to interrupt stack.
* irq_count is used to check if a CPU is already on an interrupt stack
* or not. While this is essentially redundant with preempt_count it is
* a little cheaper to use a separate counter in the PDA (short of
* moving irq_enter into assembly, which would be too much work)
*/
-1: incl PER_CPU_VAR(irq_count)
+ movq %rsp, %rsi
+ incl PER_CPU_VAR(irq_count)
cmovzq PER_CPU_VAR(irq_stack_ptr),%rsp
CFI_DEF_CFA_REGISTER rsi
-
- /* Store previous stack value */
pushq %rsi
+ /*
+ * For debugger:
+ * "CFA (Current Frame Address) is the value on stack + offset"
+ */
CFI_ESCAPE 0x0f /* DW_CFA_def_cfa_expression */, 6, \
- 0x77 /* DW_OP_breg7 */, 0, \
+ 0x77 /* DW_OP_breg7 (rsp) */, 0, \
0x06 /* DW_OP_deref */, \
- 0x08 /* DW_OP_const1u */, SS+8-RBP, \
+ 0x08 /* DW_OP_const1u */, SIZEOF_PTREGS-RBP, \
0x22 /* DW_OP_plus */
/* We entered an interrupt context - irqs are off: */
TRACE_IRQS_OFF
@@ -717,7 +657,7 @@ common_interrupt:
ASM_CLAC
addq $-0x80,(%rsp) /* Adjust vector to [-256,-1] range */
interrupt do_IRQ
- /* 0(%rsp): old_rsp-ARGOFFSET */
+ /* 0(%rsp): rsp_scratch */
ret_from_intr:
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
@@ -725,19 +665,20 @@ ret_from_intr:
/* Restore saved previous stack */
popq %rsi
- CFI_DEF_CFA rsi,SS+8-RBP /* reg/off reset after def_cfa_expr */
- leaq ARGOFFSET-RBP(%rsi), %rsp
+ CFI_DEF_CFA rsi,SIZEOF_PTREGS-RBP /* reg/off reset after def_cfa_expr */
+ /* return code expects complete pt_regs - adjust rsp accordingly: */
+ leaq -RBP(%rsi),%rsp
CFI_DEF_CFA_REGISTER rsp
- CFI_ADJUST_CFA_OFFSET RBP-ARGOFFSET
+ CFI_ADJUST_CFA_OFFSET RBP
exit_intr:
GET_THREAD_INFO(%rcx)
- testl $3,CS-ARGOFFSET(%rsp)
+ testl $3,CS(%rsp)
je retint_kernel
/* Interrupt came from user space */
/*
- * Has a correct top of stack, but a partial stack frame
+ * Has a correct top of stack.
* %rcx: thread info. Interrupts off.
*/
retint_with_reschedule:
@@ -760,8 +701,8 @@ retint_swapgs: /* return to user-space */
* Try to use SYSRET instead of IRET if we're returning to
* a completely clean 64-bit userspace context.
*/
- movq (RCX-R11)(%rsp), %rcx
- cmpq %rcx,(RIP-R11)(%rsp) /* RCX == RIP */
+ movq RCX(%rsp),%rcx
+ cmpq %rcx,RIP(%rsp) /* RCX == RIP */
jne opportunistic_sysret_failed
/*
@@ -782,19 +723,19 @@ retint_swapgs: /* return to user-space */
shr $__VIRTUAL_MASK_SHIFT, %rcx
jnz opportunistic_sysret_failed
- cmpq $__USER_CS,(CS-R11)(%rsp) /* CS must match SYSRET */
+ cmpq $__USER_CS,CS(%rsp) /* CS must match SYSRET */
jne opportunistic_sysret_failed
- movq (R11-ARGOFFSET)(%rsp), %r11
- cmpq %r11,(EFLAGS-ARGOFFSET)(%rsp) /* R11 == RFLAGS */
+ movq R11(%rsp),%r11
+ cmpq %r11,EFLAGS(%rsp) /* R11 == RFLAGS */
jne opportunistic_sysret_failed
- testq $X86_EFLAGS_RF,%r11 /* sysret can't restore RF */
+ testq $X86_EFLAGS_RF,%r11 /* sysret can't restore RF */
jnz opportunistic_sysret_failed
/* nothing to check for RSP */
- cmpq $__USER_DS,(SS-ARGOFFSET)(%rsp) /* SS must match SYSRET */
+ cmpq $__USER_DS,SS(%rsp) /* SS must match SYSRET */
jne opportunistic_sysret_failed
/*
@@ -803,8 +744,9 @@ retint_swapgs: /* return to user-space */
*/
irq_return_via_sysret:
CFI_REMEMBER_STATE
- RESTORE_ARGS 1,8,1
- movq (RSP-RIP)(%rsp),%rsp
+ /* r11 is already restored (see code above) */
+ RESTORE_C_REGS_EXCEPT_R11
+ movq RSP(%rsp),%rsp
USERGS_SYSRET64
CFI_RESTORE_STATE
@@ -819,7 +761,8 @@ retint_restore_args: /* return to kernel space */
*/
TRACE_IRQS_IRETQ
restore_args:
- RESTORE_ARGS 1,8,1
+ RESTORE_C_REGS
+ REMOVE_PT_GPREGS_FROM_STACK 8
irq_return:
INTERRUPT_RETURN
@@ -890,12 +833,12 @@ retint_signal:
jz retint_swapgs
TRACE_IRQS_ON
ENABLE_INTERRUPTS(CLBR_NONE)
- SAVE_REST
+ SAVE_EXTRA_REGS
movq $-1,ORIG_RAX(%rsp)
xorl %esi,%esi # oldset
movq %rsp,%rdi # &pt_regs
call do_notify_resume
- RESTORE_REST
+ RESTORE_EXTRA_REGS
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
GET_THREAD_INFO(%rcx)
@@ -907,7 +850,7 @@ retint_signal:
ENTRY(retint_kernel)
cmpl $0,PER_CPU_VAR(__preempt_count)
jnz retint_restore_args
- bt $9,EFLAGS-ARGOFFSET(%rsp) /* interrupts off? */
+ bt $9,EFLAGS(%rsp) /* interrupts off? */
jnc retint_restore_args
call preempt_schedule_irq
jmp exit_intr
@@ -1000,7 +943,7 @@ apicinterrupt IRQ_WORK_VECTOR \
/*
* Exception entry points.
*/
-#define INIT_TSS_IST(x) PER_CPU_VAR(init_tss) + (TSS_ist + ((x) - 1) * 8)
+#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8)
.macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1
ENTRY(\sym)
@@ -1022,8 +965,7 @@ ENTRY(\sym)
pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
.endif
- subq $ORIG_RAX-R15, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
+ ALLOC_PT_GPREGS_ON_STACK
.if \paranoid
.if \paranoid == 1
@@ -1031,10 +973,11 @@ ENTRY(\sym)
testl $3, CS(%rsp) /* If coming from userspace, switch */
jnz 1f /* stacks. */
.endif
- call save_paranoid
+ call paranoid_entry
.else
call error_entry
.endif
+ /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
DEFAULT_FRAME 0
@@ -1056,19 +999,20 @@ ENTRY(\sym)
.endif
.if \shift_ist != -1
- subq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist)
+ subq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
.endif
call \do_sym
.if \shift_ist != -1
- addq $EXCEPTION_STKSZ, INIT_TSS_IST(\shift_ist)
+ addq $EXCEPTION_STKSZ, CPU_TSS_IST(\shift_ist)
.endif
+ /* these procedures expect "no swapgs" flag in ebx */
.if \paranoid
- jmp paranoid_exit /* %ebx: no swapgs flag */
+ jmp paranoid_exit
.else
- jmp error_exit /* %ebx: no swapgs flag */
+ jmp error_exit
.endif
.if \paranoid == 1
@@ -1272,7 +1216,9 @@ ENTRY(xen_failsafe_callback)
addq $0x30,%rsp
CFI_ADJUST_CFA_OFFSET -0x30
pushq_cfi $-1 /* orig_ax = -1 => not a system call */
- SAVE_ALL
+ ALLOC_PT_GPREGS_ON_STACK
+ SAVE_C_REGS
+ SAVE_EXTRA_REGS
jmp error_exit
CFI_ENDPROC
END(xen_failsafe_callback)
@@ -1304,59 +1250,66 @@ idtentry async_page_fault do_async_page_fault has_error_code=1
idtentry machine_check has_error_code=0 paranoid=1 do_sym=*machine_check_vector(%rip)
#endif
- /*
- * "Paranoid" exit path from exception stack. This is invoked
- * only on return from non-NMI IST interrupts that came
- * from kernel space.
- *
- * We may be returning to very strange contexts (e.g. very early
- * in syscall entry), so checking for preemption here would
- * be complicated. Fortunately, we there's no good reason
- * to try to handle preemption here.
- */
+/*
+ * Save all registers in pt_regs, and switch gs if needed.
+ * Use slow, but surefire "are we in kernel?" check.
+ * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
+ */
+ENTRY(paranoid_entry)
+ XCPT_FRAME 1 15*8
+ cld
+ SAVE_C_REGS 8
+ SAVE_EXTRA_REGS 8
+ movl $1,%ebx
+ movl $MSR_GS_BASE,%ecx
+ rdmsr
+ testl %edx,%edx
+ js 1f /* negative -> in kernel */
+ SWAPGS
+ xorl %ebx,%ebx
+1: ret
+ CFI_ENDPROC
+END(paranoid_entry)
- /* ebx: no swapgs flag */
+/*
+ * "Paranoid" exit path from exception stack. This is invoked
+ * only on return from non-NMI IST interrupts that came
+ * from kernel space.
+ *
+ * We may be returning to very strange contexts (e.g. very early
+ * in syscall entry), so checking for preemption here would
+ * be complicated. Fortunately, we there's no good reason
+ * to try to handle preemption here.
+ */
+/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
ENTRY(paranoid_exit)
DEFAULT_FRAME
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF_DEBUG
testl %ebx,%ebx /* swapgs needed? */
- jnz paranoid_restore
- TRACE_IRQS_IRETQ 0
+ jnz paranoid_exit_no_swapgs
+ TRACE_IRQS_IRETQ
SWAPGS_UNSAFE_STACK
- RESTORE_ALL 8
- INTERRUPT_RETURN
-paranoid_restore:
- TRACE_IRQS_IRETQ_DEBUG 0
- RESTORE_ALL 8
+ jmp paranoid_exit_restore
+paranoid_exit_no_swapgs:
+ TRACE_IRQS_IRETQ_DEBUG
+paranoid_exit_restore:
+ RESTORE_EXTRA_REGS
+ RESTORE_C_REGS
+ REMOVE_PT_GPREGS_FROM_STACK 8
INTERRUPT_RETURN
CFI_ENDPROC
END(paranoid_exit)
/*
- * Exception entry point. This expects an error code/orig_rax on the stack.
- * returns in "no swapgs flag" in %ebx.
+ * Save all registers in pt_regs, and switch gs if needed.
+ * Return: ebx=0: need swapgs on exit, ebx=1: otherwise
*/
ENTRY(error_entry)
- XCPT_FRAME
- CFI_ADJUST_CFA_OFFSET 15*8
- /* oldrax contains error code */
+ XCPT_FRAME 1 15*8
cld
- movq %rdi, RDI+8(%rsp)
- movq %rsi, RSI+8(%rsp)
- movq %rdx, RDX+8(%rsp)
- movq %rcx, RCX+8(%rsp)
- movq %rax, RAX+8(%rsp)
- movq %r8, R8+8(%rsp)
- movq %r9, R9+8(%rsp)
- movq %r10, R10+8(%rsp)
- movq %r11, R11+8(%rsp)
- movq_cfi rbx, RBX+8
- movq %rbp, RBP+8(%rsp)
- movq %r12, R12+8(%rsp)
- movq %r13, R13+8(%rsp)
- movq %r14, R14+8(%rsp)
- movq %r15, R15+8(%rsp)
+ SAVE_C_REGS 8
+ SAVE_EXTRA_REGS 8
xorl %ebx,%ebx
testl $3,CS+8(%rsp)
je error_kernelspace
@@ -1366,12 +1319,12 @@ error_sti:
TRACE_IRQS_OFF
ret
-/*
- * There are two places in the kernel that can potentially fault with
- * usergs. Handle them here. B stepping K8s sometimes report a
- * truncated RIP for IRET exceptions returning to compat mode. Check
- * for these here too.
- */
+ /*
+ * There are two places in the kernel that can potentially fault with
+ * usergs. Handle them here. B stepping K8s sometimes report a
+ * truncated RIP for IRET exceptions returning to compat mode. Check
+ * for these here too.
+ */
error_kernelspace:
CFI_REL_OFFSET rcx, RCX+8
incl %ebx
@@ -1401,11 +1354,11 @@ error_bad_iret:
END(error_entry)
-/* ebx: no swapgs flag (1: don't need swapgs, 0: need it) */
+/* On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */
ENTRY(error_exit)
DEFAULT_FRAME
movl %ebx,%eax
- RESTORE_REST
+ RESTORE_EXTRA_REGS
DISABLE_INTERRUPTS(CLBR_NONE)
TRACE_IRQS_OFF
GET_THREAD_INFO(%rcx)
@@ -1587,7 +1540,7 @@ first_nmi:
.rept 5
pushq_cfi 11*8(%rsp)
.endr
- CFI_DEF_CFA_OFFSET SS+8-RIP
+ CFI_DEF_CFA_OFFSET 5*8
/* Everything up to here is safe from nested NMIs */
@@ -1615,7 +1568,7 @@ repeat_nmi:
pushq_cfi -6*8(%rsp)
.endr
subq $(5*8), %rsp
- CFI_DEF_CFA_OFFSET SS+8-RIP
+ CFI_DEF_CFA_OFFSET 5*8
end_repeat_nmi:
/*
@@ -1624,16 +1577,16 @@ end_repeat_nmi:
* so that we repeat another NMI.
*/
pushq_cfi $-1 /* ORIG_RAX: no syscall to restart */
- subq $ORIG_RAX-R15, %rsp
- CFI_ADJUST_CFA_OFFSET ORIG_RAX-R15
+ ALLOC_PT_GPREGS_ON_STACK
+
/*
- * Use save_paranoid to handle SWAPGS, but no need to use paranoid_exit
+ * Use paranoid_entry to handle SWAPGS, but no need to use paranoid_exit
* as we should not be calling schedule in NMI context.
* Even with normal interrupts enabled. An NMI should not be
* setting NEED_RESCHED or anything that normal interrupts and
* exceptions might do.
*/
- call save_paranoid
+ call paranoid_entry
DEFAULT_FRAME 0
/*
@@ -1664,8 +1617,10 @@ end_repeat_nmi:
nmi_swapgs:
SWAPGS_UNSAFE_STACK
nmi_restore:
+ RESTORE_EXTRA_REGS
+ RESTORE_C_REGS
/* Pop the extra iret frame at once */
- RESTORE_ALL 6*8
+ REMOVE_PT_GPREGS_FROM_STACK 6*8
/* Clear the NMI executing stack variable */
movq $0, 5*8(%rsp)
diff --git a/arch/x86/kernel/head_32.S b/arch/x86/kernel/head_32.S
index f36bd42d6f0c..d031bad9e07e 100644
--- a/arch/x86/kernel/head_32.S
+++ b/arch/x86/kernel/head_32.S
@@ -22,6 +22,7 @@
#include <asm/cpufeature.h>
#include <asm/percpu.h>
#include <asm/nops.h>
+#include <asm/bootparam.h>
/* Physical address */
#define pa(X) ((X) - __PAGE_OFFSET)
@@ -90,7 +91,7 @@ ENTRY(startup_32)
/* test KEEP_SEGMENTS flag to see if the bootloader is asking
us to not reload segments */
- testb $(1<<6), BP_loadflags(%esi)
+ testb $KEEP_SEGMENTS, BP_loadflags(%esi)
jnz 2f
/*
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index 6fd514d9f69a..ae6588b301c2 100644
--- a/arch/x86/kernel/head_64.S
+++ b/arch/x86/kernel/head_64.S
@@ -1,5 +1,5 @@
/*
- * linux/arch/x86_64/kernel/head.S -- start in 32bit and switch to 64bit
+ * linux/arch/x86/kernel/head_64.S -- start in 32bit and switch to 64bit
*
* Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
* Copyright (C) 2000 Pavel Machek <pavel@suse.cz>
@@ -56,7 +56,7 @@ startup_64:
* %rsi holds a physical pointer to real_mode_data.
*
* We come here either directly from a 64bit bootloader, or from
- * arch/x86_64/boot/compressed/head.S.
+ * arch/x86/boot/compressed/head_64.S.
*
* We only come here initially at boot nothing else comes here.
*
@@ -146,7 +146,7 @@ startup_64:
leaq level2_kernel_pgt(%rip), %rdi
leaq 4096(%rdi), %r8
/* See if it is a valid page table entry */
-1: testq $1, 0(%rdi)
+1: testb $1, 0(%rdi)
jz 2f
addq %rbp, 0(%rdi)
/* Go to the next page */
diff --git a/arch/x86/kernel/ioport.c b/arch/x86/kernel/ioport.c
index 4ddaf66ea35f..37dae792dbbe 100644
--- a/arch/x86/kernel/ioport.c
+++ b/arch/x86/kernel/ioport.c
@@ -54,7 +54,7 @@ asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int turn_on)
* because the ->io_bitmap_max value must match the bitmap
* contents:
*/
- tss = &per_cpu(init_tss, get_cpu());
+ tss = &per_cpu(cpu_tss, get_cpu());
if (turn_on)
bitmap_clear(t->io_bitmap_ptr, from, num);
diff --git a/arch/x86/kernel/perf_regs.c b/arch/x86/kernel/perf_regs.c
index 781861cc5ee8..02a8720414c0 100644
--- a/arch/x86/kernel/perf_regs.c
+++ b/arch/x86/kernel/perf_regs.c
@@ -177,7 +177,7 @@ void perf_get_regs_user(struct perf_regs *regs_user,
* than just blindly copying user_regs.
*/
regs_user->abi = PERF_SAMPLE_REGS_ABI_64;
- regs_user_copy->sp = this_cpu_read(old_rsp);
+ regs_user_copy->sp = user_regs->sp;
regs_user_copy->cs = __USER_CS;
regs_user_copy->ss = __USER_DS;
regs_user_copy->cx = -1; /* usually contains garbage */
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 046e2d620bbe..12b1cf606ddf 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -37,7 +37,26 @@
* section. Since TSS's are completely CPU-local, we want them
* on exact cacheline boundaries, to eliminate cacheline ping-pong.
*/
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, init_tss) = INIT_TSS;
+__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = {
+ .x86_tss = {
+ .sp0 = TOP_OF_INIT_STACK,
+#ifdef CONFIG_X86_32
+ .ss0 = __KERNEL_DS,
+ .ss1 = __KERNEL_CS,
+ .io_bitmap_base = INVALID_IO_BITMAP_OFFSET,
+#endif
+ },
+#ifdef CONFIG_X86_32
+ /*
+ * Note that the .io_bitmap member must be extra-big. This is because
+ * the CPU will access an additional byte beyond the end of the IO
+ * permission bitmap. The extra byte must be all 1 bits, and must
+ * be within the limit.
+ */
+ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 },
+#endif
+};
+EXPORT_PER_CPU_SYMBOL_GPL(cpu_tss);
#ifdef CONFIG_X86_64
static DEFINE_PER_CPU(unsigned char, is_idle);
@@ -109,7 +128,7 @@ void exit_thread(void)
unsigned long *bp = t->io_bitmap_ptr;
if (bp) {
- struct tss_struct *tss = &per_cpu(init_tss, get_cpu());
+ struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu());
t->io_bitmap_ptr = NULL;
clear_thread_flag(TIF_IO_BITMAP);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index 603c4f99cb5a..26c596d1ee07 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -206,11 +206,7 @@ start_thread(struct pt_regs *regs, unsigned long new_ip, unsigned long new_sp)
regs->ip = new_ip;
regs->sp = new_sp;
regs->flags = X86_EFLAGS_IF;
- /*
- * force it to the iret return path by making it look as if there was
- * some work pending.
- */
- set_thread_flag(TIF_NOTIFY_RESUME);
+ force_iret();
}
EXPORT_SYMBOL_GPL(start_thread);
@@ -248,7 +244,7 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct thread_struct *prev = &prev_p->thread,
*next = &next_p->thread;
int cpu = smp_processor_id();
- struct tss_struct *tss = &per_cpu(init_tss, cpu);
+ struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
fpu_switch_t fpu;
/* never put a printk in __switch_to... printk() calls wake_up*() indirectly */
@@ -256,11 +252,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
fpu = switch_fpu_prepare(prev_p, next_p, cpu);
/*
- * Reload esp0.
- */
- load_sp0(tss, next);
-
- /*
* Save away %gs. No need to save %fs, as it was saved on the
* stack on entry. No need to save %es and %ds, as those are
* always kernel segments while inside the kernel. Doing this
@@ -310,9 +301,17 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
*/
arch_end_context_switch(next_p);
+ /*
+ * Reload esp0, kernel_stack, and current_top_of_stack. This changes
+ * current_thread_info().
+ */
+ load_sp0(tss, next);
this_cpu_write(kernel_stack,
- (unsigned long)task_stack_page(next_p) +
- THREAD_SIZE - KERNEL_STACK_OFFSET);
+ (unsigned long)task_stack_page(next_p) +
+ THREAD_SIZE - KERNEL_STACK_OFFSET);
+ this_cpu_write(cpu_current_top_of_stack,
+ (unsigned long)task_stack_page(next_p) +
+ THREAD_SIZE);
/*
* Restore %gs if needed (which is common)
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index 67fcc43577d2..da8b74598d90 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -52,7 +52,7 @@
asmlinkage extern void ret_from_fork(void);
-__visible DEFINE_PER_CPU(unsigned long, old_rsp);
+__visible DEFINE_PER_CPU(unsigned long, rsp_scratch);
/* Prints also some state that isn't saved in the pt_regs */
void __show_regs(struct pt_regs *regs, int all)
@@ -161,7 +161,6 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE;
childregs = task_pt_regs(p);
p->thread.sp = (unsigned long) childregs;
- p->thread.usersp = me->thread.usersp;
set_tsk_thread_flag(p, TIF_FORK);
p->thread.io_bitmap_ptr = NULL;
@@ -207,7 +206,7 @@ int copy_thread(unsigned long clone_flags, unsigned long sp,
*/
if (clone_flags & CLONE_SETTLS) {
#ifdef CONFIG_IA32_EMULATION
- if (test_thread_flag(TIF_IA32))
+ if (is_ia32_task())
err = do_set_thread_area(p, -1,
(struct user_desc __user *)childregs->si, 0);
else
@@ -235,13 +234,12 @@ start_thread_common(struct pt_regs *regs, unsigned long new_ip,
loadsegment(es, _ds);
loadsegment(ds, _ds);
load_gs_index(0);
- current->thread.usersp = new_sp;
regs->ip = new_ip;
regs->sp = new_sp;
- this_cpu_write(old_rsp, new_sp);
regs->cs = _cs;
regs->ss = _ss;
regs->flags = X86_EFLAGS_IF;
+ force_iret();
}
void
@@ -277,15 +275,12 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
struct thread_struct *prev = &prev_p->thread;
struct thread_struct *next = &next_p->thread;
int cpu = smp_processor_id();
- struct tss_struct *tss = &per_cpu(init_tss, cpu);
+ struct tss_struct *tss = &per_cpu(cpu_tss, cpu);
unsigned fsindex, gsindex;
fpu_switch_t fpu;
fpu = switch_fpu_prepare(prev_p, next_p, cpu);
- /* Reload esp0 and ss1. */
- load_sp0(tss, next);
-
/* We must save %fs and %gs before load_TLS() because
* %fs and %gs may be cleared by load_TLS().
*
@@ -401,8 +396,6 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
/*
* Switch the PDA and FPU contexts.
*/
- prev->usersp = this_cpu_read(old_rsp);
- this_cpu_write(old_rsp, next->usersp);
this_cpu_write(current_task, next_p);
/*
@@ -413,6 +406,9 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
task_thread_info(prev_p)->saved_preempt_count = this_cpu_read(__preempt_count);
this_cpu_write(__preempt_count, task_thread_info(next_p)->saved_preempt_count);
+ /* Reload esp0 and ss1. This changes current_thread_info(). */
+ load_sp0(tss, next);
+
this_cpu_write(kernel_stack,
(unsigned long)task_stack_page(next_p) +
THREAD_SIZE - KERNEL_STACK_OFFSET);
@@ -602,6 +598,5 @@ long sys_arch_prctl(int code, unsigned long addr)
unsigned long KSTK_ESP(struct task_struct *task)
{
- return (test_tsk_thread_flag(task, TIF_IA32)) ?
- (task_pt_regs(task)->sp) : ((task)->thread.usersp);
+ return task_pt_regs(task)->sp;
}
diff --git a/arch/x86/kernel/ptrace.c b/arch/x86/kernel/ptrace.c
index e510618b2e91..1e125817cf9f 100644
--- a/arch/x86/kernel/ptrace.c
+++ b/arch/x86/kernel/ptrace.c
@@ -364,18 +364,12 @@ static int set_segment_reg(struct task_struct *task,
case offsetof(struct user_regs_struct,cs):
if (unlikely(value == 0))
return -EIO;
-#ifdef CONFIG_IA32_EMULATION
- if (test_tsk_thread_flag(task, TIF_IA32))
- task_pt_regs(task)->cs = value;
-#endif
+ task_pt_regs(task)->cs = value;
break;
case offsetof(struct user_regs_struct,ss):
if (unlikely(value == 0))
return -EIO;
-#ifdef CONFIG_IA32_EMULATION
- if (test_tsk_thread_flag(task, TIF_IA32))
- task_pt_regs(task)->ss = value;
-#endif
+ task_pt_regs(task)->ss = value;
break;
}
diff --git a/arch/x86/kernel/relocate_kernel_32.S b/arch/x86/kernel/relocate_kernel_32.S
index e13f8e7c22a6..77630d57e7bf 100644
--- a/arch/x86/kernel/relocate_kernel_32.S
+++ b/arch/x86/kernel/relocate_kernel_32.S
@@ -226,23 +226,23 @@ swap_pages:
movl (%ebx), %ecx
addl $4, %ebx
1:
- testl $0x1, %ecx /* is it a destination page */
+ testb $0x1, %cl /* is it a destination page */
jz 2f
movl %ecx, %edi
andl $0xfffff000, %edi
jmp 0b
2:
- testl $0x2, %ecx /* is it an indirection page */
+ testb $0x2, %cl /* is it an indirection page */
jz 2f
movl %ecx, %ebx
andl $0xfffff000, %ebx
jmp 0b
2:
- testl $0x4, %ecx /* is it the done indicator */
+ testb $0x4, %cl /* is it the done indicator */
jz 2f
jmp 3f
2:
- testl $0x8, %ecx /* is it the source indicator */
+ testb $0x8, %cl /* is it the source indicator */
jz 0b /* Ignore it otherwise */
movl %ecx, %esi /* For every source page do a copy */
andl $0xfffff000, %esi
diff --git a/arch/x86/kernel/relocate_kernel_64.S b/arch/x86/kernel/relocate_kernel_64.S
index 3fd2c693e475..04cb1790a596 100644
--- a/arch/x86/kernel/relocate_kernel_64.S
+++ b/arch/x86/kernel/relocate_kernel_64.S
@@ -221,23 +221,23 @@ swap_pages:
movq (%rbx), %rcx
addq $8, %rbx
1:
- testq $0x1, %rcx /* is it a destination page? */
+ testb $0x1, %cl /* is it a destination page? */
jz 2f
movq %rcx, %rdi
andq $0xfffffffffffff000, %rdi
jmp 0b
2:
- testq $0x2, %rcx /* is it an indirection page? */
+ testb $0x2, %cl /* is it an indirection page? */
jz 2f
movq %rcx, %rbx
andq $0xfffffffffffff000, %rbx
jmp 0b
2:
- testq $0x4, %rcx /* is it the done indicator? */
+ testb $0x4, %cl /* is it the done indicator? */
jz 2f
jmp 3f
2:
- testq $0x8, %rcx /* is it the source indicator? */
+ testb $0x8, %cl /* is it the source indicator? */
jz 0b /* Ignore it otherwise */
movq %rcx, %rsi /* For ever source page do a copy */
andq $0xfffffffffffff000, %rsi
diff --git a/arch/x86/kernel/signal.c b/arch/x86/kernel/signal.c
index e5042463c1bc..eaa2c5e3f2cd 100644
--- a/arch/x86/kernel/signal.c
+++ b/arch/x86/kernel/signal.c
@@ -94,15 +94,8 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
COPY(r15);
#endif /* CONFIG_X86_64 */
-#ifdef CONFIG_X86_32
COPY_SEG_CPL3(cs);
COPY_SEG_CPL3(ss);
-#else /* !CONFIG_X86_32 */
- /* Kernel saves and restores only the CS segment register on signals,
- * which is the bare minimum needed to allow mixed 32/64-bit code.
- * App's signal handler can save/restore other segments if needed. */
- COPY_SEG_CPL3(cs);
-#endif /* CONFIG_X86_32 */
get_user_ex(tmpflags, &sc->flags);
regs->flags = (regs->flags & ~FIX_EFLAGS) | (tmpflags & FIX_EFLAGS);
@@ -115,6 +108,8 @@ int restore_sigcontext(struct pt_regs *regs, struct sigcontext __user *sc,
err |= restore_xstate_sig(buf, config_enabled(CONFIG_X86_32));
+ force_iret();
+
return err;
}
@@ -162,8 +157,9 @@ int setup_sigcontext(struct sigcontext __user *sc, void __user *fpstate,
#else /* !CONFIG_X86_32 */
put_user_ex(regs->flags, &sc->flags);
put_user_ex(regs->cs, &sc->cs);
- put_user_ex(0, &sc->gs);
- put_user_ex(0, &sc->fs);
+ put_user_ex(0, &sc->__pad2);
+ put_user_ex(0, &sc->__pad1);
+ put_user_ex(regs->ss, &sc->ss);
#endif /* CONFIG_X86_32 */
put_user_ex(fpstate, &sc->fpstate);
@@ -457,9 +453,19 @@ static int __setup_rt_frame(int sig, struct ksignal *ksig,
regs->sp = (unsigned long)frame;
- /* Set up the CS register to run signal handlers in 64-bit mode,
- even if the handler happens to be interrupting 32-bit code. */
+ /*
+ * Set up the CS and SS registers to run signal handlers in
+ * 64-bit mode, even if the handler happens to be interrupting
+ * 32-bit or 16-bit code.
+ *
+ * SS is subtle. In 64-bit mode, we don't need any particular
+ * SS descriptor, but we do need SS to be valid. It's possible
+ * that the old SS is entirely bogus -- this can happen if the
+ * signal we're trying to deliver is #GP or #SS caused by a bad
+ * SS value.
+ */
regs->cs = __USER_CS;
+ regs->ss = __USER_DS;
return 0;
}
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index febc6aabc72e..759388c538cf 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -806,6 +806,8 @@ static int do_boot_cpu(int apicid, int cpu, struct task_struct *idle)
#ifdef CONFIG_X86_32
/* Stack for startup_32 can be just as for start_secondary onwards */
irq_ctx_init(cpu);
+ per_cpu(cpu_current_top_of_stack, cpu) =
+ (unsigned long)task_stack_page(idle) + THREAD_SIZE;
#else
clear_tsk_thread_flag(idle, TIF_FORK);
initial_gs = per_cpu_offset(cpu);
diff --git a/arch/x86/kernel/syscall_32.c b/arch/x86/kernel/syscall_32.c
index e9bcd57d8a9e..3777189c4a19 100644
--- a/arch/x86/kernel/syscall_32.c
+++ b/arch/x86/kernel/syscall_32.c
@@ -5,21 +5,29 @@
#include <linux/cache.h>
#include <asm/asm-offsets.h>
-#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void sym(void) ;
+#ifdef CONFIG_IA32_EMULATION
+#define SYM(sym, compat) compat
+#else
+#define SYM(sym, compat) sym
+#define ia32_sys_call_table sys_call_table
+#define __NR_ia32_syscall_max __NR_syscall_max
+#endif
+
+#define __SYSCALL_I386(nr, sym, compat) extern asmlinkage void SYM(sym, compat)(void) ;
#include <asm/syscalls_32.h>
#undef __SYSCALL_I386
-#define __SYSCALL_I386(nr, sym, compat) [nr] = sym,
+#define __SYSCALL_I386(nr, sym, compat) [nr] = SYM(sym, compat),
typedef asmlinkage void (*sys_call_ptr_t)(void);
extern asmlinkage void sys_ni_syscall(void);
-__visible const sys_call_ptr_t sys_call_table[__NR_syscall_max+1] = {
+__visible const sys_call_ptr_t ia32_sys_call_table[__NR_ia32_syscall_max+1] = {
/*
* Smells like a compiler bug -- it doesn't work
* when the & below is removed.
*/
- [0 ... __NR_syscall_max] = &sys_ni_syscall,
+ [0 ... __NR_ia32_syscall_max] = &sys_ni_syscall,
#include <asm/syscalls_32.h>
};
diff --git a/arch/x86/kernel/traps.c b/arch/x86/kernel/traps.c
index 4ff5d162ff9f..277341128b47 100644
--- a/arch/x86/kernel/traps.c
+++ b/arch/x86/kernel/traps.c
@@ -174,8 +174,8 @@ void ist_begin_non_atomic(struct pt_regs *regs)
* will catch asm bugs and any attempt to use ist_preempt_enable
* from double_fault.
*/
- BUG_ON(((current_stack_pointer() ^ this_cpu_read_stable(kernel_stack))
- & ~(THREAD_SIZE - 1)) != 0);
+ BUG_ON((unsigned long)(current_top_of_stack() -
+ current_stack_pointer()) >= THREAD_SIZE);
preempt_count_sub(HARDIRQ_OFFSET);
}
@@ -925,9 +925,21 @@ dotraplinkage void do_iret_error(struct pt_regs *regs, long error_code)
/* Set of traps needed for early debugging. */
void __init early_trap_init(void)
{
- set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
+ /*
+ * Don't use IST to set DEBUG_STACK as it doesn't work until TSS
+ * is ready in cpu_init() <-- trap_init(). Before trap_init(),
+ * CPU runs at ring 0 so it is impossible to hit an invalid
+ * stack. Using the original stack works well enough at this
+ * early stage. DEBUG_STACK will be equipped after cpu_init() in
+ * trap_init().
+ *
+ * We don't need to set trace_idt_table like set_intr_gate(),
+ * since we don't have trace_debug and it will be reset to
+ * 'debug' in trap_init() by set_intr_gate_ist().
+ */
+ set_intr_gate_notrace(X86_TRAP_DB, debug);
/* int3 can be called from all */
- set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
+ set_system_intr_gate(X86_TRAP_BP, &int3);
#ifdef CONFIG_X86_32
set_intr_gate(X86_TRAP_PF, page_fault);
#endif
@@ -1005,6 +1017,15 @@ void __init trap_init(void)
*/
cpu_init();
+ /*
+ * X86_TRAP_DB and X86_TRAP_BP have been set
+ * in early_trap_init(). However, ITS works only after
+ * cpu_init() loads TSS. See comments in early_trap_init().
+ */
+ set_intr_gate_ist(X86_TRAP_DB, &debug, DEBUG_STACK);
+ /* int3 can be called from all */
+ set_system_intr_gate_ist(X86_TRAP_BP, &int3, DEBUG_STACK);
+
x86_init.irqs.trap_init();
#ifdef CONFIG_X86_64
diff --git a/arch/x86/kernel/vm86_32.c b/arch/x86/kernel/vm86_32.c
index e8edcf52e069..fc9db6ef2a95 100644
--- a/arch/x86/kernel/vm86_32.c
+++ b/arch/x86/kernel/vm86_32.c
@@ -150,7 +150,7 @@ struct pt_regs *save_v86_state(struct kernel_vm86_regs *regs)
do_exit(SIGSEGV);
}
- tss = &per_cpu(init_tss, get_cpu());
+ tss = &per_cpu(cpu_tss, get_cpu());
current->thread.sp0 = current->thread.saved_sp0;
current->thread.sysenter_cs = __KERNEL_CS;
load_sp0(tss, &current->thread);
@@ -318,7 +318,7 @@ static void do_sys_vm86(struct kernel_vm86_struct *info, struct task_struct *tsk
tsk->thread.saved_fs = info->regs32->fs;
tsk->thread.saved_gs = get_user_gs(info->regs32);
- tss = &per_cpu(init_tss, get_cpu());
+ tss = &per_cpu(cpu_tss, get_cpu());
tsk->thread.sp0 = (unsigned long) &info->VM86_TSS_ESP0;
if (cpu_has_sep)
tsk->thread.sysenter_cs = 0;
diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c
index ac4453d8520e..8561585ee2c6 100644
--- a/arch/x86/lguest/boot.c
+++ b/arch/x86/lguest/boot.c
@@ -1076,6 +1076,7 @@ static void lguest_load_sp0(struct tss_struct *tss,
{
lazy_hcall3(LHCALL_SET_STACK, __KERNEL_DS | 0x1, thread->sp0,
THREAD_SIZE / PAGE_SIZE);
+ tss->x86_tss.sp0 = thread->sp0;
}
/* Let's just say, I wouldn't do debugging under a Guest. */
diff --git a/arch/x86/lib/atomic64_cx8_32.S b/arch/x86/lib/atomic64_cx8_32.S
index f5cc9eb1d51b..082a85167a5b 100644
--- a/arch/x86/lib/atomic64_cx8_32.S
+++ b/arch/x86/lib/atomic64_cx8_32.S
@@ -13,16 +13,6 @@
#include <asm/alternative-asm.h>
#include <asm/dwarf2.h>
-.macro SAVE reg
- pushl_cfi %\reg
- CFI_REL_OFFSET \reg, 0
-.endm
-
-.macro RESTORE reg
- popl_cfi %\reg
- CFI_RESTORE \reg
-.endm
-
.macro read64 reg
movl %ebx, %eax
movl %ecx, %edx
@@ -67,10 +57,10 @@ ENDPROC(atomic64_xchg_cx8)
.macro addsub_return func ins insc
ENTRY(atomic64_\func\()_return_cx8)
CFI_STARTPROC
- SAVE ebp
- SAVE ebx
- SAVE esi
- SAVE edi
+ pushl_cfi_reg ebp
+ pushl_cfi_reg ebx
+ pushl_cfi_reg esi
+ pushl_cfi_reg edi
movl %eax, %esi
movl %edx, %edi
@@ -89,10 +79,10 @@ ENTRY(atomic64_\func\()_return_cx8)
10:
movl %ebx, %eax
movl %ecx, %edx
- RESTORE edi
- RESTORE esi
- RESTORE ebx
- RESTORE ebp
+ popl_cfi_reg edi
+ popl_cfi_reg esi
+ popl_cfi_reg ebx
+ popl_cfi_reg ebp
ret
CFI_ENDPROC
ENDPROC(atomic64_\func\()_return_cx8)
@@ -104,7 +94,7 @@ addsub_return sub sub sbb
.macro incdec_return func ins insc
ENTRY(atomic64_\func\()_return_cx8)
CFI_STARTPROC
- SAVE ebx
+ pushl_cfi_reg ebx
read64 %esi
1:
@@ -119,7 +109,7 @@ ENTRY(atomic64_\func\()_return_cx8)
10:
movl %ebx, %eax
movl %ecx, %edx
- RESTORE ebx
+ popl_cfi_reg ebx
ret
CFI_ENDPROC
ENDPROC(atomic64_\func\()_return_cx8)
@@ -130,7 +120,7 @@ incdec_return dec sub sbb
ENTRY(atomic64_dec_if_positive_cx8)
CFI_STARTPROC
- SAVE ebx
+ pushl_cfi_reg ebx
read64 %esi
1:
@@ -146,18 +136,18 @@ ENTRY(atomic64_dec_if_positive_cx8)
2:
movl %ebx, %eax
movl %ecx, %edx
- RESTORE ebx
+ popl_cfi_reg ebx
ret
CFI_ENDPROC
ENDPROC(atomic64_dec_if_positive_cx8)
ENTRY(atomic64_add_unless_cx8)
CFI_STARTPROC
- SAVE ebp
- SAVE ebx
+ pushl_cfi_reg ebp
+ pushl_cfi_reg ebx
/* these just push these two parameters on the stack */
- SAVE edi
- SAVE ecx
+ pushl_cfi_reg edi
+ pushl_cfi_reg ecx
movl %eax, %ebp
movl %edx, %edi
@@ -179,8 +169,8 @@ ENTRY(atomic64_add_unless_cx8)
3:
addl $8, %esp
CFI_ADJUST_CFA_OFFSET -8
- RESTORE ebx
- RESTORE ebp
+ popl_cfi_reg ebx
+ popl_cfi_reg ebp
ret
4:
cmpl %edx, 4(%esp)
@@ -192,7 +182,7 @@ ENDPROC(atomic64_add_unless_cx8)
ENTRY(atomic64_inc_not_zero_cx8)
CFI_STARTPROC
- SAVE ebx
+ pushl_cfi_reg ebx
read64 %esi
1:
@@ -209,7 +199,7 @@ ENTRY(atomic64_inc_not_zero_cx8)
movl $1, %eax
3:
- RESTORE ebx
+ popl_cfi_reg ebx
ret
CFI_ENDPROC
ENDPROC(atomic64_inc_not_zero_cx8)
diff --git a/arch/x86/lib/checksum_32.S b/arch/x86/lib/checksum_32.S
index e78b8eee6615..9bc944a91274 100644
--- a/arch/x86/lib/checksum_32.S
+++ b/arch/x86/lib/checksum_32.S
@@ -51,10 +51,8 @@ unsigned int csum_partial(const unsigned char * buff, int len, unsigned int sum)
*/
ENTRY(csum_partial)
CFI_STARTPROC
- pushl_cfi %esi
- CFI_REL_OFFSET esi, 0
- pushl_cfi %ebx
- CFI_REL_OFFSET ebx, 0
+ pushl_cfi_reg esi
+ pushl_cfi_reg ebx
movl 20(%esp),%eax # Function arg: unsigned int sum
movl 16(%esp),%ecx # Function arg: int len
movl 12(%esp),%esi # Function arg: unsigned char *buff
@@ -127,14 +125,12 @@ ENTRY(csum_partial)
6: addl %ecx,%eax
adcl $0, %eax
7:
- testl $1, 12(%esp)
+ testb $1, 12(%esp)
jz 8f
roll $8, %eax
8:
- popl_cfi %ebx
- CFI_RESTORE ebx
- popl_cfi %esi
- CFI_RESTORE esi
+ popl_cfi_reg ebx
+ popl_cfi_reg esi
ret
CFI_ENDPROC
ENDPROC(csum_partial)
@@ -145,10 +141,8 @@ ENDPROC(csum_partial)
ENTRY(csum_partial)
CFI_STARTPROC
- pushl_cfi %esi
- CFI_REL_OFFSET esi, 0
- pushl_cfi %ebx
- CFI_REL_OFFSET ebx, 0
+ pushl_cfi_reg esi
+ pushl_cfi_reg ebx
movl 20(%esp),%eax # Function arg: unsigned int sum
movl 16(%esp),%ecx # Function arg: int len
movl 12(%esp),%esi # Function arg: const unsigned char *buf
@@ -251,14 +245,12 @@ ENTRY(csum_partial)
addl %ebx,%eax
adcl $0,%eax
80:
- testl $1, 12(%esp)
+ testb $1, 12(%esp)
jz 90f
roll $8, %eax
90:
- popl_cfi %ebx
- CFI_RESTORE ebx
- popl_cfi %esi
- CFI_RESTORE esi
+ popl_cfi_reg ebx
+ popl_cfi_reg esi
ret
CFI_ENDPROC
ENDPROC(csum_partial)
@@ -298,12 +290,9 @@ ENTRY(csum_partial_copy_generic)
CFI_STARTPROC
subl $4,%esp
CFI_ADJUST_CFA_OFFSET 4
- pushl_cfi %edi
- CFI_REL_OFFSET edi, 0
- pushl_cfi %esi
- CFI_REL_OFFSET esi, 0
- pushl_cfi %ebx
- CFI_REL_OFFSET ebx, 0
+ pushl_cfi_reg edi
+ pushl_cfi_reg esi
+ pushl_cfi_reg ebx
movl ARGBASE+16(%esp),%eax # sum
movl ARGBASE+12(%esp),%ecx # len
movl ARGBASE+4(%esp),%esi # src
@@ -412,12 +401,9 @@ DST( movb %cl, (%edi) )
.previous
- popl_cfi %ebx
- CFI_RESTORE ebx
- popl_cfi %esi
- CFI_RESTORE esi
- popl_cfi %edi
- CFI_RESTORE edi
+ popl_cfi_reg ebx
+ popl_cfi_reg esi
+ popl_cfi_reg edi
popl_cfi %ecx # equivalent to addl $4,%esp
ret
CFI_ENDPROC
@@ -441,12 +427,9 @@ ENDPROC(csum_partial_copy_generic)
ENTRY(csum_partial_copy_generic)
CFI_STARTPROC
- pushl_cfi %ebx
- CFI_REL_OFFSET ebx, 0
- pushl_cfi %edi
- CFI_REL_OFFSET edi, 0
- pushl_cfi %esi
- CFI_REL_OFFSET esi, 0
+ pushl_cfi_reg ebx
+ pushl_cfi_reg edi
+ pushl_cfi_reg esi
movl ARGBASE+4(%esp),%esi #src
movl ARGBASE+8(%esp),%edi #dst
movl ARGBASE+12(%esp),%ecx #len
@@ -506,12 +489,9 @@ DST( movb %dl, (%edi) )
jmp 7b
.previous
- popl_cfi %esi
- CFI_RESTORE esi
- popl_cfi %edi
- CFI_RESTORE edi
- popl_cfi %ebx
- CFI_RESTORE ebx
+ popl_cfi_reg esi
+ popl_cfi_reg edi
+ popl_cfi_reg ebx
ret
CFI_ENDPROC
ENDPROC(csum_partial_copy_generic)
diff --git a/arch/x86/lib/clear_page_64.S b/arch/x86/lib/clear_page_64.S
index f2145cfa12a6..e67e579c93bd 100644
--- a/arch/x86/lib/clear_page_64.S
+++ b/arch/x86/lib/clear_page_64.S
@@ -1,31 +1,35 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>
/*
- * Zero a page.
- * rdi page
- */
-ENTRY(clear_page_c)
+ * Most CPUs support enhanced REP MOVSB/STOSB instructions. It is
+ * recommended to use this when possible and we do use them by default.
+ * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
+ * Otherwise, use original.
+ */
+
+/*
+ * Zero a page.
+ * %rdi - page
+ */
+ENTRY(clear_page)
CFI_STARTPROC
+
+ ALTERNATIVE_2 "jmp clear_page_orig", "", X86_FEATURE_REP_GOOD, \
+ "jmp clear_page_c_e", X86_FEATURE_ERMS
+
movl $4096/8,%ecx
xorl %eax,%eax
rep stosq
ret
CFI_ENDPROC
-ENDPROC(clear_page_c)
+ENDPROC(clear_page)
-ENTRY(clear_page_c_e)
+ENTRY(clear_page_orig)
CFI_STARTPROC
- movl $4096,%ecx
- xorl %eax,%eax
- rep stosb
- ret
- CFI_ENDPROC
-ENDPROC(clear_page_c_e)
-ENTRY(clear_page)
- CFI_STARTPROC
xorl %eax,%eax
movl $4096/64,%ecx
.p2align 4
@@ -45,29 +49,13 @@ ENTRY(clear_page)
nop
ret
CFI_ENDPROC
-.Lclear_page_end:
-ENDPROC(clear_page)
-
- /*
- * Some CPUs support enhanced REP MOVSB/STOSB instructions.
- * It is recommended to use this when possible.
- * If enhanced REP MOVSB/STOSB is not available, try to use fast string.
- * Otherwise, use original function.
- *
- */
+ENDPROC(clear_page_orig)
-#include <asm/cpufeature.h>
-
- .section .altinstr_replacement,"ax"
-1: .byte 0xeb /* jmp <disp8> */
- .byte (clear_page_c - clear_page) - (2f - 1b) /* offset */
-2: .byte 0xeb /* jmp <disp8> */
- .byte (clear_page_c_e - clear_page) - (3f - 2b) /* offset */
-3:
- .previous
- .section .altinstructions,"a"
- altinstruction_entry clear_page,1b,X86_FEATURE_REP_GOOD,\
- .Lclear_page_end-clear_page, 2b-1b
- altinstruction_entry clear_page,2b,X86_FEATURE_ERMS, \
- .Lclear_page_end-clear_page,3b-2b
- .previous
+ENTRY(clear_page_c_e)
+ CFI_STARTPROC
+ movl $4096,%ecx
+ xorl %eax,%eax
+ rep stosb
+ ret
+ CFI_ENDPROC
+ENDPROC(clear_page_c_e)
diff --git a/arch/x86/lib/copy_page_64.S b/arch/x86/lib/copy_page_64.S
index 176cca67212b..8239dbcbf984 100644
--- a/arch/x86/lib/copy_page_64.S
+++ b/arch/x86/lib/copy_page_64.S
@@ -2,23 +2,26 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
+#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>
+/*
+ * Some CPUs run faster using the string copy instructions (sane microcode).
+ * It is also a lot simpler. Use this when possible. But, don't use streaming
+ * copy unless the CPU indicates X86_FEATURE_REP_GOOD. Could vary the
+ * prefetch distance based on SMP/UP.
+ */
ALIGN
-copy_page_rep:
+ENTRY(copy_page)
CFI_STARTPROC
+ ALTERNATIVE "jmp copy_page_regs", "", X86_FEATURE_REP_GOOD
movl $4096/8, %ecx
rep movsq
ret
CFI_ENDPROC
-ENDPROC(copy_page_rep)
-
-/*
- * Don't use streaming copy unless the CPU indicates X86_FEATURE_REP_GOOD.
- * Could vary the prefetch distance based on SMP/UP.
-*/
+ENDPROC(copy_page)
-ENTRY(copy_page)
+ENTRY(copy_page_regs)
CFI_STARTPROC
subq $2*8, %rsp
CFI_ADJUST_CFA_OFFSET 2*8
@@ -90,21 +93,5 @@ ENTRY(copy_page)
addq $2*8, %rsp
CFI_ADJUST_CFA_OFFSET -2*8
ret
-.Lcopy_page_end:
CFI_ENDPROC
-ENDPROC(copy_page)
-
- /* Some CPUs run faster using the string copy instructions.
- It is also a lot simpler. Use this when possible */
-
-#include <asm/cpufeature.h>
-
- .section .altinstr_replacement,"ax"
-1: .byte 0xeb /* jmp <disp8> */
- .byte (copy_page_rep - copy_page) - (2f - 1b) /* offset */
-2:
- .previous
- .section .altinstructions,"a"
- altinstruction_entry copy_page, 1b, X86_FEATURE_REP_GOOD, \
- .Lcopy_page_end-copy_page, 2b-1b
- .previous
+ENDPROC(copy_page_regs)
diff --git a/arch/x86/lib/copy_user_64.S b/arch/x86/lib/copy_user_64.S
index dee945d55594..fa997dfaef24 100644
--- a/arch/x86/lib/copy_user_64.S
+++ b/arch/x86/lib/copy_user_64.S
@@ -8,9 +8,6 @@
#include <linux/linkage.h>
#include <asm/dwarf2.h>
-
-#define FIX_ALIGNMENT 1
-
#include <asm/current.h>
#include <asm/asm-offsets.h>
#include <asm/thread_info.h>
@@ -19,33 +16,7 @@
#include <asm/asm.h>
#include <asm/smap.h>
-/*
- * By placing feature2 after feature1 in altinstructions section, we logically
- * implement:
- * If CPU has feature2, jmp to alt2 is used
- * else if CPU has feature1, jmp to alt1 is used
- * else jmp to orig is used.
- */
- .macro ALTERNATIVE_JUMP feature1,feature2,orig,alt1,alt2
-0:
- .byte 0xe9 /* 32bit jump */
- .long \orig-1f /* by default jump to orig */
-1:
- .section .altinstr_replacement,"ax"
-2: .byte 0xe9 /* near jump with 32bit immediate */
- .long \alt1-1b /* offset */ /* or alternatively to alt1 */
-3: .byte 0xe9 /* near jump with 32bit immediate */
- .long \alt2-1b /* offset */ /* or alternatively to alt2 */
- .previous
-
- .section .altinstructions,"a"
- altinstruction_entry 0b,2b,\feature1,5,5
- altinstruction_entry 0b,3b,\feature2,5,5
- .previous
- .endm
-
.macro ALIGN_DESTINATION
-#ifdef FIX_ALIGNMENT
/* check for bad alignment of destination */
movl %edi,%ecx
andl $7,%ecx
@@ -67,7 +38,6 @@
_ASM_EXTABLE(100b,103b)
_ASM_EXTABLE(101b,103b)
-#endif
.endm
/* Standard copy_to_user with segment limit checking */
@@ -79,9 +49,11 @@ ENTRY(_copy_to_user)
jc bad_to_user
cmpq TI_addr_limit(%rax),%rcx
ja bad_to_user
- ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
- copy_user_generic_unrolled,copy_user_generic_string, \
- copy_user_enhanced_fast_string
+ ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \
+ "jmp copy_user_generic_string", \
+ X86_FEATURE_REP_GOOD, \
+ "jmp copy_user_enhanced_fast_string", \
+ X86_FEATURE_ERMS
CFI_ENDPROC
ENDPROC(_copy_to_user)
@@ -94,9 +66,11 @@ ENTRY(_copy_from_user)
jc bad_from_user
cmpq TI_addr_limit(%rax),%rcx
ja bad_from_user
- ALTERNATIVE_JUMP X86_FEATURE_REP_GOOD,X86_FEATURE_ERMS, \
- copy_user_generic_unrolled,copy_user_generic_string, \
- copy_user_enhanced_fast_string
+ ALTERNATIVE_2 "jmp copy_user_generic_unrolled", \
+ "jmp copy_user_generic_string", \
+ X86_FEATURE_REP_GOOD, \
+ "jmp copy_user_enhanced_fast_string", \
+ X86_FEATURE_ERMS
CFI_ENDPROC
ENDPROC(_copy_from_user)
diff --git a/arch/x86/lib/csum-copy_64.S b/arch/x86/lib/csum-copy_64.S
index 2419d5fefae3..9734182966f3 100644
--- a/arch/x86/lib/csum-copy_64.S
+++ b/arch/x86/lib/csum-copy_64.S
@@ -196,7 +196,7 @@ ENTRY(csum_partial_copy_generic)
/* handle last odd byte */
.Lhandle_1:
- testl $1, %r10d
+ testb $1, %r10b
jz .Lende
xorl %ebx, %ebx
source
diff --git a/arch/x86/lib/insn.c b/arch/x86/lib/insn.c
index 1313ae6b478b..8f72b334aea0 100644
--- a/arch/x86/lib/insn.c
+++ b/arch/x86/lib/insn.c
@@ -52,6 +52,13 @@
*/
void insn_init(struct insn *insn, const void *kaddr, int buf_len, int x86_64)
{
+ /*
+ * Instructions longer than MAX_INSN_SIZE (15 bytes) are invalid
+ * even if the input buffer is long enough to hold them.
+ */
+ if (buf_len > MAX_INSN_SIZE)
+ buf_len = MAX_INSN_SIZE;
+
memset(insn, 0, sizeof(*insn));
insn->kaddr = kaddr;
insn->end_kaddr = kaddr + buf_len;
@@ -164,6 +171,12 @@ found:
/* VEX.W overrides opnd_size */
insn->opnd_bytes = 8;
} else {
+ /*
+ * For VEX2, fake VEX3-like byte#2.
+ * Makes it easier to decode vex.W, vex.vvvv,
+ * vex.L and vex.pp. Masking with 0x7f sets vex.W == 0.
+ */
+ insn->vex_prefix.bytes[2] = b2 & 0x7f;
insn->vex_prefix.nbytes = 2;
insn->next_byte += 2;
}
diff --git a/arch/x86/lib/memcpy_64.S b/arch/x86/lib/memcpy_64.S
index 89b53c9968e7..b046664f5a1c 100644
--- a/arch/x86/lib/memcpy_64.S
+++ b/arch/x86/lib/memcpy_64.S
@@ -1,12 +1,20 @@
/* Copyright 2002 Andi Kleen */
#include <linux/linkage.h>
-
#include <asm/cpufeature.h>
#include <asm/dwarf2.h>
#include <asm/alternative-asm.h>
/*
+ * We build a jump to memcpy_orig by default which gets NOPped out on
+ * the majority of x86 CPUs which set REP_GOOD. In addition, CPUs which
+ * have the enhanced REP MOVSB/STOSB feature (ERMS), change those NOPs
+ * to a jmp to memcpy_erms which does the REP; MOVSB mem copy.
+ */
+
+.weak memcpy
+
+/*
* memcpy - Copy a memory block.
*
* Input:
@@ -17,15 +25,11 @@
* Output:
* rax original destination
*/
+ENTRY(__memcpy)
+ENTRY(memcpy)
+ ALTERNATIVE_2 "jmp memcpy_orig", "", X86_FEATURE_REP_GOOD, \
+ "jmp memcpy_erms", X86_FEATURE_ERMS
-/*
- * memcpy_c() - fast string ops (REP MOVSQ) based variant.
- *
- * This gets patched over the unrolled variant (below) via the
- * alternative instructions framework:
- */
- .section .altinstr_replacement, "ax", @progbits
-.Lmemcpy_c:
movq %rdi, %rax
movq %rdx, %rcx
shrq $3, %rcx
@@ -34,29 +38,21 @@
movl %edx, %ecx
rep movsb
ret
-.Lmemcpy_e:
- .previous
+ENDPROC(memcpy)
+ENDPROC(__memcpy)
/*
- * memcpy_c_e() - enhanced fast string memcpy. This is faster and simpler than
- * memcpy_c. Use memcpy_c_e when possible.
- *
- * This gets patched over the unrolled variant (below) via the
- * alternative instructions framework:
+ * memcpy_erms() - enhanced fast string memcpy. This is faster and
+ * simpler than memcpy. Use memcpy_erms when possible.
*/
- .section .altinstr_replacement, "ax", @progbits
-.Lmemcpy_c_e:
+ENTRY(memcpy_erms)
movq %rdi, %rax
movq %rdx, %rcx
rep movsb
ret
-.Lmemcpy_e_e:
- .previous
-
-.weak memcpy
+ENDPROC(memcpy_erms)
-ENTRY(__memcpy)
-ENTRY(memcpy)
+ENTRY(memcpy_orig)
CFI_STARTPROC
movq %rdi, %rax
@@ -183,26 +179,4 @@ ENTRY(memcpy)
.Lend:
retq
CFI_ENDPROC
-ENDPROC(memcpy)
-ENDPROC(__memcpy)
-
- /*
- * Some CPUs are adding enhanced REP MOVSB/STOSB feature
- * If the feature is supported, memcpy_c_e() is the first choice.
- * If enhanced rep movsb copy is not available, use fast string copy
- * memcpy_c() when possible. This is faster and code is simpler than
- * original memcpy().
- * Otherwise, original memcpy() is used.
- * In .altinstructions section, ERMS feature is placed after REG_GOOD
- * feature to implement the right patch order.
- *
- * Replace only beginning, memcpy is used to apply alternatives,
- * so it is silly to overwrite itself with nops - reboot is the
- * only outcome...
- */
- .section .altinstructions, "a"
- altinstruction_entry __memcpy,.Lmemcpy_c,X86_FEATURE_REP_GOOD,\
- .Lmemcpy_e-.Lmemcpy_c,.Lmemcpy_e-.Lmemcpy_c
- altinstruction_entry __memcpy,.Lmemcpy_c_e,X86_FEATURE_ERMS, \
- .Lmemcpy_e_e-.Lmemcpy_c_e,.Lmemcpy_e_e-.Lmemcpy_c_e
- .previous
+ENDPROC(memcpy_orig)
diff --git a/arch/x86/lib/memmove_64.S b/arch/x86/lib/memmove_64.S
index 9c4b530575da..0f8a0d0331b9 100644
--- a/arch/x86/lib/memmove_64.S
+++ b/arch/x86/lib/memmove_64.S
@@ -5,7 +5,6 @@
* This assembly file is re-written from memmove_64.c file.
* - Copyright 2011 Fenghua Yu <fenghua.yu@intel.com>
*/
-#define _STRING_C
#include <linux/linkage.h>
#include <asm/dwarf2.h>
#include <asm/cpufeature.h>
@@ -44,6 +43,8 @@ ENTRY(__memmove)
jg 2f
.Lmemmove_begin_forward:
+ ALTERNATIVE "", "movq %rdx, %rcx; rep movsb; retq", X86_FEATURE_ERMS
+
/*
* movsq instruction have many startup latency
* so we handle small size by general register.
@@ -207,21 +208,5 @@ ENTRY(__memmove)
13:
retq
CFI_ENDPROC
-
- .section .altinstr_replacement,"ax"
-.Lmemmove_begin_forward_efs:
- /* Forward moving data. */
- movq %rdx, %rcx
- rep movsb
- retq
-.Lmemmove_end_forward_efs:
- .previous
-
- .section .altinstructions,"a"
- altinstruction_entry .Lmemmove_begin_forward, \
- .Lmemmove_begin_forward_efs,X86_FEATURE_ERMS, \
- .Lmemmove_end_forward-.Lmemmove_begin_forward, \
- .Lmemmove_end_forward_efs-.Lmemmove_begin_forward_efs
- .previous
ENDPROC(__memmove)
ENDPROC(memmove)
diff --git a/arch/x86/lib/memset_64.S b/arch/x86/lib/memset_64.S
index 6f44935c6a60..93118fb23976 100644
--- a/arch/x86/lib/memset_64.S
+++ b/arch/x86/lib/memset_64.S
@@ -5,19 +5,30 @@
#include <asm/cpufeature.h>
#include <asm/alternative-asm.h>
+.weak memset
+
/*
* ISO C memset - set a memory block to a byte value. This function uses fast
* string to get better performance than the original function. The code is
* simpler and shorter than the orignal function as well.
- *
+ *
* rdi destination
- * rsi value (char)
- * rdx count (bytes)
- *
+ * rsi value (char)
+ * rdx count (bytes)
+ *
* rax original destination
- */
- .section .altinstr_replacement, "ax", @progbits
-.Lmemset_c:
+ */
+ENTRY(memset)
+ENTRY(__memset)
+ /*
+ * Some CPUs support enhanced REP MOVSB/STOSB feature. It is recommended
+ * to use it when possible. If not available, use fast string instructions.
+ *
+ * Otherwise, use original memset function.
+ */
+ ALTERNATIVE_2 "jmp memset_orig", "", X86_FEATURE_REP_GOOD, \
+ "jmp memset_erms", X86_FEATURE_ERMS
+
movq %rdi,%r9
movq %rdx,%rcx
andl $7,%edx
@@ -31,8 +42,8 @@
rep stosb
movq %r9,%rax
ret
-.Lmemset_e:
- .previous
+ENDPROC(memset)
+ENDPROC(__memset)
/*
* ISO C memset - set a memory block to a byte value. This function uses
@@ -45,21 +56,16 @@
*
* rax original destination
*/
- .section .altinstr_replacement, "ax", @progbits
-.Lmemset_c_e:
+ENTRY(memset_erms)
movq %rdi,%r9
movb %sil,%al
movq %rdx,%rcx
rep stosb
movq %r9,%rax
ret
-.Lmemset_e_e:
- .previous
-
-.weak memset
+ENDPROC(memset_erms)
-ENTRY(memset)
-ENTRY(__memset)
+ENTRY(memset_orig)
CFI_STARTPROC
movq %rdi,%r10
@@ -134,23 +140,4 @@ ENTRY(__memset)
jmp .Lafter_bad_alignment
.Lfinal:
CFI_ENDPROC
-ENDPROC(memset)
-ENDPROC(__memset)
-
- /* Some CPUs support enhanced REP MOVSB/STOSB feature.
- * It is recommended to use this when possible.
- *
- * If enhanced REP MOVSB/STOSB feature is not available, use fast string
- * instructions.
- *
- * Otherwise, use original memset function.
- *
- * In .altinstructions section, ERMS feature is placed after REG_GOOD
- * feature to implement the right patch order.
- */
- .section .altinstructions,"a"
- altinstruction_entry __memset,.Lmemset_c,X86_FEATURE_REP_GOOD,\
- .Lfinal-__memset,.Lmemset_e-.Lmemset_c
- altinstruction_entry __memset,.Lmemset_c_e,X86_FEATURE_ERMS, \
- .Lfinal-__memset,.Lmemset_e_e-.Lmemset_c_e
- .previous
+ENDPROC(memset_orig)
diff --git a/arch/x86/lib/msr-reg.S b/arch/x86/lib/msr-reg.S
index f6d13eefad10..3ca5218fbece 100644
--- a/arch/x86/lib/msr-reg.S
+++ b/arch/x86/lib/msr-reg.S
@@ -14,8 +14,8 @@
.macro op_safe_regs op
ENTRY(\op\()_safe_regs)
CFI_STARTPROC
- pushq_cfi %rbx
- pushq_cfi %rbp
+ pushq_cfi_reg rbx
+ pushq_cfi_reg rbp
movq %rdi, %r10 /* Save pointer */
xorl %r11d, %r11d /* Return value */
movl (%rdi), %eax
@@ -35,8 +35,8 @@ ENTRY(\op\()_safe_regs)
movl %ebp, 20(%r10)
movl %esi, 24(%r10)
movl %edi, 28(%r10)
- popq_cfi %rbp
- popq_cfi %rbx
+ popq_cfi_reg rbp
+ popq_cfi_reg rbx
ret
3:
CFI_RESTORE_STATE
@@ -53,10 +53,10 @@ ENDPROC(\op\()_safe_regs)
.macro op_safe_regs op
ENTRY(\op\()_safe_regs)
CFI_STARTPROC
- pushl_cfi %ebx
- pushl_cfi %ebp
- pushl_cfi %esi
- pushl_cfi %edi
+ pushl_cfi_reg ebx
+ pushl_cfi_reg ebp
+ pushl_cfi_reg esi
+ pushl_cfi_reg edi
pushl_cfi $0 /* Return value */
pushl_cfi %eax
movl 4(%eax), %ecx
@@ -80,10 +80,10 @@ ENTRY(\op\()_safe_regs)
movl %esi, 24(%eax)
movl %edi, 28(%eax)
popl_cfi %eax
- popl_cfi %edi
- popl_cfi %esi
- popl_cfi %ebp
- popl_cfi %ebx
+ popl_cfi_reg edi
+ popl_cfi_reg esi
+ popl_cfi_reg ebp
+ popl_cfi_reg ebx
ret
3:
CFI_RESTORE_STATE
diff --git a/arch/x86/lib/rwsem.S b/arch/x86/lib/rwsem.S
index 5dff5f042468..2322abe4da3b 100644
--- a/arch/x86/lib/rwsem.S
+++ b/arch/x86/lib/rwsem.S
@@ -34,10 +34,10 @@
*/
#define save_common_regs \
- pushl_cfi %ecx; CFI_REL_OFFSET ecx, 0
+ pushl_cfi_reg ecx
#define restore_common_regs \
- popl_cfi %ecx; CFI_RESTORE ecx
+ popl_cfi_reg ecx
/* Avoid uglifying the argument copying x86-64 needs to do. */
.macro movq src, dst
@@ -64,22 +64,22 @@
*/
#define save_common_regs \
- pushq_cfi %rdi; CFI_REL_OFFSET rdi, 0; \
- pushq_cfi %rsi; CFI_REL_OFFSET rsi, 0; \
- pushq_cfi %rcx; CFI_REL_OFFSET rcx, 0; \
- pushq_cfi %r8; CFI_REL_OFFSET r8, 0; \
- pushq_cfi %r9; CFI_REL_OFFSET r9, 0; \
- pushq_cfi %r10; CFI_REL_OFFSET r10, 0; \
- pushq_cfi %r11; CFI_REL_OFFSET r11, 0
+ pushq_cfi_reg rdi; \
+ pushq_cfi_reg rsi; \
+ pushq_cfi_reg rcx; \
+ pushq_cfi_reg r8; \
+ pushq_cfi_reg r9; \
+ pushq_cfi_reg r10; \
+ pushq_cfi_reg r11
#define restore_common_regs \
- popq_cfi %r11; CFI_RESTORE r11; \
- popq_cfi %r10; CFI_RESTORE r10; \
- popq_cfi %r9; CFI_RESTORE r9; \
- popq_cfi %r8; CFI_RESTORE r8; \
- popq_cfi %rcx; CFI_RESTORE rcx; \
- popq_cfi %rsi; CFI_RESTORE rsi; \
- popq_cfi %rdi; CFI_RESTORE rdi
+ popq_cfi_reg r11; \
+ popq_cfi_reg r10; \
+ popq_cfi_reg r9; \
+ popq_cfi_reg r8; \
+ popq_cfi_reg rcx; \
+ popq_cfi_reg rsi; \
+ popq_cfi_reg rdi
#endif
@@ -87,12 +87,10 @@
ENTRY(call_rwsem_down_read_failed)
CFI_STARTPROC
save_common_regs
- __ASM_SIZE(push,_cfi) %__ASM_REG(dx)
- CFI_REL_OFFSET __ASM_REG(dx), 0
+ __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx)
movq %rax,%rdi
call rwsem_down_read_failed
- __ASM_SIZE(pop,_cfi) %__ASM_REG(dx)
- CFI_RESTORE __ASM_REG(dx)
+ __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx)
restore_common_regs
ret
CFI_ENDPROC
@@ -124,12 +122,10 @@ ENDPROC(call_rwsem_wake)
ENTRY(call_rwsem_downgrade_wake)
CFI_STARTPROC
save_common_regs
- __ASM_SIZE(push,_cfi) %__ASM_REG(dx)
- CFI_REL_OFFSET __ASM_REG(dx), 0
+ __ASM_SIZE(push,_cfi_reg) __ASM_REG(dx)
movq %rax,%rdi
call rwsem_downgrade_wake
- __ASM_SIZE(pop,_cfi) %__ASM_REG(dx)
- CFI_RESTORE __ASM_REG(dx)
+ __ASM_SIZE(pop,_cfi_reg) __ASM_REG(dx)
restore_common_regs
ret
CFI_ENDPROC
diff --git a/arch/x86/lib/thunk_32.S b/arch/x86/lib/thunk_32.S
index e28cdaf5ac2c..5eb715087b80 100644
--- a/arch/x86/lib/thunk_32.S
+++ b/arch/x86/lib/thunk_32.S
@@ -13,12 +13,9 @@
.globl \name
\name:
CFI_STARTPROC
- pushl_cfi %eax
- CFI_REL_OFFSET eax, 0
- pushl_cfi %ecx
- CFI_REL_OFFSET ecx, 0
- pushl_cfi %edx
- CFI_REL_OFFSET edx, 0
+ pushl_cfi_reg eax
+ pushl_cfi_reg ecx
+ pushl_cfi_reg edx
.if \put_ret_addr_in_eax
/* Place EIP in the arg1 */
@@ -26,12 +23,9 @@
.endif
call \func
- popl_cfi %edx
- CFI_RESTORE edx
- popl_cfi %ecx
- CFI_RESTORE ecx
- popl_cfi %eax
- CFI_RESTORE eax
+ popl_cfi_reg edx
+ popl_cfi_reg ecx
+ popl_cfi_reg eax
ret
CFI_ENDPROC
_ASM_NOKPROBE(\name)
diff --git a/arch/x86/lib/thunk_64.S b/arch/x86/lib/thunk_64.S
index b30b5ebd614a..f89ba4e93025 100644
--- a/arch/x86/lib/thunk_64.S
+++ b/arch/x86/lib/thunk_64.S
@@ -17,9 +17,18 @@
CFI_STARTPROC
/* this one pushes 9 elems, the next one would be %rIP */
- SAVE_ARGS
+ pushq_cfi_reg rdi
+ pushq_cfi_reg rsi
+ pushq_cfi_reg rdx
+ pushq_cfi_reg rcx
+ pushq_cfi_reg rax
+ pushq_cfi_reg r8
+ pushq_cfi_reg r9
+ pushq_cfi_reg r10
+ pushq_cfi_reg r11
.if \put_ret_addr_in_rdi
+ /* 9*8(%rsp) is return addr on stack */
movq_cfi_restore 9*8, rdi
.endif
@@ -45,11 +54,22 @@
#endif
#endif
- /* SAVE_ARGS below is used only for the .cfi directives it contains. */
+#if defined(CONFIG_TRACE_IRQFLAGS) \
+ || defined(CONFIG_DEBUG_LOCK_ALLOC) \
+ || defined(CONFIG_PREEMPT)
CFI_STARTPROC
- SAVE_ARGS
+ CFI_ADJUST_CFA_OFFSET 9*8
restore:
- RESTORE_ARGS
+ popq_cfi_reg r11
+ popq_cfi_reg r10
+ popq_cfi_reg r9
+ popq_cfi_reg r8
+ popq_cfi_reg rax
+ popq_cfi_reg rcx
+ popq_cfi_reg rdx
+ popq_cfi_reg rsi
+ popq_cfi_reg rdi
ret
CFI_ENDPROC
_ASM_NOKPROBE(restore)
+#endif
diff --git a/arch/x86/lib/x86-opcode-map.txt b/arch/x86/lib/x86-opcode-map.txt
index 1a2be7c6895d..816488c0b97e 100644
--- a/arch/x86/lib/x86-opcode-map.txt
+++ b/arch/x86/lib/x86-opcode-map.txt
@@ -273,6 +273,9 @@ dd: ESC
de: ESC
df: ESC
# 0xe0 - 0xef
+# Note: "forced64" is Intel CPU behavior: they ignore 0x66 prefix
+# in 64-bit mode. AMD CPUs accept 0x66 prefix, it causes RIP truncation
+# to 16 bits. In 32-bit mode, 0x66 is accepted by both Intel and AMD.
e0: LOOPNE/LOOPNZ Jb (f64)
e1: LOOPE/LOOPZ Jb (f64)
e2: LOOP Jb (f64)
@@ -281,6 +284,10 @@ e4: IN AL,Ib
e5: IN eAX,Ib
e6: OUT Ib,AL
e7: OUT Ib,eAX
+# With 0x66 prefix in 64-bit mode, for AMD CPUs immediate offset
+# in "near" jumps and calls is 16-bit. For CALL,
+# push of return address is 16-bit wide, RSP is decremented by 2
+# but is not truncated to 16 bits, unlike RIP.
e8: CALL Jz (f64)
e9: JMP-near Jz (f64)
ea: JMP-far Ap (i64)
@@ -456,6 +463,7 @@ AVXcode: 1
7e: movd/q Ey,Pd | vmovd/q Ey,Vy (66),(v1) | vmovq Vq,Wq (F3),(v1)
7f: movq Qq,Pq | vmovdqa Wx,Vx (66) | vmovdqu Wx,Vx (F3)
# 0x0f 0x80-0x8f
+# Note: "forced64" is Intel CPU behavior (see comment about CALL insn).
80: JO Jz (f64)
81: JNO Jz (f64)
82: JB/JC/JNAE Jz (f64)
@@ -842,6 +850,7 @@ EndTable
GrpTable: Grp5
0: INC Ev
1: DEC Ev
+# Note: "forced64" is Intel CPU behavior (see comment about CALL insn).
2: CALLN Ev (f64)
3: CALLF Ep
4: JMPN Ev (f64)
diff --git a/arch/x86/mm/fault.c b/arch/x86/mm/fault.c
index ede025fb46f1..ae340d3761ca 100644
--- a/arch/x86/mm/fault.c
+++ b/arch/x86/mm/fault.c
@@ -148,7 +148,7 @@ is_prefetch(struct pt_regs *regs, unsigned long error_code, unsigned long addr)
instr = (void *)convert_ip_to_linear(current, regs);
max_instr = instr + 15;
- if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE)
+ if (user_mode(regs) && instr >= (unsigned char *)TASK_SIZE_MAX)
return 0;
while (instr < max_instr) {
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index a110efca6d06..52417e771af9 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -179,7 +179,8 @@ static void __init probe_page_size_mask(void)
if (cpu_has_pge) {
cr4_set_bits_and_update_boot(X86_CR4_PGE);
__supported_pte_mask |= _PAGE_GLOBAL;
- }
+ } else
+ __supported_pte_mask &= ~_PAGE_GLOBAL;
}
#ifdef CONFIG_X86_32
diff --git a/arch/x86/power/cpu.c b/arch/x86/power/cpu.c
index 3e32ed5648a0..757678fb26e1 100644
--- a/arch/x86/power/cpu.c
+++ b/arch/x86/power/cpu.c
@@ -134,7 +134,7 @@ static void do_fpu_end(void)
static void fix_processor_context(void)
{
int cpu = smp_processor_id();
- struct tss_struct *t = &per_cpu(init_tss, cpu);
+ struct tss_struct *t = &per_cpu(cpu_tss, cpu);
#ifdef CONFIG_X86_64
struct desc_struct *desc = get_cpu_gdt_table(cpu);
tss_desc tss;
diff --git a/arch/x86/syscalls/syscall_32.tbl b/arch/x86/syscalls/syscall_32.tbl
index b3560ece1c9f..ef8187f9d28d 100644
--- a/arch/x86/syscalls/syscall_32.tbl
+++ b/arch/x86/syscalls/syscall_32.tbl
@@ -119,7 +119,7 @@
110 i386 iopl sys_iopl
111 i386 vhangup sys_vhangup
112 i386 idle
-113 i386 vm86old sys_vm86old sys32_vm86_warning
+113 i386 vm86old sys_vm86old sys_ni_syscall
114 i386 wait4 sys_wait4 compat_sys_wait4
115 i386 swapoff sys_swapoff
116 i386 sysinfo sys_sysinfo compat_sys_sysinfo
@@ -172,7 +172,7 @@
163 i386 mremap sys_mremap
164 i386 setresuid sys_setresuid16
165 i386 getresuid sys_getresuid16
-166 i386 vm86 sys_vm86 sys32_vm86_warning
+166 i386 vm86 sys_vm86 sys_ni_syscall
167 i386 query_module
168 i386 poll sys_poll
169 i386 nfsservctl
diff --git a/arch/x86/syscalls/syscall_64.tbl b/arch/x86/syscalls/syscall_64.tbl
index 8d656fbb57aa..9ef32d5f1b19 100644
--- a/arch/x86/syscalls/syscall_64.tbl
+++ b/arch/x86/syscalls/syscall_64.tbl
@@ -178,7 +178,7 @@
169 common reboot sys_reboot
170 common sethostname sys_sethostname
171 common setdomainname sys_setdomainname
-172 common iopl stub_iopl
+172 common iopl sys_iopl
173 common ioperm sys_ioperm
174 64 create_module
175 common init_module sys_init_module
diff --git a/arch/x86/um/asm/barrier.h b/arch/x86/um/asm/barrier.h
index 2d7d9a1f5b53..8ffd2146fa6a 100644
--- a/arch/x86/um/asm/barrier.h
+++ b/arch/x86/um/asm/barrier.h
@@ -64,8 +64,8 @@
*/
static inline void rdtsc_barrier(void)
{
- alternative(ASM_NOP3, "mfence", X86_FEATURE_MFENCE_RDTSC);
- alternative(ASM_NOP3, "lfence", X86_FEATURE_LFENCE_RDTSC);
+ alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC,
+ "lfence", X86_FEATURE_LFENCE_RDTSC);
}
#endif
diff --git a/arch/x86/um/sys_call_table_64.c b/arch/x86/um/sys_call_table_64.c
index 5cdfa9db2217..a75d8700472a 100644
--- a/arch/x86/um/sys_call_table_64.c
+++ b/arch/x86/um/sys_call_table_64.c
@@ -16,7 +16,7 @@
*/
/* Not going to be implemented by UML, since we have no hardware. */
-#define stub_iopl sys_ni_syscall
+#define sys_iopl sys_ni_syscall
#define sys_ioperm sys_ni_syscall
/*
diff --git a/arch/x86/xen/enlighten.c b/arch/x86/xen/enlighten.c
index 5240f563076d..81665c9f2132 100644
--- a/arch/x86/xen/enlighten.c
+++ b/arch/x86/xen/enlighten.c
@@ -912,6 +912,7 @@ static void xen_load_sp0(struct tss_struct *tss,
mcs = xen_mc_entry(0);
MULTI_stack_switch(mcs.mc, __KERNEL_DS, thread->sp0);
xen_mc_issue(PARAVIRT_LAZY_CPU);
+ tss->x86_tss.sp0 = thread->sp0;
}
static void xen_set_iopl_mask(unsigned mask)
diff --git a/arch/x86/xen/xen-asm_64.S b/arch/x86/xen/xen-asm_64.S
index 53adefda4275..985fc3ee0973 100644
--- a/arch/x86/xen/xen-asm_64.S
+++ b/arch/x86/xen/xen-asm_64.S
@@ -68,11 +68,11 @@ ENTRY(xen_sysret64)
* We're already on the usermode stack at this point, but
* still with the kernel gs, so we can easily switch back
*/
- movq %rsp, PER_CPU_VAR(old_rsp)
+ movq %rsp, PER_CPU_VAR(rsp_scratch)
movq PER_CPU_VAR(kernel_stack), %rsp
pushq $__USER_DS
- pushq PER_CPU_VAR(old_rsp)
+ pushq PER_CPU_VAR(rsp_scratch)
pushq %r11
pushq $__USER_CS
pushq %rcx
@@ -87,11 +87,11 @@ ENTRY(xen_sysret32)
* We're already on the usermode stack at this point, but
* still with the kernel gs, so we can easily switch back
*/
- movq %rsp, PER_CPU_VAR(old_rsp)
+ movq %rsp, PER_CPU_VAR(rsp_scratch)
movq PER_CPU_VAR(kernel_stack), %rsp
pushq $__USER32_DS
- pushq PER_CPU_VAR(old_rsp)
+ pushq PER_CPU_VAR(rsp_scratch)
pushq %r11
pushq $__USER32_CS
pushq %rcx
diff --git a/include/linux/stddef.h b/include/linux/stddef.h
index f4aec0e75c3a..076af437284d 100644
--- a/include/linux/stddef.h
+++ b/include/linux/stddef.h
@@ -19,3 +19,12 @@ enum {
#define offsetof(TYPE, MEMBER) ((size_t) &((TYPE *)0)->MEMBER)
#endif
#endif
+
+/**
+ * offsetofend(TYPE, MEMBER)
+ *
+ * @TYPE: The type of the structure
+ * @MEMBER: The member within the structure to get the end offset of
+ */
+#define offsetofend(TYPE, MEMBER) \
+ (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index 2d67b8998fd8..049b2f497bc7 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -78,19 +78,6 @@ extern int vfio_register_iommu_driver(const struct vfio_iommu_driver_ops *ops);
extern void vfio_unregister_iommu_driver(
const struct vfio_iommu_driver_ops *ops);
-/**
- * offsetofend(TYPE, MEMBER)
- *
- * @TYPE: The type of the structure
- * @MEMBER: The member within the structure to get the end offset of
- *
- * Simple helper macro for dealing with variable sized structures passed
- * from user space. This allows us to easily determine if the provided
- * structure is sized to include various fields.
- */
-#define offsetofend(TYPE, MEMBER) \
- (offsetof(TYPE, MEMBER) + sizeof(((TYPE *)0)->MEMBER))
-
/*
* External user API
*/
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
index d66ab799b35f..8c0c1a2770c8 100644
--- a/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm-def.h
@@ -1,12 +1,12 @@
-MEMCPY_FN(__memcpy,
+MEMCPY_FN(memcpy_orig,
"x86-64-unrolled",
"unrolled memcpy() in arch/x86/lib/memcpy_64.S")
-MEMCPY_FN(memcpy_c,
+MEMCPY_FN(__memcpy,
"x86-64-movsq",
"movsq-based memcpy() in arch/x86/lib/memcpy_64.S")
-MEMCPY_FN(memcpy_c_e,
+MEMCPY_FN(memcpy_erms,
"x86-64-movsb",
"movsb-based memcpy() in arch/x86/lib/memcpy_64.S")
diff --git a/tools/perf/bench/mem-memcpy-x86-64-asm.S b/tools/perf/bench/mem-memcpy-x86-64-asm.S
index fcd9cf00600a..e4c2c30143b9 100644
--- a/tools/perf/bench/mem-memcpy-x86-64-asm.S
+++ b/tools/perf/bench/mem-memcpy-x86-64-asm.S
@@ -1,8 +1,6 @@
#define memcpy MEMCPY /* don't hide glibc's memcpy() */
#define altinstr_replacement text
#define globl p2align 4; .globl
-#define Lmemcpy_c globl memcpy_c; memcpy_c
-#define Lmemcpy_c_e globl memcpy_c_e; memcpy_c_e
#include "../../../arch/x86/lib/memcpy_64.S"
/*
* We need to provide note.GNU-stack section, saying that we want
diff --git a/tools/perf/bench/mem-memcpy.c b/tools/perf/bench/mem-memcpy.c
index db1d3a29d97f..d3dfb7936dcd 100644
--- a/tools/perf/bench/mem-memcpy.c
+++ b/tools/perf/bench/mem-memcpy.c
@@ -36,7 +36,7 @@ static const struct option options[] = {
"Specify length of memory to copy. "
"Available units: B, KB, MB, GB and TB (upper and lower)"),
OPT_STRING('r', "routine", &routine, "default",
- "Specify routine to copy"),
+ "Specify routine to copy, \"all\" runs all available routines"),
OPT_INTEGER('i', "iterations", &iterations,
"repeat memcpy() invocation this number of times"),
OPT_BOOLEAN('c', "cycle", &use_cycle,
@@ -135,55 +135,16 @@ struct bench_mem_info {
const char *const *usage;
};
-static int bench_mem_common(int argc, const char **argv,
- const char *prefix __maybe_unused,
- struct bench_mem_info *info)
+static void __bench_mem_routine(struct bench_mem_info *info, int r_idx, size_t len, double totallen)
{
- int i;
- size_t len;
- double totallen;
+ const struct routine *r = &info->routines[r_idx];
double result_bps[2];
u64 result_cycle[2];
- argc = parse_options(argc, argv, options,
- info->usage, 0);
-
- if (no_prefault && only_prefault) {
- fprintf(stderr, "Invalid options: -o and -n are mutually exclusive\n");
- return 1;
- }
-
- if (use_cycle)
- init_cycle();
-
- len = (size_t)perf_atoll((char *)length_str);
- totallen = (double)len * iterations;
-
result_cycle[0] = result_cycle[1] = 0ULL;
result_bps[0] = result_bps[1] = 0.0;
- if ((s64)len <= 0) {
- fprintf(stderr, "Invalid length:%s\n", length_str);
- return 1;
- }
-
- /* same to without specifying either of prefault and no-prefault */
- if (only_prefault && no_prefault)
- only_prefault = no_prefault = false;
-
- for (i = 0; info->routines[i].name; i++) {
- if (!strcmp(info->routines[i].name, routine))
- break;
- }
- if (!info->routines[i].name) {
- printf("Unknown routine:%s\n", routine);
- printf("Available routines...\n");
- for (i = 0; info->routines[i].name; i++) {
- printf("\t%s ... %s\n",
- info->routines[i].name, info->routines[i].desc);
- }
- return 1;
- }
+ printf("Routine %s (%s)\n", r->name, r->desc);
if (bench_format == BENCH_FORMAT_DEFAULT)
printf("# Copying %s Bytes ...\n\n", length_str);
@@ -191,28 +152,17 @@ static int bench_mem_common(int argc, const char **argv,
if (!only_prefault && !no_prefault) {
/* show both of results */
if (use_cycle) {
- result_cycle[0] =
- info->do_cycle(&info->routines[i], len, false);
- result_cycle[1] =
- info->do_cycle(&info->routines[i], len, true);
+ result_cycle[0] = info->do_cycle(r, len, false);
+ result_cycle[1] = info->do_cycle(r, len, true);
} else {
- result_bps[0] =
- info->do_gettimeofday(&info->routines[i],
- len, false);
- result_bps[1] =
- info->do_gettimeofday(&info->routines[i],
- len, true);
+ result_bps[0] = info->do_gettimeofday(r, len, false);
+ result_bps[1] = info->do_gettimeofday(r, len, true);
}
} else {
- if (use_cycle) {
- result_cycle[pf] =
- info->do_cycle(&info->routines[i],
- len, only_prefault);
- } else {
- result_bps[pf] =
- info->do_gettimeofday(&info->routines[i],
- len, only_prefault);
- }
+ if (use_cycle)
+ result_cycle[pf] = info->do_cycle(r, len, only_prefault);
+ else
+ result_bps[pf] = info->do_gettimeofday(r, len, only_prefault);
}
switch (bench_format) {
@@ -265,6 +215,60 @@ static int bench_mem_common(int argc, const char **argv,
die("unknown format: %d\n", bench_format);
break;
}
+}
+
+static int bench_mem_common(int argc, const char **argv,
+ const char *prefix __maybe_unused,
+ struct bench_mem_info *info)
+{
+ int i;
+ size_t len;
+ double totallen;
+
+ argc = parse_options(argc, argv, options,
+ info->usage, 0);
+
+ if (no_prefault && only_prefault) {
+ fprintf(stderr, "Invalid options: -o and -n are mutually exclusive\n");
+ return 1;
+ }
+
+ if (use_cycle)
+ init_cycle();
+
+ len = (size_t)perf_atoll((char *)length_str);
+ totallen = (double)len * iterations;
+
+ if ((s64)len <= 0) {
+ fprintf(stderr, "Invalid length:%s\n", length_str);
+ return 1;
+ }
+
+ /* same to without specifying either of prefault and no-prefault */
+ if (only_prefault && no_prefault)
+ only_prefault = no_prefault = false;
+
+ if (!strncmp(routine, "all", 3)) {
+ for (i = 0; info->routines[i].name; i++)
+ __bench_mem_routine(info, i, len, totallen);
+ return 0;
+ }
+
+ for (i = 0; info->routines[i].name; i++) {
+ if (!strcmp(info->routines[i].name, routine))
+ break;
+ }
+ if (!info->routines[i].name) {
+ printf("Unknown routine:%s\n", routine);
+ printf("Available routines...\n");
+ for (i = 0; info->routines[i].name; i++) {
+ printf("\t%s ... %s\n",
+ info->routines[i].name, info->routines[i].desc);
+ }
+ return 1;
+ }
+
+ __bench_mem_routine(info, i, len, totallen);
return 0;
}
diff --git a/tools/perf/bench/mem-memset-x86-64-asm-def.h b/tools/perf/bench/mem-memset-x86-64-asm-def.h
index a71dff97c1f5..f02d028771d9 100644
--- a/tools/perf/bench/mem-memset-x86-64-asm-def.h
+++ b/tools/perf/bench/mem-memset-x86-64-asm-def.h
@@ -1,12 +1,12 @@
-MEMSET_FN(__memset,
+MEMSET_FN(memset_orig,
"x86-64-unrolled",
"unrolled memset() in arch/x86/lib/memset_64.S")
-MEMSET_FN(memset_c,
+MEMSET_FN(__memset,
"x86-64-stosq",
"movsq-based memset() in arch/x86/lib/memset_64.S")
-MEMSET_FN(memset_c_e,
+MEMSET_FN(memset_erms,
"x86-64-stosb",
"movsb-based memset() in arch/x86/lib/memset_64.S")
diff --git a/tools/perf/bench/mem-memset-x86-64-asm.S b/tools/perf/bench/mem-memset-x86-64-asm.S
index 9e5af89ed13a..de278784c866 100644
--- a/tools/perf/bench/mem-memset-x86-64-asm.S
+++ b/tools/perf/bench/mem-memset-x86-64-asm.S
@@ -1,8 +1,6 @@
#define memset MEMSET /* don't hide glibc's memset() */
#define altinstr_replacement text
#define globl p2align 4; .globl
-#define Lmemset_c globl memset_c; memset_c
-#define Lmemset_c_e globl memset_c_e; memset_c_e
#include "../../../arch/x86/lib/memset_64.S"
/*
diff --git a/tools/perf/util/include/asm/alternative-asm.h b/tools/perf/util/include/asm/alternative-asm.h
index 6789d788d494..3a3a0f16456a 100644
--- a/tools/perf/util/include/asm/alternative-asm.h
+++ b/tools/perf/util/include/asm/alternative-asm.h
@@ -4,5 +4,6 @@
/* Just disable it so we can build arch/x86/lib/memcpy_64.S for perf bench: */
#define altinstruction_entry #
+#define ALTERNATIVE_2 #
#endif