diff options
Diffstat (limited to 'arch/x86/include/asm/thread_info.h')
-rw-r--r-- | arch/x86/include/asm/thread_info.h | 74 |
1 files changed, 64 insertions, 10 deletions
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index 1d4e4f279a32..ea2dbe82cba3 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -13,6 +13,33 @@ #include <asm/types.h> /* + * TOP_OF_KERNEL_STACK_PADDING is a number of unused bytes that we + * reserve at the top of the kernel stack. We do it because of a nasty + * 32-bit corner case. On x86_32, the hardware stack frame is + * variable-length. Except for vm86 mode, struct pt_regs assumes a + * maximum-length frame. If we enter from CPL 0, the top 8 bytes of + * pt_regs don't actually exist. Ordinarily this doesn't matter, but it + * does in at least one case: + * + * If we take an NMI early enough in SYSENTER, then we can end up with + * pt_regs that extends above sp0. On the way out, in the espfix code, + * we can read the saved SS value, but that value will be above sp0. + * Without this offset, that can result in a page fault. (We are + * careful that, in this case, the value we read doesn't matter.) + * + * In vm86 mode, the hardware frame is much longer still, but we neither + * access the extra members from NMI context, nor do we write such a + * frame at sp0 at all. + * + * x86_64 has a fixed-length stack frame. + */ +#ifdef CONFIG_X86_32 +# define TOP_OF_KERNEL_STACK_PADDING 8 +#else +# define TOP_OF_KERNEL_STACK_PADDING 0 +#endif + +/* * low level task data that entry.S needs immediate access to * - this struct should fit entirely inside of one cache line * - this struct shares the supervisor stack pages @@ -145,7 +172,6 @@ struct thread_info { #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) #define STACK_WARN (THREAD_SIZE/8) -#define KERNEL_STACK_OFFSET (5*(BITS_PER_LONG/8)) /* * macros/functions for gaining access to the thread information structure @@ -158,10 +184,7 @@ DECLARE_PER_CPU(unsigned long, kernel_stack); static inline struct thread_info *current_thread_info(void) { - struct thread_info *ti; - ti = (void *)(this_cpu_read_stable(kernel_stack) + - KERNEL_STACK_OFFSET - THREAD_SIZE); - return ti; + return (struct thread_info *)(current_top_of_stack() - THREAD_SIZE); } static inline unsigned long current_stack_pointer(void) @@ -177,16 +200,37 @@ static inline unsigned long current_stack_pointer(void) #else /* !__ASSEMBLY__ */ -/* how to get the thread information struct from ASM */ +/* Load thread_info address into "reg" */ #define GET_THREAD_INFO(reg) \ _ASM_MOV PER_CPU_VAR(kernel_stack),reg ; \ - _ASM_SUB $(THREAD_SIZE-KERNEL_STACK_OFFSET),reg ; + _ASM_SUB $(THREAD_SIZE),reg ; /* - * Same if PER_CPU_VAR(kernel_stack) is, perhaps with some offset, already in - * a certain register (to be used in assembler memory operands). + * ASM operand which evaluates to a 'thread_info' address of + * the current task, if it is known that "reg" is exactly "off" + * bytes below the top of the stack currently. + * + * ( The kernel stack's size is known at build time, it is usually + * 2 or 4 pages, and the bottom of the kernel stack contains + * the thread_info structure. So to access the thread_info very + * quickly from assembly code we can calculate down from the + * top of the kernel stack to the bottom, using constant, + * build-time calculations only. ) + * + * For example, to fetch the current thread_info->flags value into %eax + * on x86-64 defconfig kernels, in syscall entry code where RSP is + * currently at exactly SIZEOF_PTREGS bytes away from the top of the + * stack: + * + * mov ASM_THREAD_INFO(TI_flags, %rsp, SIZEOF_PTREGS), %eax + * + * will translate to: + * + * 8b 84 24 b8 c0 ff ff mov -0x3f48(%rsp), %eax + * + * which is below the current RSP by almost 16K. */ -#define THREAD_INFO(reg, off) KERNEL_STACK_OFFSET+(off)-THREAD_SIZE(reg) +#define ASM_THREAD_INFO(field, reg, off) ((field)+(off)-THREAD_SIZE)(reg) #endif @@ -236,6 +280,16 @@ static inline bool is_ia32_task(void) #endif return false; } + +/* + * Force syscall return via IRET by making it look as if there was + * some work pending. IRET is our most capable (but slowest) syscall + * return path, which is able to restore modified SS, CS and certain + * EFLAGS values that other (fast) syscall return instructions + * are not able to restore properly. + */ +#define force_iret() set_thread_flag(TIF_NOTIFY_RESUME) + #endif /* !__ASSEMBLY__ */ #ifndef __ASSEMBLY__ |