2023-10-11 22:44:18

by Brian Gerst

Subject: [PATCH v3 0/3] x86: Clean up fast syscall return validation

This patch set cleans up the tests used to determine whether a fast syscall
return instruction can be used to return to userspace. It converts the
code to C and refactors the existing code to be more readable.

v3:
- Remove patches already applied to -tip.
- Keep Xen alternatives on the asm side to skip testing the return
value.
- Add patch to simplify canonical-RIP test.

v2:
- Fix shift value for canonical RIP test and use
__is_canonical_address()

Brian Gerst (3):
x86/entry/64: Convert SYSRET validation tests to C
x86/entry/64: Use TASK_SIZE_MAX for canonical RIP test
x86/entry/32: Clean up syscall fast exit tests

arch/x86/entry/common.c | 91 ++++++++++++++++++++++++----------
arch/x86/entry/entry_64.S | 53 +-------------------
arch/x86/include/asm/syscall.h | 2 +-
3 files changed, 67 insertions(+), 79 deletions(-)

--
2.41.0


2023-10-11 22:44:25

by Brian Gerst

Subject: [PATCH v3 3/3] x86/entry/32: Clean up syscall fast exit tests

Merge compat and native code and clarify comments.

Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/entry/common.c | 48 +++++++++++++++++++----------------------
1 file changed, 22 insertions(+), 26 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index e3d6f255379f..0acf35d7fe55 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -255,34 +255,30 @@ __visible noinstr bool do_fast_syscall_32(struct pt_regs *regs)
if (!__do_fast_syscall_32(regs))
return false;

-#ifdef CONFIG_X86_64
/*
- * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
- * SYSRETL is available on all 64-bit CPUs, so we don't need to
- * bother with SYSEXIT.
- *
- * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
- * because the ECX fixup above will ensure that this is essentially
- * never the case.
+ * Check that the register state is valid for using SYSRETL/SYSEXIT
+ * to exit to userspace. Otherwise use the slower but fully capable
+ * IRET exit path.
*/
- return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
- regs->ip == landing_pad &&
- (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
-#else
- /*
- * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
- *
- * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
- * because the ECX fixup above will ensure that this is essentially
- * never the case.
- *
- * We don't allow syscalls at all from VM86 mode, but we still
- * need to check VM, because we might be returning from sys_vm86.
- */
- return regs->cs == __USER_CS && regs->ss == __USER_DS &&
- regs->ip == landing_pad &&
- (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
-#endif
+
+ /* XEN PV guests always use IRET path */
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return false;
+
+ /* EIP must point to the VDSO landing pad */
+ if (unlikely(regs->ip != landing_pad))
+ return false;
+
+ /* CS and SS must match the values set in MSR_STAR */
+ if (unlikely(regs->cs != __USER32_CS || regs->ss != __USER_DS))
+ return false;
+
+ /* If the TF, RF, or VM flags are set, use IRET */
+ if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)))
+ return false;
+
+ /* Use SYSRETL/SYSEXIT to exit to userspace */
+ return true;
}

/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */
--
2.41.0

2023-10-11 22:44:25

by Brian Gerst

Subject: [PATCH v3 1/3] x86/entry/64: Convert SYSRET validation tests to C

Signed-off-by: Brian Gerst <[email protected]>
---
arch/x86/entry/common.c | 43 ++++++++++++++++++++++++++-
arch/x86/entry/entry_64.S | 53 ++--------------------------------
arch/x86/include/asm/syscall.h | 2 +-
3 files changed, 45 insertions(+), 53 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 0551bcb197fb..207149a0a9b3 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -71,7 +71,8 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
return false;
}

-__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
+/* Returns true to return using SYSRET, or false to use IRET */
+__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
{
add_random_kstack_offset();
nr = syscall_enter_from_user_mode(regs, nr);
@@ -85,6 +86,46 @@ __visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)

instrumentation_end();
syscall_exit_to_user_mode(regs);
+
+ /*
+ * Check that the register state is valid for using SYSRET to exit
+ * to userspace. Otherwise use the slower but fully capable IRET
+ * exit path.
+ */
+
+ /* XEN PV guests always use IRET path */
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return false;
+
+ /* SYSRET requires RCX == RIP and R11 == EFLAGS */
+ if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
+ return false;
+
+ /* CS and SS must match the values set in MSR_STAR */
+ if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
+ return false;
+
+ /*
+ * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+ * in kernel space. This essentially lets the user take over
+ * the kernel, since userspace controls RSP.
+ *
+ * Change top bits to match most significant bit (47th or 56th bit
+ * depending on paging mode) in the address.
+ */
+ if (unlikely(!__is_canonical_address(regs->ip, __VIRTUAL_MASK_SHIFT + 1)))
+ return false;
+
+ /*
+ * SYSRET cannot restore RF. It can restore TF, but unlike IRET,
+ * restoring TF results in a trap from userspace immediately after
+ * SYSRET.
+ */
+ if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)))
+ return false;
+
+ /* Use SYSRET to exit to userspace */
+ return true;
}
#endif

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 3bdc22d7e78f..de6469dffe3a 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -126,57 +126,8 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
* In the Xen PV case we must use iret anyway.
*/

- ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \
- X86_FEATURE_XENPV
-
- movq RCX(%rsp), %rcx
- movq RIP(%rsp), %r11
-
- cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */
- jne swapgs_restore_regs_and_return_to_usermode
-
- /*
- * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
- * in kernel space. This essentially lets the user take over
- * the kernel, since userspace controls RSP.
- *
- * If width of "canonical tail" ever becomes variable, this will need
- * to be updated to remain correct on both old and new CPUs.
- *
- * Change top bits to match most significant bit (47th or 56th bit
- * depending on paging mode) in the address.
- */
-#ifdef CONFIG_X86_5LEVEL
- ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
- "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
-#else
- shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
- sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
-#endif
-
- /* If this changed %rcx, it was not canonical */
- cmpq %rcx, %r11
- jne swapgs_restore_regs_and_return_to_usermode
-
- cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */
- jne swapgs_restore_regs_and_return_to_usermode
-
- movq R11(%rsp), %r11
- cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */
- jne swapgs_restore_regs_and_return_to_usermode
-
- /*
- * SYSRET cannot restore RF. It can restore TF, but unlike IRET,
- * restoring TF results in a trap from userspace immediately after
- * SYSRET.
- */
- testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
- jnz swapgs_restore_regs_and_return_to_usermode
-
- /* nothing to check for RSP */
-
- cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */
- jne swapgs_restore_regs_and_return_to_usermode
+ ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
+ "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV

/*
* We win! This label is here just for ease of understanding
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index c7e25c940f1a..f44e2f9ab65d 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -126,7 +126,7 @@ static inline int syscall_get_arch(struct task_struct *task)
? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
}

-void do_syscall_64(struct pt_regs *regs, int nr);
+bool do_syscall_64(struct pt_regs *regs, int nr);

#endif /* CONFIG_X86_32 */

--
2.41.0

2023-10-11 22:44:38

by Brian Gerst

Subject: [PATCH v3 2/3] x86/entry/64: Use TASK_SIZE_MAX for canonical RIP test

Using shifts to determine if an address is canonical is difficult for
the compiler to optimize when the virtual address width is variable
(LA57 feature) without using inline assembly. Instead, compare RIP
against TASK_SIZE_MAX. The only user executable address outside of that
range is the deprecated vsyscall page, which can fall back to using IRET.

Signed-off-by: Brian Gerst <[email protected]>
---
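Not for the changelog: a rough userspace sketch of the old check next to the
new one. VA_BITS, PAGE_SIZE and TASK_SIZE_MAX are hard-coded here for 4-level
paging and only mirror the kernel definitions, so treat this as illustrative
rather than the actual kernel code:

#include <inttypes.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define VA_BITS		48	/* __VIRTUAL_MASK_SHIFT + 1 without LA57 */
#define PAGE_SIZE	4096UL
#define TASK_SIZE_MAX	((1UL << (VA_BITS - 1)) - PAGE_SIZE)

/* Old check: sign-extend from the top implemented bit; canonical if unchanged. */
static bool ip_is_canonical(uint64_t ip)
{
	uint64_t shifted = ip << (64 - VA_BITS);

	return (uint64_t)((int64_t)shifted >> (64 - VA_BITS)) == ip;
}

/* New check: every user-executable RIP except vsyscall is below TASK_SIZE_MAX. */
static bool ip_below_task_size(uint64_t ip)
{
	return ip < TASK_SIZE_MAX;
}

int main(void)
{
	const uint64_t samples[] = {
		0x00007f0012345678UL,	/* typical user text: both checks pass */
		0xffffffffff600000UL,	/* vsyscall page: canonical, but now takes IRET */
		0x0000800000000000UL,	/* non-canonical: both checks fail */
	};

	for (unsigned int i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("%#018" PRIx64 " canonical=%d below TASK_SIZE_MAX=%d\n",
		       samples[i], ip_is_canonical(samples[i]),
		       ip_below_task_size(samples[i]));
	return 0;
}

With LA57 the kernel's TASK_SIZE_MAX grows along with the address space, so
the same single comparison keeps working without the alternative-patched
shift sequence.
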
arch/x86/entry/common.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 207149a0a9b3..e3d6f255379f 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -110,10 +110,10 @@ __visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
* in kernel space. This essentially lets the user take over
* the kernel, since userspace controls RSP.
*
- * Change top bits to match most significant bit (47th or 56th bit
- * depending on paging mode) in the address.
+ * TASK_SIZE_MAX covers all user-accessible addresses other than
+ * the deprecated vsyscall page.
*/
- if (unlikely(!__is_canonical_address(regs->ip, __VIRTUAL_MASK_SHIFT + 1)))
+ if (unlikely(regs->ip >= TASK_SIZE_MAX))
return false;

/*
--
2.41.0

Subject: [tip: x86/entry] x86/entry/64: Convert SYSRET validation tests to C

The following commit has been merged into the x86/entry branch of tip:

Commit-ID: ca282b486a570a0bfda5c1a4595ace7fa14243bf
Gitweb: https://git.kernel.org/tip/ca282b486a570a0bfda5c1a4595ace7fa14243bf
Author: Brian Gerst <[email protected]>
AuthorDate: Wed, 11 Oct 2023 18:43:49 -04:00
Committer: Ingo Molnar <[email protected]>
CommitterDate: Fri, 13 Oct 2023 13:05:28 +02:00

x86/entry/64: Convert SYSRET validation tests to C

No change in functionality expected.

Signed-off-by: Brian Gerst <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Brian Gerst <[email protected]>
Cc: Denys Vlasenko <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Josh Poimboeuf <[email protected]>
Cc: Uros Bizjak <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
---
arch/x86/entry/common.c | 43 ++++++++++++++++++++++++++-
arch/x86/entry/entry_64.S | 53 +--------------------------------
arch/x86/include/asm/syscall.h | 2 +-
3 files changed, 45 insertions(+), 53 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 0551bcb..9021465 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -71,7 +71,8 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
return false;
}

-__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
+/* Returns true to return using SYSRET, or false to use IRET */
+__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
{
add_random_kstack_offset();
nr = syscall_enter_from_user_mode(regs, nr);
@@ -85,6 +86,46 @@ __visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)

instrumentation_end();
syscall_exit_to_user_mode(regs);
+
+ /*
+ * Check that the register state is valid for using SYSRET to exit
+ * to userspace. Otherwise use the slower but fully capable IRET
+ * exit path.
+ */
+
+ /* XEN PV guests always use the IRET path */
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return false;
+
+ /* SYSRET requires RCX == RIP and R11 == EFLAGS */
+ if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
+ return false;
+
+ /* CS and SS must match the values set in MSR_STAR */
+ if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
+ return false;
+
+ /*
+ * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
+ * in kernel space. This essentially lets the user take over
+ * the kernel, since userspace controls RSP.
+ *
+ * Change top bits to match the most significant bit (47th or 56th bit
+ * depending on paging mode) in the address.
+ */
+ if (unlikely(!__is_canonical_address(regs->ip, __VIRTUAL_MASK_SHIFT + 1)))
+ return false;
+
+ /*
+ * SYSRET cannot restore RF. It can restore TF, but unlike IRET,
+ * restoring TF results in a trap from userspace immediately after
+ * SYSRET.
+ */
+ if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)))
+ return false;
+
+ /* Use SYSRET to exit to userspace */
+ return true;
}
#endif

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 7574639..1730640 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -126,57 +126,8 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
* In the Xen PV case we must use iret anyway.
*/

- ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \
- X86_FEATURE_XENPV
-
- movq RCX(%rsp), %rcx
- movq RIP(%rsp), %r11
-
- cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */
- jne swapgs_restore_regs_and_return_to_usermode
-
- /*
- * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
- * in kernel space. This essentially lets the user take over
- * the kernel, since userspace controls RSP.
- *
- * If width of "canonical tail" ever becomes variable, this will need
- * to be updated to remain correct on both old and new CPUs.
- *
- * Change top bits to match most significant bit (47th or 56th bit
- * depending on paging mode) in the address.
- */
-#ifdef CONFIG_X86_5LEVEL
- ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
- "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
-#else
- shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
- sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
-#endif
-
- /* If this changed %rcx, it was not canonical */
- cmpq %rcx, %r11
- jne swapgs_restore_regs_and_return_to_usermode
-
- cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */
- jne swapgs_restore_regs_and_return_to_usermode
-
- movq R11(%rsp), %r11
- cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */
- jne swapgs_restore_regs_and_return_to_usermode
-
- /*
- * SYSRET cannot restore RF. It can restore TF, but unlike IRET,
- * restoring TF results in a trap from userspace immediately after
- * SYSRET.
- */
- testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
- jnz swapgs_restore_regs_and_return_to_usermode
-
- /* nothing to check for RSP */
-
- cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */
- jne swapgs_restore_regs_and_return_to_usermode
+ ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
+ "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV

/*
* We win! This label is here just for ease of understanding
diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
index c7e25c9..f44e2f9 100644
--- a/arch/x86/include/asm/syscall.h
+++ b/arch/x86/include/asm/syscall.h
@@ -126,7 +126,7 @@ static inline int syscall_get_arch(struct task_struct *task)
? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
}

-void do_syscall_64(struct pt_regs *regs, int nr);
+bool do_syscall_64(struct pt_regs *regs, int nr);

#endif /* CONFIG_X86_32 */

Subject: [tip: x86/entry] x86/entry/32: Clean up syscall fast exit tests

The following commit has been merged into the x86/entry branch of tip:

Commit-ID: 1a09a27153f91cd7676b2d4ca574577572a8c999
Gitweb: https://git.kernel.org/tip/1a09a27153f91cd7676b2d4ca574577572a8c999
Author: Brian Gerst <[email protected]>
AuthorDate: Wed, 11 Oct 2023 18:43:51 -04:00
Committer: Ingo Molnar <[email protected]>
CommitterDate: Fri, 13 Oct 2023 13:05:28 +02:00

x86/entry/32: Clean up syscall fast exit tests

Merge compat and native code and clarify comments.

No change in functionality expected.

Signed-off-by: Brian Gerst <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Brian Gerst <[email protected]>
Cc: Denys Vlasenko <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Josh Poimboeuf <[email protected]>
Cc: Uros Bizjak <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
---
arch/x86/entry/common.c | 48 ++++++++++++++++++----------------------
1 file changed, 22 insertions(+), 26 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 4c7154d..d813160 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -255,34 +255,30 @@ __visible noinstr bool do_fast_syscall_32(struct pt_regs *regs)
if (!__do_fast_syscall_32(regs))
return false;

-#ifdef CONFIG_X86_64
/*
- * Opportunistic SYSRETL: if possible, try to return using SYSRETL.
- * SYSRETL is available on all 64-bit CPUs, so we don't need to
- * bother with SYSEXIT.
- *
- * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
- * because the ECX fixup above will ensure that this is essentially
- * never the case.
+ * Check that the register state is valid for using SYSRETL/SYSEXIT
+ * to exit to userspace. Otherwise use the slower but fully capable
+ * IRET exit path.
*/
- return regs->cs == __USER32_CS && regs->ss == __USER_DS &&
- regs->ip == landing_pad &&
- (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)) == 0;
-#else
- /*
- * Opportunistic SYSEXIT: if possible, try to return using SYSEXIT.
- *
- * Unlike 64-bit opportunistic SYSRET, we can't check that CX == IP,
- * because the ECX fixup above will ensure that this is essentially
- * never the case.
- *
- * We don't allow syscalls at all from VM86 mode, but we still
- * need to check VM, because we might be returning from sys_vm86.
- */
- return regs->cs == __USER_CS && regs->ss == __USER_DS &&
- regs->ip == landing_pad &&
- (regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)) == 0;
-#endif
+
+ /* XEN PV guests always use the IRET path */
+ if (cpu_feature_enabled(X86_FEATURE_XENPV))
+ return false;
+
+ /* EIP must point to the VDSO landing pad */
+ if (unlikely(regs->ip != landing_pad))
+ return false;
+
+ /* CS and SS must match the values set in MSR_STAR */
+ if (unlikely(regs->cs != __USER32_CS || regs->ss != __USER_DS))
+ return false;
+
+ /* If the TF, RF, or VM flags are set, use IRET */
+ if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF | X86_EFLAGS_VM)))
+ return false;
+
+ /* Use SYSRETL/SYSEXIT to exit to userspace */
+ return true;
}

/* Returns true to return using SYSEXIT/SYSRETL, or false to use IRET */

Subject: [tip: x86/entry] x86/entry/64: Use TASK_SIZE_MAX for canonical RIP test

The following commit has been merged into the x86/entry branch of tip:

Commit-ID: 58978b44df7276f7c75a2c6aad6c201421cd4daa
Gitweb: https://git.kernel.org/tip/58978b44df7276f7c75a2c6aad6c201421cd4daa
Author: Brian Gerst <[email protected]>
AuthorDate: Wed, 11 Oct 2023 18:43:50 -04:00
Committer: Ingo Molnar <[email protected]>
CommitterDate: Fri, 13 Oct 2023 13:05:28 +02:00

x86/entry/64: Use TASK_SIZE_MAX for canonical RIP test

Using shifts to determine if an address is canonical is difficult for
the compiler to optimize when the virtual address width is variable
(LA57 feature) without using inline assembly. Instead, compare RIP
against TASK_SIZE_MAX. The only user executable address outside of that
range is the deprecated vsyscall page, which can fall back to using IRET.

Signed-off-by: Brian Gerst <[email protected]>
Signed-off-by: Ingo Molnar <[email protected]>
Cc: Andy Lutomirski <[email protected]>
Cc: Borislav Petkov <[email protected]>
Cc: Brian Gerst <[email protected]>
Cc: Denys Vlasenko <[email protected]>
Cc: H. Peter Anvin <[email protected]>
Cc: Linus Torvalds <[email protected]>
Cc: Peter Zijlstra <[email protected]>
Cc: Thomas Gleixner <[email protected]>
Cc: Josh Poimboeuf <[email protected]>
Cc: Uros Bizjak <[email protected]>
Link: https://lore.kernel.org/r/[email protected]
---
arch/x86/entry/common.c | 6 +++---
1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
index 9021465..4c7154d 100644
--- a/arch/x86/entry/common.c
+++ b/arch/x86/entry/common.c
@@ -110,10 +110,10 @@ __visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
* in kernel space. This essentially lets the user take over
* the kernel, since userspace controls RSP.
*
- * Change top bits to match the most significant bit (47th or 56th bit
- * depending on paging mode) in the address.
+ * TASK_SIZE_MAX covers all user-accessible addresses other than
+ * the deprecated vsyscall page.
*/
- if (unlikely(!__is_canonical_address(regs->ip, __VIRTUAL_MASK_SHIFT + 1)))
+ if (unlikely(regs->ip >= TASK_SIZE_MAX))
return false;

/*

2023-10-13 12:39:27

by Nikolay Borisov

Subject: Re: [PATCH v3 1/3] x86/entry/64: Convert SYSRET validation tests to C



On 12.10.23 at 1:43, Brian Gerst wrote:
> Signed-off-by: Brian Gerst <[email protected]>
> ---
> arch/x86/entry/common.c | 43 ++++++++++++++++++++++++++-
> arch/x86/entry/entry_64.S | 53 ++--------------------------------
> arch/x86/include/asm/syscall.h | 2 +-
> 3 files changed, 45 insertions(+), 53 deletions(-)
>
> diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
> index 0551bcb197fb..207149a0a9b3 100644
> --- a/arch/x86/entry/common.c
> +++ b/arch/x86/entry/common.c
> @@ -71,7 +71,8 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
> return false;
> }
>
> -__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
> +/* Returns true to return using SYSRET, or false to use IRET */
> +__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
> {
> add_random_kstack_offset();
> nr = syscall_enter_from_user_mode(regs, nr);
> @@ -85,6 +86,46 @@ __visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
>
> instrumentation_end();
> syscall_exit_to_user_mode(regs);
> +
> + /*
> + * Check that the register state is valid for using SYSRET to exit
> + * to userspace. Otherwise use the slower but fully capable IRET
> + * exit path.
> + */
> +
> + /* XEN PV guests always use IRET path */
> + if (cpu_feature_enabled(X86_FEATURE_XENPV))
> + return false;
> +
> + /* SYSRET requires RCX == RIP and R11 == EFLAGS */
> + if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
> + return false;


Under what conditions do we expect this not to be true, since we've come
in via SYSCALL, which adheres to this layout? IOW, isn't this always
going to be true, so we can simply eliminate the check?

> +
> + /* CS and SS must match the values set in MSR_STAR */
> + if (unlikely(regs->cs != __USER_CS || regs->ss != __USER_DS))
> + return false;
> +
> + /*
> + * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
> + * in kernel space. This essentially lets the user take over
> + * the kernel, since userspace controls RSP.
> + *
> + * Change top bits to match most significant bit (47th or 56th bit
> + * depending on paging mode) in the address.
> + */
> + if (unlikely(!__is_canonical_address(regs->ip, __VIRTUAL_MASK_SHIFT + 1)))
> + return false;
> +
> + /*
> + * SYSRET cannot restore RF. It can restore TF, but unlike IRET,
> + * restoring TF results in a trap from userspace immediately after
> + * SYSRET.
> + */
> + if (unlikely(regs->flags & (X86_EFLAGS_RF | X86_EFLAGS_TF)))
> + return false;
> +
> + /* Use SYSRET to exit to userspace */
> + return true;
> }
> #endif
>
> diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
> index 3bdc22d7e78f..de6469dffe3a 100644
> --- a/arch/x86/entry/entry_64.S
> +++ b/arch/x86/entry/entry_64.S
> @@ -126,57 +126,8 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
> * In the Xen PV case we must use iret anyway.
> */
>
> - ALTERNATIVE "", "jmp swapgs_restore_regs_and_return_to_usermode", \
> - X86_FEATURE_XENPV
> -
> - movq RCX(%rsp), %rcx
> - movq RIP(%rsp), %r11
> -
> - cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */
> - jne swapgs_restore_regs_and_return_to_usermode
> -
> - /*
> - * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP
> - * in kernel space. This essentially lets the user take over
> - * the kernel, since userspace controls RSP.
> - *
> - * If width of "canonical tail" ever becomes variable, this will need
> - * to be updated to remain correct on both old and new CPUs.
> - *
> - * Change top bits to match most significant bit (47th or 56th bit
> - * depending on paging mode) in the address.
> - */
> -#ifdef CONFIG_X86_5LEVEL
> - ALTERNATIVE "shl $(64 - 48), %rcx; sar $(64 - 48), %rcx", \
> - "shl $(64 - 57), %rcx; sar $(64 - 57), %rcx", X86_FEATURE_LA57
> -#else
> - shl $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
> - sar $(64 - (__VIRTUAL_MASK_SHIFT+1)), %rcx
> -#endif
> -
> - /* If this changed %rcx, it was not canonical */
> - cmpq %rcx, %r11
> - jne swapgs_restore_regs_and_return_to_usermode
> -
> - cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */
> - jne swapgs_restore_regs_and_return_to_usermode
> -
> - movq R11(%rsp), %r11
> - cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */
> - jne swapgs_restore_regs_and_return_to_usermode
> -
> - /*
> - * SYSRET cannot restore RF. It can restore TF, but unlike IRET,
> - * restoring TF results in a trap from userspace immediately after
> - * SYSRET.
> - */
> - testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11
> - jnz swapgs_restore_regs_and_return_to_usermode
> -
> - /* nothing to check for RSP */
> -
> - cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */
> - jne swapgs_restore_regs_and_return_to_usermode
> + ALTERNATIVE "testb %al, %al; jz swapgs_restore_regs_and_return_to_usermode", \
> + "jmp swapgs_restore_regs_and_return_to_usermode", X86_FEATURE_XENPV
>
> /*
> * We win! This label is here just for ease of understanding
> diff --git a/arch/x86/include/asm/syscall.h b/arch/x86/include/asm/syscall.h
> index c7e25c940f1a..f44e2f9ab65d 100644
> --- a/arch/x86/include/asm/syscall.h
> +++ b/arch/x86/include/asm/syscall.h
> @@ -126,7 +126,7 @@ static inline int syscall_get_arch(struct task_struct *task)
> ? AUDIT_ARCH_I386 : AUDIT_ARCH_X86_64;
> }
>
> -void do_syscall_64(struct pt_regs *regs, int nr);
> +bool do_syscall_64(struct pt_regs *regs, int nr);
>
> #endif /* CONFIG_X86_32 */
>

2023-10-13 13:06:29

by Brian Gerst

Subject: Re: [PATCH v3 1/3] x86/entry/64: Convert SYSRET validation tests to C

On Fri, Oct 13, 2023 at 8:39 AM Nikolay Borisov <[email protected]> wrote:
>
>
>
> On 12.10.23 at 1:43, Brian Gerst wrote:
> > Signed-off-by: Brian Gerst <[email protected]>
> > ---
> > arch/x86/entry/common.c | 43 ++++++++++++++++++++++++++-
> > arch/x86/entry/entry_64.S | 53 ++--------------------------------
> > arch/x86/include/asm/syscall.h | 2 +-
> > 3 files changed, 45 insertions(+), 53 deletions(-)
> >
> > diff --git a/arch/x86/entry/common.c b/arch/x86/entry/common.c
> > index 0551bcb197fb..207149a0a9b3 100644
> > --- a/arch/x86/entry/common.c
> > +++ b/arch/x86/entry/common.c
> > @@ -71,7 +71,8 @@ static __always_inline bool do_syscall_x32(struct pt_regs *regs, int nr)
> > return false;
> > }
> >
> > -__visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
> > +/* Returns true to return using SYSRET, or false to use IRET */
> > +__visible noinstr bool do_syscall_64(struct pt_regs *regs, int nr)
> > {
> > add_random_kstack_offset();
> > nr = syscall_enter_from_user_mode(regs, nr);
> > @@ -85,6 +86,46 @@ __visible noinstr void do_syscall_64(struct pt_regs *regs, int nr)
> >
> > instrumentation_end();
> > syscall_exit_to_user_mode(regs);
> > +
> > + /*
> > + * Check that the register state is valid for using SYSRET to exit
> > + * to userspace. Otherwise use the slower but fully capable IRET
> > + * exit path.
> > + */
> > +
> > + /* XEN PV guests always use IRET path */
> > + if (cpu_feature_enabled(X86_FEATURE_XENPV))
> > + return false;
> > +
> > + /* SYSRET requires RCX == RIP and R11 == EFLAGS */
> > + if (unlikely(regs->cx != regs->ip || regs->r11 != regs->flags))
> > + return false;
>
>
> Under what conditions do we expect this not to be true, since we've come
> in via SYSCALL, which adheres to this layout? IOW, isn't this always
> going to be true, so we can simply eliminate the check?

Any syscall that can modify pt_regs, like sigreturn(), exec(), etc.
Also ptrace can change registers.
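
For example, an (untested) tracer sketch along these lines leaves the saved
RIP different from the return address SYSCALL stashed in RCX, so the
RCX == RIP check fails and the exit has to go through IRET
(redirect_tracee()/new_rip are just made-up names for illustration):

/* Untested sketch: rewrite a stopped tracee's RIP from the tracer side. */
#include <stddef.h>
#include <sys/ptrace.h>
#include <sys/types.h>
#include <sys/user.h>
#include <sys/wait.h>

static void redirect_tracee(pid_t pid, unsigned long new_rip)
{
	struct user_regs_struct regs;

	waitpid(pid, NULL, 0);				/* wait for a ptrace stop */
	ptrace(PTRACE_GETREGS, pid, NULL, &regs);	/* read the saved pt_regs */
	regs.rip = new_rip;				/* ip no longer matches cx */
	ptrace(PTRACE_SETREGS, pid, NULL, &regs);
	ptrace(PTRACE_CONT, pid, NULL, NULL);		/* resume the tracee */
}

sigreturn() is the more common case in practice, since restoring the signal
frame replaces the whole register state.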

Brian Gerst