Instead of doing complicated calculations to find the size of the subroutines
(which are even more complicated because they need to be stringified into
an asm statement), just hardcode to 16.
It is less dense for a few combinations of IBT/SLS/retbleed, but it has
the advantage of being really simple.
Suggested-by: Linus Torvalds <[email protected]>
Signed-off-by: Paolo Bonzini <[email protected]>
---
arch/x86/kvm/emulate.c | 17 +++++++----------
1 file changed, 7 insertions(+), 10 deletions(-)
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 0a15b0fec6d9..f8382abe22ff 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -189,13 +189,6 @@
#define X8(x...) X4(x), X4(x)
#define X16(x...) X8(x), X8(x)
-#define NR_FASTOP (ilog2(sizeof(ulong)) + 1)
-#define RET_LENGTH (1 + (4 * IS_ENABLED(CONFIG_RETHUNK)) + \
- IS_ENABLED(CONFIG_SLS))
-#define FASTOP_LENGTH (ENDBR_INSN_SIZE + 7 + RET_LENGTH)
-#define FASTOP_SIZE (8 << ((FASTOP_LENGTH > 8) & 1) << ((FASTOP_LENGTH > 16) & 1))
-static_assert(FASTOP_LENGTH <= FASTOP_SIZE);
-
struct opcode {
u64 flags;
u8 intercept;
@@ -310,9 +303,15 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
* Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for
* different operand sizes can be reached by calculation, rather than a jump
* table (which would be bigger than the code).
+ *
+ * The 16 byte alignment, considering 5 bytes for the RET thunk, 3 for ENDBR
+ * and 1 for the straight line speculation INT3, leaves 7 bytes for the
+ * body of the function. Currently none is larger than 4.
*/
static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop);
+#define FASTOP_SIZE 16
+
#define __FOP_FUNC(name) \
".align " __stringify(FASTOP_SIZE) " \n\t" \
".type " name ", @function \n\t" \
@@ -446,9 +445,7 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop);
* RET | JMP __x86_return_thunk [1,5 bytes; CONFIG_RETHUNK]
* INT3 [1 byte; CONFIG_SLS]
*/
-#define SETCC_LENGTH (ENDBR_INSN_SIZE + 3 + RET_LENGTH)
-#define SETCC_ALIGN (4 << ((SETCC_LENGTH > 4) & 1) << ((SETCC_LENGTH > 8) & 1))
-static_assert(SETCC_LENGTH <= SETCC_ALIGN);
+#define SETCC_ALIGN 16
#define FOP_SETCC(op) \
".align " __stringify(SETCC_ALIGN) " \n\t" \
--
2.31.1
On Fri, Jul 15, 2022 at 07:49:27AM -0400, Paolo Bonzini wrote:
> Instead of doing complicated calculations to find the size of the subroutines
> (which are even more complicated because they need to be stringified into
> an asm statement), just hardcode to 16.
>
> It is less dense for a few combinations of IBT/SLS/retbleed, but it has
> the advantage of being really simple.
>
> Suggested-by: Linus Torvalds <[email protected]>
> Signed-off-by: Paolo Bonzini <[email protected]>
> ---
> arch/x86/kvm/emulate.c | 17 +++++++----------
> 1 file changed, 7 insertions(+), 10 deletions(-)
Acked-by: Borislav Petkov <[email protected]>
--
Regards/Gruss,
Boris.
SUSE Software Solutions Germany GmbH
GF: Ivo Totev, Andrew Myers, Andrew McDonald, Martje Boudien Moerman
(HRB 36809, AG Nürnberg)
On Fri, Jul 15, 2022 at 07:49:27AM -0400, Paolo Bonzini wrote:
> Instead of doing complicated calculations to find the size of the subroutines
> (which are even more complicated because they need to be stringified into
> an asm statement), just hardcode to 16.
>
> It is less dense for a few combinations of IBT/SLS/retbleed, but it has
> the advantage of being really simple.
>
> Suggested-by: Linus Torvalds <[email protected]>
> Signed-off-by: Paolo Bonzini <[email protected]>
> ---
> arch/x86/kvm/emulate.c | 17 +++++++----------
> 1 file changed, 7 insertions(+), 10 deletions(-)
>
> diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
> index 0a15b0fec6d9..f8382abe22ff 100644
> --- a/arch/x86/kvm/emulate.c
> +++ b/arch/x86/kvm/emulate.c
> @@ -189,13 +189,6 @@
> #define X8(x...) X4(x), X4(x)
> #define X16(x...) X8(x), X8(x)
>
> -#define NR_FASTOP (ilog2(sizeof(ulong)) + 1)
> -#define RET_LENGTH (1 + (4 * IS_ENABLED(CONFIG_RETHUNK)) + \
> - IS_ENABLED(CONFIG_SLS))
> -#define FASTOP_LENGTH (ENDBR_INSN_SIZE + 7 + RET_LENGTH)
> -#define FASTOP_SIZE (8 << ((FASTOP_LENGTH > 8) & 1) << ((FASTOP_LENGTH > 16) & 1))
> -static_assert(FASTOP_LENGTH <= FASTOP_SIZE);
> -
> struct opcode {
> u64 flags;
> u8 intercept;
> @@ -310,9 +303,15 @@ static void invalidate_registers(struct x86_emulate_ctxt *ctxt)
> * Moreover, they are all exactly FASTOP_SIZE bytes long, so functions for
> * different operand sizes can be reached by calculation, rather than a jump
> * table (which would be bigger than the code).
> + *
> + * The 16 byte alignment, considering 5 bytes for the RET thunk, 3 for ENDBR
> + * and 1 for the straight line speculation INT3, leaves 7 bytes for the
> + * body of the function. Currently none is larger than 4.
The ENDBR is 4 bytes long, which leaves only 6 bytes left. Which is why the
calculation being replaced may end up with a FASTOP_SIZE of 32.
Can you fix up these numbers when applying?
Cascardo.
> */
> static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop);
>
> +#define FASTOP_SIZE 16
> +
> #define __FOP_FUNC(name) \
> ".align " __stringify(FASTOP_SIZE) " \n\t" \
> ".type " name ", @function \n\t" \
> @@ -446,9 +445,7 @@ static int fastop(struct x86_emulate_ctxt *ctxt, fastop_t fop);
> * RET | JMP __x86_return_thunk [1,5 bytes; CONFIG_RETHUNK]
> * INT3 [1 byte; CONFIG_SLS]
> */
> -#define SETCC_LENGTH (ENDBR_INSN_SIZE + 3 + RET_LENGTH)
> -#define SETCC_ALIGN (4 << ((SETCC_LENGTH > 4) & 1) << ((SETCC_LENGTH > 8) & 1))
> -static_assert(SETCC_LENGTH <= SETCC_ALIGN);
> +#define SETCC_ALIGN 16
>
> #define FOP_SETCC(op) \
> ".align " __stringify(SETCC_ALIGN) " \n\t" \
> --
> 2.31.1
>