2017-12-19 22:17:54

by Junaid Shahid

Subject: [PATCH] crypto: Fix out-of-bounds memory access in generic-gcm-aesni

The aesni_gcm_enc/dec functions can access memory before the start of
the data buffer if the length of the data buffer is less than 16 bytes.
This is because they perform the read via a single 16-byte load. It
didn't matter with rfc4106-gcm-aesni as in that case there were always at
least 16 bytes preceding the data buffer in the form of AAD+IV, but that
is no longer the case with generic-gcm-aesni. This can potentially result
in accessing a page that is not mapped and thus causing the machine to
crash. This patch fixes that by reading the partial block byte-by-byte and
optionally via an 8-byte load if the block was at least 8 bytes.
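
A rough C sketch of the read pattern the fix uses (helper name and types are
hypothetical, not part of the patch): the final len < 16 bytes are copied into
a zero-padded 16-byte block with at most one 8-byte load plus single-byte
loads, so no byte outside [src, src + len) is ever touched.

#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void read_partial_block(const uint8_t *src, size_t len,
			       uint8_t block[16])
{
	size_t i = 0;

	memset(block, 0, 16);		/* zero padding for the final block */
	if (len >= 8) {
		memcpy(block, src, 8);	/* the single 8-byte load */
		i = 8;
	}
	for (; i < len; i++)		/* remaining 1-7 bytes, one at a time */
		block[i] = src[i];
}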

Fixes: 0487ccac ("crypto: aesni - make non-AVX AES-GCM work with any aadlen")
Signed-off-by: Junaid Shahid <[email protected]>
---
arch/x86/crypto/aesni-intel_asm.S | 85 ++++++++++++++++++++-------------------
1 file changed, 43 insertions(+), 42 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 16627fec80b2..a442d4645e91 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -85,6 +85,7 @@ enc: .octa 0x2
# and zero should follow ALL_F
.section .rodata, "a", @progbits
.align 16
+ .octa 0xffffffffffffffffffffffffffffffff
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F: .octa 0xffffffffffffffffffffffffffffffff
.octa 0x00000000000000000000000000000000
@@ -1310,6 +1311,40 @@ _esb_loop_\@:
MOVADQ (%r10),\TMP1
AESENCLAST \TMP1,\XMM0
.endm
+
+# read the last <16 byte block
+# %r11 is the data offset value
+# %r13 is the length of the partial block
+# Clobbers %rax, TMP1-2 and XMM1-2
+.macro READ_PARTIAL_BLOCK TMP1, TMP2, XMM1, XMM2, XMMDst
+ pxor \XMMDst, \XMMDst
+ lea -1(%arg3,%r11,1), \TMP1
+ mov %r13, \TMP2
+ cmp $8, %r13
+ jl _read_last_lt8_encrypt_\@
+ mov 1(\TMP1), %rax
+ MOVQ_R64_XMM %rax, \XMMDst
+ add $8, \TMP1
+ sub $8, \TMP2
+ jz _done_read_last_partial_block_encrypt_\@
+_read_last_lt8_encrypt_\@:
+ shl $8, %rax
+ mov (\TMP1, \TMP2, 1), %al
+ dec \TMP2
+ jnz _read_last_lt8_encrypt_\@
+ MOVQ_R64_XMM %rax, \XMM1
+ # adjust the shuffle mask pointer to be able to shift either 0 or 8
+ # bytes depending on whether the last block is <8 bytes or not
+ mov %r13, \TMP1
+ and $8, \TMP1
+ lea SHIFT_MASK(%rip), %rax
+ sub \TMP1, %rax
+ movdqu (%rax), \XMM2 # get the appropriate shuffle mask
+ PSHUFB_XMM \XMM2, \XMM1 # shift left either 0 or 8 bytes
+ por \XMM1, \XMMDst
+_done_read_last_partial_block_encrypt_\@:
+.endm
+
/*****************************************************************************
* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
* u8 *out, // Plaintext output. Encrypt in-place is allowed.
@@ -1385,14 +1420,6 @@ _esb_loop_\@:
*
* AAD Format with 64-bit Extended Sequence Number
*
-* aadLen:
-* from the definition of the spec, aadLen can only be 8 or 12 bytes.
-* The code supports 16 too but for other sizes, the code will fail.
-*
-* TLen:
-* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
-* For other sizes, the code will fail.
-*
* poly = x^128 + x^127 + x^126 + x^121 + 1
*
*****************************************************************************/
@@ -1486,19 +1513,13 @@ _zero_cipher_left_decrypt:
PSHUFB_XMM %xmm10, %xmm0

ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
- sub $16, %r11
- add %r13, %r11
- movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
- lea SHIFT_MASK+16(%rip), %r12
- sub %r13, %r12
-# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
-# (%r13 is the number of bytes in plaintext mod 16)
- movdqu (%r12), %xmm2 # get the appropriate shuffle mask
- PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 butes
+ READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm3 %xmm1

+ lea ALL_F+16(%rip), %r12
+ sub %r13, %r12
movdqa %xmm1, %xmm2
pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
- movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
+ movdqu (%r12), %xmm1
# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
pand %xmm1, %xmm2
@@ -1507,9 +1528,6 @@ _zero_cipher_left_decrypt:

pxor %xmm2, %xmm8
GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
- # GHASH computation for the last <16 byte block
- sub %r13, %r11
- add $16, %r11

# output %r13 bytes
MOVQ_R64_XMM %xmm0, %rax
@@ -1663,14 +1681,6 @@ ENDPROC(aesni_gcm_dec)
*
* AAD Format with 64-bit Extended Sequence Number
*
-* aadLen:
-* from the definition of the spec, aadLen can only be 8 or 12 bytes.
-* The code supports 16 too but for other sizes, the code will fail.
-*
-* TLen:
-* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
-* For other sizes, the code will fail.
-*
* poly = x^128 + x^127 + x^126 + x^121 + 1
***************************************************************************/
ENTRY(aesni_gcm_enc)
@@ -1763,19 +1773,13 @@ _zero_cipher_left_encrypt:
movdqa SHUF_MASK(%rip), %xmm10
PSHUFB_XMM %xmm10, %xmm0

-
ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
- sub $16, %r11
- add %r13, %r11
- movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks
- lea SHIFT_MASK+16(%rip), %r12
+ READ_PARTIAL_BLOCK %r10 %r12 %xmm2 %xmm3 %xmm1
+
+ lea ALL_F+16(%rip), %r12
sub %r13, %r12
- # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
- # (%r13 is the number of bytes in plaintext mod 16)
- movdqu (%r12), %xmm2 # get the appropriate shuffle mask
- PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 byte
pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
- movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
+ movdqu (%r12), %xmm1
# get the appropriate mask to mask out top 16-r13 bytes of xmm0
pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
movdqa SHUF_MASK(%rip), %xmm10
@@ -1784,9 +1788,6 @@ _zero_cipher_left_encrypt:
pxor %xmm0, %xmm8
GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
# GHASH computation for the last <16 byte block
- sub %r13, %r11
- add $16, %r11
-
movdqa SHUF_MASK(%rip), %xmm10
PSHUFB_XMM %xmm10, %xmm0

--
2.15.1.620.gb9897f4670-goog


2017-12-20 04:43:03

by Junaid Shahid

Subject: [PATCH v2 0/2] Fix out-of-bounds memory accesses in generic-gcm-aesni

Changes in v2:
- Also fixed issue 2 described below, in addition to issue 1 that was fixed in v1

The aesni_gcm_enc/dec functions can access memory before the start or end of
the supplied src buffer. This can happen if either:

1. The data length is less than 16 bytes and there is no AAD or the AAD
length is not enough to cover the underrun. In this case, memory before
the start of the buffer would be accessed.
2. The AAD length is not a multiple of 4 bytes and the data length is too
small to cover the overrun. In this case, memory after the end of the
buffer would be accessed.

This was not a problem when rfc4106-gcm-aesni was the only mode supported by
the aesni module, as in that case there are always enough AAD and IV bytes to
cover the out-of-bounds accesses. However, that is no longer the case with
the generic-gcm-aesni mode. This could potentially result in accessing pages
that are not mapped, thus causing a crash.
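
As a concrete illustration of condition 1 (an assumed layout, not taken from
the patches): if a 4-byte src buffer happens to sit at the very start of a
page, the single 16-byte load the old code uses for the final partial block
begins 12 bytes before that page and can fault. A minimal sketch:

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>
#include <sys/mman.h>

int main(void)
{
	/* Hypothetical layout: 4 bytes of data at the start of a fresh page. */
	uint8_t *buf = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			    MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
	size_t len = 4;

	if (buf == MAP_FAILED)
		return 1;
	/* Old behaviour: one 16-byte load ending at buf + len. */
	printf("old load covers [%#lx, %#lx), starting before the mapping\n",
	       (unsigned long)((uintptr_t)buf + len - 16),
	       (unsigned long)((uintptr_t)buf + len));
	/* New behaviour: only [buf, buf + len) is read, then zero-padded. */
	printf("new read covers [%#lx, %#lx)\n",
	       (unsigned long)(uintptr_t)buf,
	       (unsigned long)((uintptr_t)buf + len));
	return 0;
}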


Junaid Shahid (2):
crypto: Fix out-of-bounds access of the data buffer in
generic-gcm-aesni
crypto: Fix out-of-bounds access of the AAD buffer in
generic-gcm-aesni

arch/x86/crypto/aesni-intel_asm.S | 166 +++++++++++++-------------------------
1 file changed, 54 insertions(+), 112 deletions(-)

--
2.15.1.620.gb9897f4670-goog

2017-12-20 04:43:03

by Junaid Shahid

Subject: [PATCH v2 1/2] crypto: Fix out-of-bounds access of the data buffer in generic-gcm-aesni

The aesni_gcm_enc/dec functions can access memory before the start of
the data buffer if the length of the data buffer is less than 16 bytes.
This is because they perform the read via a single 16-byte load. This
can potentially result in accessing a page that is not mapped and thus
causing the machine to crash. This patch fixes that by reading the
partial block byte-by-byte and optionally via an 8-byte load if the block
was at least 8 bytes.

Fixes: 0487ccac ("crypto: aesni - make non-AVX AES-GCM work with any aadlen")
Signed-off-by: Junaid Shahid <[email protected]>
---
arch/x86/crypto/aesni-intel_asm.S | 86 ++++++++++++++++++++-------------------
1 file changed, 44 insertions(+), 42 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 16627fec80b2..4b16f31cb359 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -85,6 +85,7 @@ enc: .octa 0x2
# and zero should follow ALL_F
.section .rodata, "a", @progbits
.align 16
+ .octa 0xffffffffffffffffffffffffffffffff
SHIFT_MASK: .octa 0x0f0e0d0c0b0a09080706050403020100
ALL_F: .octa 0xffffffffffffffffffffffffffffffff
.octa 0x00000000000000000000000000000000
@@ -256,6 +257,36 @@ aad_shift_arr:
pxor \TMP1, \GH # result is in TMP1
.endm

+# read the last <16 byte block
+# Clobbers %rax, DPTR, TMP1 and XMM1-2
+.macro READ_PARTIAL_BLOCK DPTR DLEN TMP1 XMM1 XMM2 XMMDst
+ pxor \XMMDst, \XMMDst
+ mov \DLEN, \TMP1
+ cmp $8, \DLEN
+ jl _read_last_lt8_\@
+ mov (\DPTR), %rax
+ MOVQ_R64_XMM %rax, \XMMDst
+ add $8, \DPTR
+ sub $8, \TMP1
+ jz _done_read_last_partial_block_\@
+_read_last_lt8_\@:
+ shl $8, %rax
+ mov -1(\DPTR, \TMP1, 1), %al
+ dec \TMP1
+ jnz _read_last_lt8_\@
+ MOVQ_R64_XMM %rax, \XMM1
+ # adjust the shuffle mask pointer to be able to shift either 0 or 8
+ # bytes depending on whether the last block is <8 bytes or not
+ mov \DLEN, \TMP1
+ and $8, \TMP1
+ lea SHIFT_MASK(%rip), %rax
+ sub \TMP1, %rax
+ movdqu (%rax), \XMM2 # get the appropriate shuffle mask
+ PSHUFB_XMM \XMM2, \XMM1 # shift left either 0 or 8 bytes
+ por \XMM1, \XMMDst
+_done_read_last_partial_block_\@:
+.endm
+
/*
* if a = number of total plaintext bytes
* b = floor(a/16)
@@ -1310,6 +1341,7 @@ _esb_loop_\@:
MOVADQ (%r10),\TMP1
AESENCLAST \TMP1,\XMM0
.endm
+
/*****************************************************************************
* void aesni_gcm_dec(void *aes_ctx, // AES Key schedule. Starts on a 16 byte boundary.
* u8 *out, // Plaintext output. Encrypt in-place is allowed.
@@ -1385,14 +1417,6 @@ _esb_loop_\@:
*
* AAD Format with 64-bit Extended Sequence Number
*
-* aadLen:
-* from the definition of the spec, aadLen can only be 8 or 12 bytes.
-* The code supports 16 too but for other sizes, the code will fail.
-*
-* TLen:
-* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
-* For other sizes, the code will fail.
-*
* poly = x^128 + x^127 + x^126 + x^121 + 1
*
*****************************************************************************/
@@ -1486,19 +1510,15 @@ _zero_cipher_left_decrypt:
PSHUFB_XMM %xmm10, %xmm0

ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # E(K, Yn)
- sub $16, %r11
- add %r13, %r11
- movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte block
- lea SHIFT_MASK+16(%rip), %r12
- sub %r13, %r12
-# adjust the shuffle mask pointer to be able to shift 16-%r13 bytes
-# (%r13 is the number of bytes in plaintext mod 16)
- movdqu (%r12), %xmm2 # get the appropriate shuffle mask
- PSHUFB_XMM %xmm2, %xmm1 # right shift 16-%r13 butes

+ lea (%arg3,%r11,1), %r10
+ READ_PARTIAL_BLOCK %r10 %r13 %r12 %xmm2 %xmm3 %xmm1
+
+ lea ALL_F+16(%rip), %r12
+ sub %r13, %r12
movdqa %xmm1, %xmm2
pxor %xmm1, %xmm0 # Ciphertext XOR E(K, Yn)
- movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
+ movdqu (%r12), %xmm1
# get the appropriate mask to mask out top 16-%r13 bytes of %xmm0
pand %xmm1, %xmm0 # mask out top 16-%r13 bytes of %xmm0
pand %xmm1, %xmm2
@@ -1507,9 +1527,6 @@ _zero_cipher_left_decrypt:

pxor %xmm2, %xmm8
GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
- # GHASH computation for the last <16 byte block
- sub %r13, %r11
- add $16, %r11

# output %r13 bytes
MOVQ_R64_XMM %xmm0, %rax
@@ -1663,14 +1680,6 @@ ENDPROC(aesni_gcm_dec)
*
* AAD Format with 64-bit Extended Sequence Number
*
-* aadLen:
-* from the definition of the spec, aadLen can only be 8 or 12 bytes.
-* The code supports 16 too but for other sizes, the code will fail.
-*
-* TLen:
-* from the definition of the spec, TLen can only be 8, 12 or 16 bytes.
-* For other sizes, the code will fail.
-*
* poly = x^128 + x^127 + x^126 + x^121 + 1
***************************************************************************/
ENTRY(aesni_gcm_enc)
@@ -1763,19 +1772,15 @@ _zero_cipher_left_encrypt:
movdqa SHUF_MASK(%rip), %xmm10
PSHUFB_XMM %xmm10, %xmm0

-
ENCRYPT_SINGLE_BLOCK %xmm0, %xmm1 # Encrypt(K, Yn)
- sub $16, %r11
- add %r13, %r11
- movdqu (%arg3,%r11,1), %xmm1 # receive the last <16 byte blocks
- lea SHIFT_MASK+16(%rip), %r12
+
+ lea (%arg3,%r11,1), %r10
+ READ_PARTIAL_BLOCK %r10 %r13 %r12 %xmm2 %xmm3 %xmm1
+
+ lea ALL_F+16(%rip), %r12
sub %r13, %r12
- # adjust the shuffle mask pointer to be able to shift 16-r13 bytes
- # (%r13 is the number of bytes in plaintext mod 16)
- movdqu (%r12), %xmm2 # get the appropriate shuffle mask
- PSHUFB_XMM %xmm2, %xmm1 # shift right 16-r13 byte
pxor %xmm1, %xmm0 # Plaintext XOR Encrypt(K, Yn)
- movdqu ALL_F-SHIFT_MASK(%r12), %xmm1
+ movdqu (%r12), %xmm1
# get the appropriate mask to mask out top 16-r13 bytes of xmm0
pand %xmm1, %xmm0 # mask out top 16-r13 bytes of xmm0
movdqa SHUF_MASK(%rip), %xmm10
@@ -1784,9 +1789,6 @@ _zero_cipher_left_encrypt:
pxor %xmm0, %xmm8
GHASH_MUL %xmm8, %xmm13, %xmm9, %xmm10, %xmm11, %xmm5, %xmm6
# GHASH computation for the last <16 byte block
- sub %r13, %r11
- add $16, %r11
-
movdqa SHUF_MASK(%rip), %xmm10
PSHUFB_XMM %xmm10, %xmm0

--
2.15.1.620.gb9897f4670-goog

2017-12-20 04:43:04

by Junaid Shahid

Subject: [PATCH v2 2/2] crypto: Fix out-of-bounds access of the AAD buffer in generic-gcm-aesni

The aesni_gcm_enc/dec functions can access memory after the end of
the AAD buffer if the AAD length is not a multiple of 4 bytes.
It didn't matter with rfc4106-gcm-aesni as in that case the AAD was
always followed by the 8 byte IV, but that is no longer the case with
generic-gcm-aesni. This can potentially result in accessing a page that
is not mapped and thus causing the machine to crash. This patch fixes
that by reading the last <16 byte block of the AAD byte-by-byte and
optionally via an 8-byte load if the block was at least 8 bytes.
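
A simplified model of the overrun (helper names hypothetical, not part of the
patch): the old AAD tail handling consumed the trailing partial block with
4-byte and 8-byte loads, so the number of bytes it touched was the remainder
rounded up to a multiple of 4, i.e. up to 3 bytes past the end of the buffer
whenever the AAD length is not a multiple of 4. The new READ_PARTIAL_BLOCK
path reads exactly aadlen bytes.

#include <stddef.h>

/* Bytes the old code loaded for the trailing partial AAD block. */
static size_t old_partial_aad_bytes_read(size_t aadlen)
{
	size_t rem = aadlen % 16;	/* length of the trailing partial block */

	return (rem + 3) / 4 * 4;	/* e.g. rem = 13 reads 16 bytes */
}

/* Bytes read past the end of the AAD: 0-3, nonzero iff aadlen % 4 != 0. */
static size_t old_aad_overrun(size_t aadlen)
{
	return old_partial_aad_bytes_read(aadlen) - aadlen % 16;
}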

Fixes: 0487ccac ("crypto: aesni - make non-AVX AES-GCM work with any aadlen")
Signed-off-by: Junaid Shahid <[email protected]>
---
arch/x86/crypto/aesni-intel_asm.S | 80 +++++----------------------------------
1 file changed, 10 insertions(+), 70 deletions(-)

diff --git a/arch/x86/crypto/aesni-intel_asm.S b/arch/x86/crypto/aesni-intel_asm.S
index 4b16f31cb359..03846f2d32c9 100644
--- a/arch/x86/crypto/aesni-intel_asm.S
+++ b/arch/x86/crypto/aesni-intel_asm.S
@@ -309,7 +309,7 @@ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
pxor \XMM2, \XMM2

cmp $16, %r11
- jl _get_AAD_rest8\num_initial_blocks\operation
+ jl _get_AAD_rest\num_initial_blocks\operation
_get_AAD_blocks\num_initial_blocks\operation:
movdqu (%r10), %xmm\i
PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
@@ -322,43 +322,13 @@ _get_AAD_blocks\num_initial_blocks\operation:
jge _get_AAD_blocks\num_initial_blocks\operation

movdqu \XMM2, %xmm\i
+
+ /* read the last <16B of AAD */
+_get_AAD_rest\num_initial_blocks\operation:
cmp $0, %r11
je _get_AAD_done\num_initial_blocks\operation

- pxor %xmm\i,%xmm\i
-
- /* read the last <16B of AAD. since we have at least 4B of
- data right after the AAD (the ICV, and maybe some CT), we can
- read 4B/8B blocks safely, and then get rid of the extra stuff */
-_get_AAD_rest8\num_initial_blocks\operation:
- cmp $4, %r11
- jle _get_AAD_rest4\num_initial_blocks\operation
- movq (%r10), \TMP1
- add $8, %r10
- sub $8, %r11
- pslldq $8, \TMP1
- psrldq $8, %xmm\i
- pxor \TMP1, %xmm\i
- jmp _get_AAD_rest8\num_initial_blocks\operation
-_get_AAD_rest4\num_initial_blocks\operation:
- cmp $0, %r11
- jle _get_AAD_rest0\num_initial_blocks\operation
- mov (%r10), %eax
- movq %rax, \TMP1
- add $4, %r10
- sub $4, %r10
- pslldq $12, \TMP1
- psrldq $4, %xmm\i
- pxor \TMP1, %xmm\i
-_get_AAD_rest0\num_initial_blocks\operation:
- /* finalize: shift out the extra bytes we read, and align
- left. since pslldq can only shift by an immediate, we use
- vpshufb and an array of shuffle masks */
- movq %r12, %r11
- salq $4, %r11
- movdqu aad_shift_arr(%r11), \TMP1
- PSHUFB_XMM \TMP1, %xmm\i
-_get_AAD_rest_final\num_initial_blocks\operation:
+ READ_PARTIAL_BLOCK %r10, %r12, %r11, \TMP1, \TMP2, %xmm\i
PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
pxor \XMM2, %xmm\i
GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
@@ -568,7 +538,7 @@ XMM2 XMM3 XMM4 XMMDst TMP6 TMP7 i i_seq operation
pxor \XMM2, \XMM2

cmp $16, %r11
- jl _get_AAD_rest8\num_initial_blocks\operation
+ jl _get_AAD_rest\num_initial_blocks\operation
_get_AAD_blocks\num_initial_blocks\operation:
movdqu (%r10), %xmm\i
PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
@@ -581,43 +551,13 @@ _get_AAD_blocks\num_initial_blocks\operation:
jge _get_AAD_blocks\num_initial_blocks\operation

movdqu \XMM2, %xmm\i
+
+ /* read the last <16B of AAD */
+_get_AAD_rest\num_initial_blocks\operation:
cmp $0, %r11
je _get_AAD_done\num_initial_blocks\operation

- pxor %xmm\i,%xmm\i
-
- /* read the last <16B of AAD. since we have at least 4B of
- data right after the AAD (the ICV, and maybe some PT), we can
- read 4B/8B blocks safely, and then get rid of the extra stuff */
-_get_AAD_rest8\num_initial_blocks\operation:
- cmp $4, %r11
- jle _get_AAD_rest4\num_initial_blocks\operation
- movq (%r10), \TMP1
- add $8, %r10
- sub $8, %r11
- pslldq $8, \TMP1
- psrldq $8, %xmm\i
- pxor \TMP1, %xmm\i
- jmp _get_AAD_rest8\num_initial_blocks\operation
-_get_AAD_rest4\num_initial_blocks\operation:
- cmp $0, %r11
- jle _get_AAD_rest0\num_initial_blocks\operation
- mov (%r10), %eax
- movq %rax, \TMP1
- add $4, %r10
- sub $4, %r10
- pslldq $12, \TMP1
- psrldq $4, %xmm\i
- pxor \TMP1, %xmm\i
-_get_AAD_rest0\num_initial_blocks\operation:
- /* finalize: shift out the extra bytes we read, and align
- left. since pslldq can only shift by an immediate, we use
- vpshufb and an array of shuffle masks */
- movq %r12, %r11
- salq $4, %r11
- movdqu aad_shift_arr(%r11), \TMP1
- PSHUFB_XMM \TMP1, %xmm\i
-_get_AAD_rest_final\num_initial_blocks\operation:
+ READ_PARTIAL_BLOCK %r10, %r12, %r11, \TMP1, \TMP2, %xmm\i
PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
pxor \XMM2, %xmm\i
GHASH_MUL %xmm\i, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
--
2.15.1.620.gb9897f4670-goog

2017-12-20 08:36:19

by Eric Biggers

Subject: Re: [PATCH v2 1/2] crypto: Fix out-of-bounds access of the data buffer in generic-gcm-aesni

On Tue, Dec 19, 2017 at 08:42:58PM -0800, Junaid Shahid wrote:
> The aesni_gcm_enc/dec functions can access memory before the start of
> the data buffer if the length of the data buffer is less than 16 bytes.
> This is because they perform the read via a single 16-byte load. This
> can potentially result in accessing a page that is not mapped and thus
> causing the machine to crash. This patch fixes that by reading the
> partial block byte-by-byte and optionally via an 8-byte load if the block
> was at least 8 bytes.
>
> Fixes: 0487ccac ("crypto: aesni - make non-AVX AES-GCM work with any aadlen")
> Signed-off-by: Junaid Shahid <[email protected]>

Did you run the self-tests (boot with CONFIG_CRYPTO_MANAGER_DISABLE_TESTS
unset)? The second patch causes them to start failing:

[ 1.169640] alg: aead: Test 7 failed on encryption for rfc4106-gcm-aesni
[ 1.178308] alg: aead: Test 4 failed on encryption for generic-gcm-aesni

Also, don't the AVX and AVX2 versions have the same bug? These patches only fix
the non-AVX version.

> +# read the last <16 byte block
> +# Clobbers %rax, DPTR, TMP1 and XMM1-2
> +.macro READ_PARTIAL_BLOCK DPTR DLEN TMP1 XMM1 XMM2 XMMDst
> + pxor \XMMDst, \XMMDst
> + mov \DLEN, \TMP1
> + cmp $8, \DLEN
> + jl _read_last_lt8_\@
> + mov (\DPTR), %rax
> + MOVQ_R64_XMM %rax, \XMMDst
> + add $8, \DPTR
> + sub $8, \TMP1
> + jz _done_read_last_partial_block_\@
> +_read_last_lt8_\@:
> + shl $8, %rax
> + mov -1(\DPTR, \TMP1, 1), %al
> + dec \TMP1
> + jnz _read_last_lt8_\@
> + MOVQ_R64_XMM %rax, \XMM1

Doesn't %rax need to be zeroed at '_read_last_lt8' so that the padding is
zeroed?

> + # adjust the shuffle mask pointer to be able to shift either 0 or 8
> + # bytes depending on whether the last block is <8 bytes or not
> + mov \DLEN, \TMP1
> + and $8, \TMP1
> + lea SHIFT_MASK(%rip), %rax
> + sub \TMP1, %rax
> + movdqu (%rax), \XMM2 # get the appropriate shuffle mask
> + PSHUFB_XMM \XMM2, \XMM1 # shift left either 0 or 8 bytes

Is there any way this can be simplified to use pslldq? The shift is just 8
bytes or nothing, and we already had to branch earlier depending on whether we
needed to read the 8 bytes or not.

Eric

2017-12-20 08:42:13

by Eric Biggers

Subject: Re: [PATCH v2 2/2] crypto: Fix out-of-bounds access of the AAD buffer in generic-gcm-aesni

On Tue, Dec 19, 2017 at 08:42:59PM -0800, Junaid Shahid wrote:
> The aesni_gcm_enc/dec functions can access memory after the end of
> the AAD buffer if the AAD length is not a multiple of 4 bytes.
> It didn't matter with rfc4106-gcm-aesni as in that case the AAD was
> always followed by the 8 byte IV, but that is no longer the case with
> generic-gcm-aesni. This can potentially result in accessing a page that
> is not mapped and thus causing the machine to crash. This patch fixes
> that by reading the last <16 byte block of the AAD byte-by-byte and
> optionally via an 8-byte load if the block was at least 8 bytes.
>
> Fixes: 0487ccac ("crypto: aesni - make non-AVX AES-GCM work with any aadlen")
[...]
> -_get_AAD_rest0\num_initial_blocks\operation:
> - /* finalize: shift out the extra bytes we read, and align
> - left. since pslldq can only shift by an immediate, we use
> - vpshufb and an array of shuffle masks */
> - movq %r12, %r11
> - salq $4, %r11
> - movdqu aad_shift_arr(%r11), \TMP1
> - PSHUFB_XMM \TMP1, %xmm\i

aad_shift_arr is no longer used, so it should be removed.

> -_get_AAD_rest_final\num_initial_blocks\operation:
> + READ_PARTIAL_BLOCK %r10, %r12, %r11, \TMP1, \TMP2, %xmm\i

It seems that INITIAL_BLOCKS_DEC and INITIAL_BLOCKS_ENC maintain both %r11 and
%r12 as the AAD length. %r11 is used for real earlier, but here %r12 is used
for real while %r11 is a temporary register. Can this be simplified to have the
AAD length in %r11 only?

Eric

2017-12-20 19:28:30

by Junaid Shahid

Subject: Re: [PATCH v2 1/2] crypto: Fix out-of-bounds access of the data buffer in generic-gcm-aesni

On Wednesday, December 20, 2017 12:36:16 AM PST Eric Biggers wrote:
>
> Did you run the self-tests (boot with CONFIG_CRYPTO_MANAGER_DISABLE_TESTS
> unset)? The second patch causes them to start failing:
>
> [ 1.169640] alg: aead: Test 7 failed on encryption for rfc4106-gcm-aesni
> [ 1.178308] alg: aead: Test 4 failed on encryption for generic-gcm-aesni

Thanks for the pointer. Let me run the self-tests and see.

>
> Also, don't the AVX and AVX2 versions have the same bug? These patches only fix
> the non-AVX version.

The AVX/AVX2 versions are used only when the data length is at least
640/4K bytes respectively. Therefore, the first bug doesn’t apply at all.
The second bug does exist, but it won’t cause any ill effect at the
present time because the overrun will be covered by the data bytes.
However, I am planning to send out a separate patch series that allows
for non-contiguous AAD, data and AuthTag buffers, which will cause the
AAD overrun to manifest even in the AVX versions, so I am going to
include the AVX version fixes in that series.

>
> > +# read the last <16 byte block
> > +# Clobbers %rax, DPTR, TMP1 and XMM1-2
> > +.macro READ_PARTIAL_BLOCK DPTR DLEN TMP1 XMM1 XMM2 XMMDst
> > + pxor \XMMDst, \XMMDst
> > + mov \DLEN, \TMP1
> > + cmp $8, \DLEN
> > + jl _read_last_lt8_\@
> > + mov (\DPTR), %rax
> > + MOVQ_R64_XMM %rax, \XMMDst
> > + add $8, \DPTR
> > + sub $8, \TMP1
> > + jz _done_read_last_partial_block_\@
> > +_read_last_lt8_\@:
> > + shl $8, %rax
> > + mov -1(\DPTR, \TMP1, 1), %al
> > + dec \TMP1
> > + jnz _read_last_lt8_\@
> > + MOVQ_R64_XMM %rax, \XMM1
>
> Doesn't %rax needs to be zeroed at '_read_last_lt8' so that the padding is
> zeroed?
>

Ah, yes. It actually isn’t needed for the first patch, as in that case
the result returned by this macro is masked appropriately at the
call-sites anyway. But that is not the case for the second patch. That is
probably also the reason for the failures that you noticed.
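
To spell out the failure mode (a minimal C model of the macro's byte loop,
names hypothetical): each byte is shifted into an accumulator, so any stale
bits already in the accumulator end up in the padding bytes of the result.
The call sites in patch 1 mask the block down to the valid bytes afterwards,
which discards that garbage; the AAD path in patch 2 feeds the block straight
into GHASH, so the garbage corrupts the hash.

#include <stddef.h>
#include <stdint.h>

/* Mirrors the asm loop: shl $8, %rax; mov -1(DPTR, TMP1, 1), %al; dec; jnz */
static uint64_t read_tail_bytes(const uint8_t *p, size_t n, uint64_t acc)
{
	while (n--)
		acc = (acc << 8) | p[n];
	return acc;		/* correct only if acc started out as zero */
}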

> > + # adjust the shuffle mask pointer to be able to shift either 0 or 8
> > + # bytes depending on whether the last block is <8 bytes or not
> > + mov \DLEN, \TMP1
> > + and $8, \TMP1
> > + lea SHIFT_MASK(%rip), %rax
> > + sub \TMP1, %rax
> > + movdqu (%rax), \XMM2 # get the appropriate shuffle mask
> > + PSHUFB_XMM \XMM2, \XMM1 # shift left either 0 or 8 bytes
>
> Is there any way this can be simplified to use pslldq? The shift is just 8
> bytes or nothing, and we already had to branch earlier depending on whether we
> needed to read the 8 bytes or not.

I am not sure if we can use a simple pslldq without either introducing another
branch, or duplicating the _read_last_lt8 block for each case of the original
branch. Do you think that it is better to just duplicate the _read_last_lt8
block instead of using pshufb? Or do you have any other suggestion about how
to do it?

Thanks,
Junaid

2017-12-20 19:35:47

by Junaid Shahid

Subject: Re: [PATCH v2 2/2] crypto: Fix out-of-bounds access of the AAD buffer in generic-gcm-aesni

On Wednesday, December 20, 2017 12:42:10 AM PST Eric Biggers wrote:
> > -_get_AAD_rest0\num_initial_blocks\operation:
> > - /* finalize: shift out the extra bytes we read, and align
> > - left. since pslldq can only shift by an immediate, we use
> > - vpshufb and an array of shuffle masks */
> > - movq %r12, %r11
> > - salq $4, %r11
> > - movdqu aad_shift_arr(%r11), \TMP1
> > - PSHUFB_XMM \TMP1, %xmm\i
>
> aad_shift_arr is no longer used, so it should be removed.

Ack.

>
> > -_get_AAD_rest_final\num_initial_blocks\operation:
> > + READ_PARTIAL_BLOCK %r10, %r12, %r11, \TMP1, \TMP2, %xmm\i
>
> It seems that INITIAL_BLOCKS_DEC and INITIAL_BLOCKS_ENC maintain both %r11 and
> %r12 as the AAD length. %r11 is used for real earlier, but here %r12 is used
> for real while %r11 is a temporary register. Can this be simplified to have the
> AAD length in %r11 only?
>

We do need both registers, though we could certainly swap their usage to make
r12 the temp register. The reason we need the second register is because we
need to keep the original length to perform the pshufb at the end. But, of
course, that will not be needed anymore if we avoid the pshufb by duplicating
the _read_last_lt8 block or utilizing pslldq some other way.

Thanks,
Junaid

2017-12-20 21:05:24

by Eric Biggers

Subject: Re: [PATCH v2 1/2] crypto: Fix out-of-bounds access of the data buffer in generic-gcm-aesni

On Wed, Dec 20, 2017 at 11:28:27AM -0800, Junaid Shahid wrote:
> > > + # adjust the shuffle mask pointer to be able to shift either 0 or 8
> > > + # bytes depending on whether the last block is <8 bytes or not
> > > + mov \DLEN, \TMP1
> > > + and $8, \TMP1
> > > + lea SHIFT_MASK(%rip), %rax
> > > + sub \TMP1, %rax
> > > + movdqu (%rax), \XMM2 # get the appropriate shuffle mask
> > > + PSHUFB_XMM \XMM2, \XMM1 # shift left either 0 or 8 bytes
> >
> > Is there any way this can be simplified to use pslldq? The shift is just 8
> > bytes or nothing, and we already had to branch earlier depending on whether we
> > needed to read the 8 bytes or not.
>
> I am not sure if we can use a simple pslldq without either introducing another
> branch, or duplicating the _read_last_lt8 block for each case of the original
> branch. Do you think that it is better to just duplicate the _read_last_lt8
> block instead of using pshufb? Or do you have any other suggestion about how
> to do it?
>

The best I can come up with now is just duplicating the "read one byte at a
time" instructions (see below). One way to avoid the duplication would be to
read the 1-7 byte remainder (end of the block) first and the 8-byte word
(beginning of the block) second, but it would be a bit weird.

# read the last <16 byte block
# Clobbers %rax, TMP1 and XMM1
.macro READ_PARTIAL_BLOCK DPTR DLEN TMP1 XMM1 XMMDst
mov \DLEN, \TMP1
cmp $8, \DLEN
jl _read_partial_lt8_\@
mov (\DPTR), %rax
MOVQ_R64_XMM %rax, \XMMDst
sub $8, \TMP1
jz _done_read_partial_\@
xor %rax, %rax
1:
shl $8, %rax
mov 7(\DPTR, \TMP1, 1), %al
dec \TMP1
jnz 1b
MOVQ_R64_XMM %rax, \XMM1
pslldq $8, \XMM1
por \XMM1, \XMMDst
jmp _done_read_partial_\@

_read_partial_lt8_\@:
xor %rax, %rax
1:
shl $8, %rax
mov -1(\DPTR, \TMP1, 1), %al
dec \TMP1
jnz 1b
MOVQ_R64_XMM %rax, \XMMDst
_done_read_partial_\@:
.endm

2017-12-20 21:12:58

by Eric Biggers

Subject: Re: [PATCH v2 2/2] crypto: Fix out-of-bounds access of the AAD buffer in generic-gcm-aesni

On Wed, Dec 20, 2017 at 11:35:44AM -0800, Junaid Shahid wrote:
> On Wednesday, December 20, 2017 12:42:10 AM PST Eric Biggers wrote:
> > > -_get_AAD_rest0\num_initial_blocks\operation:
> > > - /* finalize: shift out the extra bytes we read, and align
> > > - left. since pslldq can only shift by an immediate, we use
> > > - vpshufb and an array of shuffle masks */
> > > - movq %r12, %r11
> > > - salq $4, %r11
> > > - movdqu aad_shift_arr(%r11), \TMP1
> > > - PSHUFB_XMM \TMP1, %xmm\i
> >
> > aad_shift_arr is no longer used, so it should be removed.
>
> Ack.
>
> >
> > > -_get_AAD_rest_final\num_initial_blocks\operation:
> > > + READ_PARTIAL_BLOCK %r10, %r12, %r11, \TMP1, \TMP2, %xmm\i
> >
> > It seems that INITIAL_BLOCKS_DEC and INITIAL_BLOCKS_ENC maintain both %r11 and
> > %r12 as the AAD length. %r11 is used for real earlier, but here %r12 is used
> > for real while %r11 is a temporary register. Can this be simplified to have the
> > AAD length in %r11 only?
> >
>
> We do need both registers, though we could certainly swap their usage to make
> r12 the temp register. The reason we need the second register is because we
> need to keep the original length to perform the pshufb at the end. But, of
> course, that will not be needed anymore if we avoid the pshufb by duplicating
> the _read_last_lt8 block or utilizing pslldq some other way.
>

If READ_PARTIAL_BLOCK can clobber 'DLEN' that would simplify it even more (no
need for 'TMP1'), but what I am talking about here is how INITIAL_BLOCKS_DEC and
INITIAL_BLOCKS_ENC maintain two copies of the remaining length in lock-step in
r11 and r12:

_get_AAD_blocks\num_initial_blocks\operation:
movdqu (%r10), %xmm\i
PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
pxor %xmm\i, \XMM2
GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
add $16, %r10
sub $16, %r12
sub $16, %r11
cmp $16, %r11
jge _get_AAD_blocks\num_initial_blocks\operation

The code which you are replacing with READ_PARTIAL_BLOCK actually needed the two
copies, but now it seems that only one copy is needed, so it can be simplified
by only using r11.

Eric

2017-12-20 21:51:29

by Junaid Shahid

Subject: Re: [PATCH v2 2/2] crypto: Fix out-of-bounds access of the AAD buffer in generic-gcm-aesni

On Wednesday, December 20, 2017 1:12:54 PM PST Eric Biggers wrote:
> >
> > We do need both registers, though we could certainly swap their usage to make
> > r12 the temp register. The reason we need the second register is because we
> > need to keep the original length to perform the pshufb at the end. But, of
> > course, that will not be needed anymore if we avoid the pshufb by duplicating
> > the _read_last_lt8 block or utilizing pslldq some other way.
> >
>
> If READ_PARTIAL_BLOCK can clobber 'DLEN' that would simplify it even more (no
> need for 'TMP1'), but what I am talking about here is how INITIAL_BLOCKS_DEC and
> INITIAL_BLOCKS_ENC maintain two copies of the remaining length in lock-step in
> r11 and r12:
>
> _get_AAD_blocks\num_initial_blocks\operation:
> movdqu (%r10), %xmm\i
> PSHUFB_XMM %xmm14, %xmm\i # byte-reflect the AAD data
> pxor %xmm\i, \XMM2
> GHASH_MUL \XMM2, \TMP3, \TMP1, \TMP2, \TMP4, \TMP5, \XMM1
> add $16, %r10
> sub $16, %r12
> sub $16, %r11
> cmp $16, %r11
> jge _get_AAD_blocks\num_initial_blocks\operation
>
> The code which you are replacing with READ_PARTIAL_BLOCK actually needed the two
> copies, but now it seems that only one copy is needed, so it can be simplified
> by only using r11.
>

Sorry, I misunderstood earlier. I’ll remove the extra register from the
preceding code in INITIAL_BLOCKS_ENC/DEC.

Thanks,
Junaid