2018-05-03 14:52:07

by Austin Morton

Subject: [PATCH 1/1] sbc: Use extended inline assembly for armv6 primitives

GCC 5+ may generate incorrect code around calls to a naked function
that modifies certain registers when optimizations are enabled.

Specifically, in this case the arithmetic that increments "out" between
the calls to sbc_analyze_xxx in sbc_analyze_4b_xxx is generated to keep
its result in r3 without saving it across the calls. The inline
assembly clobbers that register and never pushes it to the stack.

While this does appear to be a bug in GCC, the recommendation on the
GCC bug tracker was to simply avoid using naked functions.

See GCC bug 85593.
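
To illustrate the failure mode, here is a minimal sketch (the names are
illustrative, not the actual sbc code; the likely trigger is
interprocedural register allocation, -fipa-ra, enabled by default at
-O2 since GCC 5, which sees no clobbers in a naked function's body):

#include <stdint.h>

/* A naked function gets no compiler-generated prologue/epilogue,
 * so nothing saves the registers its asm body scratches. */
static void __attribute__((naked)) store_one_naked(void)
{
	/* r0 = out; r3 is scratched and never restored */
	__asm__ volatile (
		"mov r3, #1\n"
		"str r3, [r0]\n"
		"bx lr\n"
	);
}

/* Called through a cast, just as the sbc_analyze_four() macro
 * calls sbc_analyze_four_armv6(). */
#define store_one(out) \
	((void (*)(int32_t *)) store_one_naked)((out))

void caller(int32_t *out, int stride)
{
	/* With -fipa-ra the compiler may decide the callee clobbers
	 * nothing and keep out + stride in r3 across the first call,
	 * where the asm above silently destroys it. */
	store_one(out);
	out += stride;
	store_one(out);
}

Converting the primitives into ordinary functions whose extended asm
names every scratched register in its clobber list lets the compiler
keep live values out of those registers, or spill them, around the asm.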
---
sbc/sbc_primitives_armv6.c | 216 ++++++++++++++++++-------------------
1 file changed, 104 insertions(+), 112 deletions(-)

diff --git a/sbc/sbc_primitives_armv6.c b/sbc/sbc_primitives_armv6.c
index 665f157..9c58afc 100644
--- a/sbc/sbc_primitives_armv6.c
+++ b/sbc/sbc_primitives_armv6.c
@@ -38,58 +38,55 @@

#ifdef SBC_BUILD_WITH_ARMV6_SUPPORT

-static void __attribute__((naked)) sbc_analyze_four_armv6()
+static void sbc_analyze_four_armv6(int16_t *in, int32_t *out, const FIXED_T* consts)
{
- /* r0 = in, r1 = out, r2 = consts */
__asm__ volatile (
- "push {r1, r4-r7, lr}\n"
- "push {r8-r11}\n"
- "ldrd r4, r5, [r0, #0]\n"
- "ldrd r6, r7, [r2, #0]\n"
- "ldrd r8, r9, [r0, #16]\n"
- "ldrd r10, r11, [r2, #16]\n"
+ "ldrd r4, r5, [%1, #0]\n"
+ "ldrd r6, r7, [%2, #0]\n"
+ "ldrd r8, r9, [%1, #16]\n"
+ "ldrd r10, r11, [%2, #16]\n"
"mov r14, #0x8000\n"
"smlad r3, r4, r6, r14\n"
"smlad r12, r5, r7, r14\n"
- "ldrd r4, r5, [r0, #32]\n"
- "ldrd r6, r7, [r2, #32]\n"
+ "ldrd r4, r5, [%1, #32]\n"
+ "ldrd r6, r7, [%2, #32]\n"
"smlad r3, r8, r10, r3\n"
"smlad r12, r9, r11, r12\n"
- "ldrd r8, r9, [r0, #48]\n"
- "ldrd r10, r11, [r2, #48]\n"
+ "ldrd r8, r9, [%1, #48]\n"
+ "ldrd r10, r11, [%2, #48]\n"
"smlad r3, r4, r6, r3\n"
"smlad r12, r5, r7, r12\n"
- "ldrd r4, r5, [r0, #64]\n"
- "ldrd r6, r7, [r2, #64]\n"
+ "ldrd r4, r5, [%1, #64]\n"
+ "ldrd r6, r7, [%2, #64]\n"
"smlad r3, r8, r10, r3\n"
"smlad r12, r9, r11, r12\n"
- "ldrd r8, r9, [r0, #8]\n"
- "ldrd r10, r11, [r2, #8]\n"
+ "ldrd r8, r9, [%1, #8]\n"
+ "ldrd r10, r11, [%2, #8]\n"
"smlad r3, r4, r6, r3\n" /* t1[0] is done */
"smlad r12, r5, r7, r12\n" /* t1[1] is done */
- "ldrd r4, r5, [r0, #24]\n"
- "ldrd r6, r7, [r2, #24]\n"
+ "ldrd r4, r5, [%1, #24]\n"
+ "ldrd r6, r7, [%2, #24]\n"
"pkhtb r3, r12, r3, asr #16\n" /* combine t1[0] and t1[1] */
"smlad r12, r8, r10, r14\n"
"smlad r14, r9, r11, r14\n"
- "ldrd r8, r9, [r0, #40]\n"
- "ldrd r10, r11, [r2, #40]\n"
+ "ldrd r8, r9, [%1, #40]\n"
+ "ldrd r10, r11, [%2, #40]\n"
"smlad r12, r4, r6, r12\n"
"smlad r14, r5, r7, r14\n"
- "ldrd r4, r5, [r0, #56]\n"
- "ldrd r6, r7, [r2, #56]\n"
+ "ldrd r4, r5, [%1, #56]\n"
+ "ldrd r6, r7, [%2, #56]\n"
"smlad r12, r8, r10, r12\n"
"smlad r14, r9, r11, r14\n"
- "ldrd r8, r9, [r0, #72]\n"
- "ldrd r10, r11, [r2, #72]\n"
+ "ldrd r8, r9, [%1, #72]\n"
+ "ldrd r10, r11, [%2, #72]\n"
"smlad r12, r4, r6, r12\n"
"smlad r14, r5, r7, r14\n"
- "ldrd r4, r5, [r2, #80]\n" /* start loading cos table */
+ "ldrd r4, r5, [%2, #80]\n" /* start loading cos table */
"smlad r12, r8, r10, r12\n" /* t1[2] is done */
"smlad r14, r9, r11, r14\n" /* t1[3] is done */
- "ldrd r6, r7, [r2, #88]\n"
- "ldrd r8, r9, [r2, #96]\n"
- "ldrd r10, r11, [r2, #104]\n" /* cos table fully loaded */
+ "ldrd r6, r7, [%2, #88]\n"
+ "ldrd r8, r9, [%2, #96]\n"
+ "ldrd r10, r11, [%2, #104]\n" /* cos table fully loaded */
"pkhtb r12, r14, r12, asr #16\n" /* combine t1[2] and t1[3] */
"smuad r4, r3, r4\n"
"smuad r5, r3, r5\n"
@@ -99,196 +96,191 @@ static void __attribute__((naked)) sbc_analyze_four_armv6()
"smuad r7, r3, r7\n"
"smlad r6, r12, r10, r6\n"
"smlad r7, r12, r11, r7\n"
- "pop {r8-r11}\n"
- "stmia r1, {r4, r5, r6, r7}\n"
- "pop {r1, r4-r7, pc}\n"
+ "stmia %0, {r4, r5, r6, r7}\n"
+ : "+r" (out)
+ : "r" (in), "r" (consts)
+ : "memory",
+ "r3", "r4", "r5", "r6", "r7", "r8",
+ "r9", "r10", "r11", "r12", "r14"
);
}

-#define sbc_analyze_four(in, out, consts) \
- ((void (*)(int16_t *, int32_t *, const FIXED_T*)) \
- sbc_analyze_four_armv6)((in), (out), (consts))
-
-static void __attribute__((naked)) sbc_analyze_eight_armv6()
+static void sbc_analyze_eight_armv6(int16_t *in, int32_t *out, const FIXED_T* consts)
{
- /* r0 = in, r1 = out, r2 = consts */
__asm__ volatile (
- "push {r1, r4-r7, lr}\n"
- "push {r8-r11}\n"
- "ldrd r4, r5, [r0, #24]\n"
- "ldrd r6, r7, [r2, #24]\n"
- "ldrd r8, r9, [r0, #56]\n"
- "ldrd r10, r11, [r2, #56]\n"
+ "ldrd r4, r5, [%1, #24]\n"
+ "ldrd r6, r7, [%2, #24]\n"
+ "ldrd r8, r9, [%1, #56]\n"
+ "ldrd r10, r11, [%2, #56]\n"
"mov r14, #0x8000\n"
"smlad r3, r4, r6, r14\n"
"smlad r12, r5, r7, r14\n"
- "ldrd r4, r5, [r0, #88]\n"
- "ldrd r6, r7, [r2, #88]\n"
+ "ldrd r4, r5, [%1, #88]\n"
+ "ldrd r6, r7, [%2, #88]\n"
"smlad r3, r8, r10, r3\n"
"smlad r12, r9, r11, r12\n"
- "ldrd r8, r9, [r0, #120]\n"
- "ldrd r10, r11, [r2, #120]\n"
+ "ldrd r8, r9, [%1, #120]\n"
+ "ldrd r10, r11, [%2, #120]\n"
"smlad r3, r4, r6, r3\n"
"smlad r12, r5, r7, r12\n"
- "ldrd r4, r5, [r0, #152]\n"
- "ldrd r6, r7, [r2, #152]\n"
+ "ldrd r4, r5, [%1, #152]\n"
+ "ldrd r6, r7, [%2, #152]\n"
"smlad r3, r8, r10, r3\n"
"smlad r12, r9, r11, r12\n"
- "ldrd r8, r9, [r0, #16]\n"
- "ldrd r10, r11, [r2, #16]\n"
+ "ldrd r8, r9, [%1, #16]\n"
+ "ldrd r10, r11, [%2, #16]\n"
"smlad r3, r4, r6, r3\n" /* t1[6] is done */
"smlad r12, r5, r7, r12\n" /* t1[7] is done */
- "ldrd r4, r5, [r0, #48]\n"
- "ldrd r6, r7, [r2, #48]\n"
+ "ldrd r4, r5, [%1, #48]\n"
+ "ldrd r6, r7, [%2, #48]\n"
"pkhtb r3, r12, r3, asr #16\n" /* combine t1[6] and t1[7] */
"str r3, [sp, #-4]!\n" /* save to stack */
"smlad r3, r8, r10, r14\n"
"smlad r12, r9, r11, r14\n"
- "ldrd r8, r9, [r0, #80]\n"
- "ldrd r10, r11, [r2, #80]\n"
+ "ldrd r8, r9, [%1, #80]\n"
+ "ldrd r10, r11, [%2, #80]\n"
"smlad r3, r4, r6, r3\n"
"smlad r12, r5, r7, r12\n"
- "ldrd r4, r5, [r0, #112]\n"
- "ldrd r6, r7, [r2, #112]\n"
+ "ldrd r4, r5, [%1, #112]\n"
+ "ldrd r6, r7, [%2, #112]\n"
"smlad r3, r8, r10, r3\n"
"smlad r12, r9, r11, r12\n"
- "ldrd r8, r9, [r0, #144]\n"
- "ldrd r10, r11, [r2, #144]\n"
+ "ldrd r8, r9, [%1, #144]\n"
+ "ldrd r10, r11, [%2, #144]\n"
"smlad r3, r4, r6, r3\n"
"smlad r12, r5, r7, r12\n"
- "ldrd r4, r5, [r0, #0]\n"
- "ldrd r6, r7, [r2, #0]\n"
+ "ldrd r4, r5, [%1, #0]\n"
+ "ldrd r6, r7, [%2, #0]\n"
"smlad r3, r8, r10, r3\n" /* t1[4] is done */
"smlad r12, r9, r11, r12\n" /* t1[5] is done */
- "ldrd r8, r9, [r0, #32]\n"
- "ldrd r10, r11, [r2, #32]\n"
+ "ldrd r8, r9, [%1, #32]\n"
+ "ldrd r10, r11, [%2, #32]\n"
"pkhtb r3, r12, r3, asr #16\n" /* combine t1[4] and t1[5] */
"str r3, [sp, #-4]!\n" /* save to stack */
"smlad r3, r4, r6, r14\n"
"smlad r12, r5, r7, r14\n"
- "ldrd r4, r5, [r0, #64]\n"
- "ldrd r6, r7, [r2, #64]\n"
+ "ldrd r4, r5, [%1, #64]\n"
+ "ldrd r6, r7, [%2, #64]\n"
"smlad r3, r8, r10, r3\n"
"smlad r12, r9, r11, r12\n"
- "ldrd r8, r9, [r0, #96]\n"
- "ldrd r10, r11, [r2, #96]\n"
+ "ldrd r8, r9, [%1, #96]\n"
+ "ldrd r10, r11, [%2, #96]\n"
"smlad r3, r4, r6, r3\n"
"smlad r12, r5, r7, r12\n"
- "ldrd r4, r5, [r0, #128]\n"
- "ldrd r6, r7, [r2, #128]\n"
+ "ldrd r4, r5, [%1, #128]\n"
+ "ldrd r6, r7, [%2, #128]\n"
"smlad r3, r8, r10, r3\n"
"smlad r12, r9, r11, r12\n"
- "ldrd r8, r9, [r0, #8]\n"
- "ldrd r10, r11, [r2, #8]\n"
+ "ldrd r8, r9, [%1, #8]\n"
+ "ldrd r10, r11, [%2, #8]\n"
"smlad r3, r4, r6, r3\n" /* t1[0] is done */
"smlad r12, r5, r7, r12\n" /* t1[1] is done */
- "ldrd r4, r5, [r0, #40]\n"
- "ldrd r6, r7, [r2, #40]\n"
+ "ldrd r4, r5, [%1, #40]\n"
+ "ldrd r6, r7, [%2, #40]\n"
"pkhtb r3, r12, r3, asr #16\n" /* combine t1[0] and t1[1] */
"smlad r12, r8, r10, r14\n"
"smlad r14, r9, r11, r14\n"
- "ldrd r8, r9, [r0, #72]\n"
- "ldrd r10, r11, [r2, #72]\n"
+ "ldrd r8, r9, [%1, #72]\n"
+ "ldrd r10, r11, [%2, #72]\n"
"smlad r12, r4, r6, r12\n"
"smlad r14, r5, r7, r14\n"
- "ldrd r4, r5, [r0, #104]\n"
- "ldrd r6, r7, [r2, #104]\n"
+ "ldrd r4, r5, [%1, #104]\n"
+ "ldrd r6, r7, [%2, #104]\n"
"smlad r12, r8, r10, r12\n"
"smlad r14, r9, r11, r14\n"
- "ldrd r8, r9, [r0, #136]\n"
- "ldrd r10, r11, [r2, #136]!\n"
+ "ldrd r8, r9, [%1, #136]\n"
+ "ldrd r10, r11, [%2, #136]!\n"
"smlad r12, r4, r6, r12\n"
"smlad r14, r5, r7, r14\n"
- "ldrd r4, r5, [r2, #(160 - 136 + 0)]\n"
+ "ldrd r4, r5, [%2, #(160 - 136 + 0)]\n"
"smlad r12, r8, r10, r12\n" /* t1[2] is done */
"smlad r14, r9, r11, r14\n" /* t1[3] is done */
- "ldrd r6, r7, [r2, #(160 - 136 + 8)]\n"
+ "ldrd r6, r7, [%2, #(160 - 136 + 8)]\n"
"smuad r4, r3, r4\n"
"smuad r5, r3, r5\n"
"pkhtb r12, r14, r12, asr #16\n" /* combine t1[2] and t1[3] */
/* r3 = t2[0:1] */
/* r12 = t2[2:3] */
"pop {r0, r14}\n" /* t2[4:5], t2[6:7] */
- "ldrd r8, r9, [r2, #(160 - 136 + 32)]\n"
+ "ldrd r8, r9, [%2, #(160 - 136 + 32)]\n"
"smuad r6, r3, r6\n"
"smuad r7, r3, r7\n"
- "ldrd r10, r11, [r2, #(160 - 136 + 40)]\n"
+ "ldrd r10, r11, [%2, #(160 - 136 + 40)]\n"
"smlad r4, r12, r8, r4\n"
"smlad r5, r12, r9, r5\n"
- "ldrd r8, r9, [r2, #(160 - 136 + 64)]\n"
+ "ldrd r8, r9, [%2, #(160 - 136 + 64)]\n"
"smlad r6, r12, r10, r6\n"
"smlad r7, r12, r11, r7\n"
- "ldrd r10, r11, [r2, #(160 - 136 + 72)]\n"
+ "ldrd r10, r11, [%2, #(160 - 136 + 72)]\n"
"smlad r4, r0, r8, r4\n"
"smlad r5, r0, r9, r5\n"
- "ldrd r8, r9, [r2, #(160 - 136 + 96)]\n"
+ "ldrd r8, r9, [%2, #(160 - 136 + 96)]\n"
"smlad r6, r0, r10, r6\n"
"smlad r7, r0, r11, r7\n"
- "ldrd r10, r11, [r2, #(160 - 136 + 104)]\n"
+ "ldrd r10, r11, [%2, #(160 - 136 + 104)]\n"
"smlad r4, r14, r8, r4\n"
"smlad r5, r14, r9, r5\n"
- "ldrd r8, r9, [r2, #(160 - 136 + 16 + 0)]\n"
+ "ldrd r8, r9, [%2, #(160 - 136 + 16 + 0)]\n"
"smlad r6, r14, r10, r6\n"
"smlad r7, r14, r11, r7\n"
- "ldrd r10, r11, [r2, #(160 - 136 + 16 + 8)]\n"
- "stmia r1!, {r4, r5}\n"
+ "ldrd r10, r11, [%2, #(160 - 136 + 16 + 8)]\n"
+ "stmia %0!, {r4, r5}\n"
"smuad r4, r3, r8\n"
"smuad r5, r3, r9\n"
- "ldrd r8, r9, [r2, #(160 - 136 + 16 + 32)]\n"
- "stmia r1!, {r6, r7}\n"
+ "ldrd r8, r9, [%2, #(160 - 136 + 16 + 32)]\n"
+ "stmia %0!, {r6, r7}\n"
"smuad r6, r3, r10\n"
"smuad r7, r3, r11\n"
- "ldrd r10, r11, [r2, #(160 - 136 + 16 + 40)]\n"
+ "ldrd r10, r11, [%2, #(160 - 136 + 16 + 40)]\n"
"smlad r4, r12, r8, r4\n"
"smlad r5, r12, r9, r5\n"
- "ldrd r8, r9, [r2, #(160 - 136 + 16 + 64)]\n"
+ "ldrd r8, r9, [%2, #(160 - 136 + 16 + 64)]\n"
"smlad r6, r12, r10, r6\n"
"smlad r7, r12, r11, r7\n"
- "ldrd r10, r11, [r2, #(160 - 136 + 16 + 72)]\n"
+ "ldrd r10, r11, [%2, #(160 - 136 + 16 + 72)]\n"
"smlad r4, r0, r8, r4\n"
"smlad r5, r0, r9, r5\n"
- "ldrd r8, r9, [r2, #(160 - 136 + 16 + 96)]\n"
+ "ldrd r8, r9, [%2, #(160 - 136 + 16 + 96)]\n"
"smlad r6, r0, r10, r6\n"
"smlad r7, r0, r11, r7\n"
- "ldrd r10, r11, [r2, #(160 - 136 + 16 + 104)]\n"
+ "ldrd r10, r11, [%2, #(160 - 136 + 16 + 104)]\n"
"smlad r4, r14, r8, r4\n"
"smlad r5, r14, r9, r5\n"
"smlad r6, r14, r10, r6\n"
"smlad r7, r14, r11, r7\n"
- "pop {r8-r11}\n"
- "stmia r1!, {r4, r5, r6, r7}\n"
- "pop {r1, r4-r7, pc}\n"
+ "stmia %0!, {r4, r5, r6, r7}\n"
+ : "+r" (out)
+ : "r" (in), "r" (consts)
+ : "memory",
+ "r3", "r4", "r5", "r6", "r7", "r8",
+ "r9", "r10", "r11", "r12", "r14"
);
}

-#define sbc_analyze_eight(in, out, consts) \
- ((void (*)(int16_t *, int32_t *, const FIXED_T*)) \
- sbc_analyze_eight_armv6)((in), (out), (consts))
-
static void sbc_analyze_4b_4s_armv6(struct sbc_encoder_state *state,
int16_t *x, int32_t *out, int out_stride)
{
/* Analyze blocks */
- sbc_analyze_four(x + 12, out, analysis_consts_fixed4_simd_odd);
+ sbc_analyze_four_armv6(x + 12, out, analysis_consts_fixed4_simd_odd);
out += out_stride;
- sbc_analyze_four(x + 8, out, analysis_consts_fixed4_simd_even);
+ sbc_analyze_four_armv6(x + 8, out, analysis_consts_fixed4_simd_even);
out += out_stride;
- sbc_analyze_four(x + 4, out, analysis_consts_fixed4_simd_odd);
+ sbc_analyze_four_armv6(x + 4, out, analysis_consts_fixed4_simd_odd);
out += out_stride;
- sbc_analyze_four(x + 0, out, analysis_consts_fixed4_simd_even);
+ sbc_analyze_four_armv6(x + 0, out, analysis_consts_fixed4_simd_even);
}

static void sbc_analyze_4b_8s_armv6(struct sbc_encoder_state *state,
int16_t *x, int32_t *out, int out_stride)
{
/* Analyze blocks */
- sbc_analyze_eight(x + 24, out, analysis_consts_fixed8_simd_odd);
+ sbc_analyze_eight_armv6(x + 24, out, analysis_consts_fixed8_simd_odd);
out += out_stride;
- sbc_analyze_eight(x + 16, out, analysis_consts_fixed8_simd_even);
+ sbc_analyze_eight_armv6(x + 16, out, analysis_consts_fixed8_simd_even);
out += out_stride;
- sbc_analyze_eight(x + 8, out, analysis_consts_fixed8_simd_odd);
+ sbc_analyze_eight_armv6(x + 8, out, analysis_consts_fixed8_simd_odd);
out += out_stride;
- sbc_analyze_eight(x + 0, out, analysis_consts_fixed8_simd_even);
+ sbc_analyze_eight_armv6(x + 0, out, analysis_consts_fixed8_simd_even);
}

static inline void sbc_analyze_1b_8s_armv6_even(struct sbc_encoder_state *state,
--
2.17.0.windows.1


2018-12-12 20:48:16

by r yang

Subject: Re: [PATCH 1/1] sbc: Use extended inline assembly for armv6 primitives

What happened to this[1] patch? Did it fall by the wayside?

I can reproduce the issue this patch addresses. It was also reported
here[2] with gdb traces.

I've tested this patch with Alpine Linux on the current edge release.
Alpine currently ships sbc 1.4 and gcc 8. Without this patch, Bluetooth
A2DP audio segfaults in sbc_analyze_eight_armv6; with it applied, the
armv6 assembly code no longer crashes and A2DP audio streaming works.

PS: I'm not certain how to properly send this as a reply to the
original patch mail, so I've referenced the original this way.

[1] https://marc.info/?l=linux-bluetooth&m=152535913710490
[2] https://bugzilla.redhat.com/show_bug.cgi?format=multiple&id=1350490

2018-12-12 21:22:59

by Austin Morton

Subject: Re: [PATCH 1/1] sbc: Use extended inline assembly for armv6 primitives

Resent because HTML mail was rejected.

Hi,

I am the original patch author.

I submitted the patch to the mailing list but received no response
from a maintainer.

I am using the sbc codec in an embedded project and have to maintain
other patches for it anyway, so getting the fix merged upstream was a
low priority for me.

I previously submitted another ARM-related patch to the sbc library,
and it was also not merged (although it did get an initial response,
just no follow-through).

I get the impression the maintainers are simply busy elsewhere - the
sbc library code has remained unchanged, aside from a single-line
bugfix, since 2014.

I also submitted a bug to gcc, since the issue was clearly a
regression in a newer compiler version.

You can find that bug here: https://gcc.gnu.org/bugzilla/show_bug.cgi?id=85593

A patch for the bug was actually posted recently, so it may soon be
resolved in gcc trunk. That doesn't do much, though, for anyone using
the compiler they have today.
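
(If you want to check the toolchain you have, compiling a reduced case
like the sketch in the patch mail at -O2 for ARM and disassembling the
caller - objdump -d is enough - shows whether the pointer increment is
still kept in r3 across the calls.)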

Unfortunately, I have not yet had the opportunity to test the gcc patch
to see whether it does indeed fix the issue in the sbc code.

Cheers,
Austin Morton