Some patches for making SBC encoding up to 10% faster on ARM and x86.
Also available in git://gitorious.org/system-performance/bluez-sbc.git
branch 'sbc-fast-scalefactors-for-master'
Siarhei Siamashka (3):
sbc: new 'sbc_calc_scalefactors_j' function added to sbc primitives
sbc: MMX optimization for scale factors calculation
sbc: ARM NEON optimization for scale factors calculation
sbc/sbc.c | 94 ++++++++++++---------------------------------
sbc/sbc_primitives.c | 75 ++++++++++++++++++++++++++++++++++++
sbc/sbc_primitives.h | 4 ++
sbc/sbc_primitives_mmx.c | 54 ++++++++++++++++++++++++++
sbc/sbc_primitives_neon.c | 58 +++++++++++++++++++++++++++
5 files changed, 216 insertions(+), 69 deletions(-)
On Wednesday 30 June 2010 07:32:39 Johan Hedberg wrote:
> All three patches have been pushed upstream. Thanks!
Thanks.
A few more SBC encoder performance optimizations will follow shortly :)
--
Best regards,
Siarhei Siamashka
Hi Siarhei,
On Tue, Jun 29, 2010, Siarhei Siamashka wrote:
> Some patches for making SBC encoding up to 10% faster on ARM and x86.
>
> Also available in git://gitorious.org/system-performance/bluez-sbc.git
> branch 'sbc-fast-scalefactors-for-master'
>
> Siarhei Siamashka (3):
> sbc: new 'sbc_calc_scalefactors_j' function added to sbc primitives
> sbc: MMX optimization for scale factors calculation
> sbc: ARM NEON optimization for scale factors calculation
>
> sbc/sbc.c | 94 ++++++++++++---------------------------------
> sbc/sbc_primitives.c | 75 ++++++++++++++++++++++++++++++++++++
> sbc/sbc_primitives.h | 4 ++
> sbc/sbc_primitives_mmx.c | 54 ++++++++++++++++++++++++++
> sbc/sbc_primitives_neon.c | 58 +++++++++++++++++++++++++++
> 5 files changed, 216 insertions(+), 69 deletions(-)
All three patches have been pushed upstream. Thanks!
Johan
From: Siarhei Siamashka <[email protected]>
Improves SBC encoding performance when joint stereo is not used.
Benchmarked on ARM Cortex-A8:
== Before: ==
$ time ./sbcenc -b53 -s8 test.au > /dev/null
real 0m4.756s
user 0m4.313s
sys 0m0.438s
samples % image name symbol name
2569 27.6296 sbcenc sbc_pack_frame
1934 20.8002 sbcenc sbc_analyze_4b_8s_neon
1386 14.9064 sbcenc sbc_calculate_bits
1221 13.1319 sbcenc sbc_calc_scalefactors
996 10.7120 sbcenc sbc_enc_process_input_8s_be
878 9.4429 no-vmlinux /no-vmlinux
204 2.1940 sbcenc sbc_encode
56 0.6023 libc-2.10.1.so memcpy
== After: ==
$ time ./sbcenc -b53 -s8 test.au > /dev/null
real 0m4.220s
user 0m3.797s
sys 0m0.422s
samples % image name symbol name
2563 31.3249 sbcenc sbc_pack_frame
1892 23.1239 sbcenc sbc_analyze_4b_8s_neon
1368 16.7196 sbcenc sbc_calculate_bits
961 11.7453 sbcenc sbc_enc_process_input_8s_be
836 10.2176 no-vmlinux /no-vmlinux
262 3.2022 sbcenc sbc_calc_scalefactors_neon
199 2.4322 sbcenc sbc_encode
49 0.5989 libc-2.10.1.so memcpy
---
sbc/sbc.c | 2 +-
sbc/sbc_primitives_neon.c | 58 +++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 59 insertions(+), 1 deletions(-)
diff --git a/sbc/sbc.c b/sbc/sbc.c
index 7da7313..569dd7c 100644
--- a/sbc/sbc.c
+++ b/sbc/sbc.c
@@ -78,7 +78,7 @@ struct sbc_frame {
uint8_t joint;
/* only the lower 4 bits of every element are to be used */
- uint32_t scale_factor[2][8];
+ uint32_t SBC_ALIGNED scale_factor[2][8];
/* raw integer subband samples in the frame */
int32_t SBC_ALIGNED sb_sample_f[16][2][8];
diff --git a/sbc/sbc_primitives_neon.c b/sbc/sbc_primitives_neon.c
index d20eeca..2a4cdf0 100644
--- a/sbc/sbc_primitives_neon.c
+++ b/sbc/sbc_primitives_neon.c
@@ -237,10 +237,68 @@ static inline void sbc_analyze_4b_8s_neon(int16_t *x,
_sbc_analyze_eight_neon(x + 0, out, analysis_consts_fixed8_simd_even);
}
+static void sbc_calc_scalefactors_neon(
+ int32_t sb_sample_f[16][2][8],
+ uint32_t scale_factor[2][8],
+ int blocks, int channels, int subbands)
+{
+ int ch, sb;
+ for (ch = 0; ch < channels; ch++) {
+ for (sb = 0; sb < subbands; sb += 4) {
+ int blk = blocks;
+ int32_t *in = &sb_sample_f[0][ch][sb];
+ asm volatile (
+ "vmov.s32 q0, %[c1]\n"
+ "vmov.s32 q1, %[c1]\n"
+ "1:\n"
+ "vld1.32 {d16, d17}, [%[in], :128], %[inc]\n"
+ "vabs.s32 q8, q8\n"
+ "vld1.32 {d18, d19}, [%[in], :128], %[inc]\n"
+ "vabs.s32 q9, q9\n"
+ "vld1.32 {d20, d21}, [%[in], :128], %[inc]\n"
+ "vabs.s32 q10, q10\n"
+ "vld1.32 {d22, d23}, [%[in], :128], %[inc]\n"
+ "vabs.s32 q11, q11\n"
+ "vcgt.s32 q12, q8, #0\n"
+ "vcgt.s32 q13, q9, #0\n"
+ "vcgt.s32 q14, q10, #0\n"
+ "vcgt.s32 q15, q11, #0\n"
+ "vadd.s32 q8, q8, q12\n"
+ "vadd.s32 q9, q9, q13\n"
+ "vadd.s32 q10, q10, q14\n"
+ "vadd.s32 q11, q11, q15\n"
+ "vorr.s32 q0, q0, q8\n"
+ "vorr.s32 q1, q1, q9\n"
+ "vorr.s32 q0, q0, q10\n"
+ "vorr.s32 q1, q1, q11\n"
+ "subs %[blk], %[blk], #4\n"
+ "bgt 1b\n"
+ "vorr.s32 q0, q0, q1\n"
+ "vmov.s32 q15, %[c2]\n"
+ "vclz.s32 q0, q0\n"
+ "vsub.s32 q0, q15, q0\n"
+ "vst1.32 {d0, d1}, [%[out], :128]\n"
+ :
+ [blk] "+r" (blk),
+ [in] "+r" (in)
+ :
+ [inc] "r" ((char *) &sb_sample_f[1][0][0] -
+ (char *) &sb_sample_f[0][0][0]),
+ [out] "r" (&scale_factor[ch][sb]),
+ [c1] "i" (1 << SCALE_OUT_BITS),
+ [c2] "i" (31 - SCALE_OUT_BITS)
+ : "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19",
+ "d20", "d21", "d22", "d23", "d24", "d25", "d26",
+ "d27", "d28", "d29", "d30", "d31", "cc", "memory");
+ }
+ }
+}
+
void sbc_init_primitives_neon(struct sbc_encoder_state *state)
{
state->sbc_analyze_4b_4s = sbc_analyze_4b_4s_neon;
state->sbc_analyze_4b_8s = sbc_analyze_4b_8s_neon;
+ state->sbc_calc_scalefactors = sbc_calc_scalefactors_neon;
state->implementation_info = "NEON";
}
--
1.6.4.4
From: Siarhei Siamashka <[email protected]>
Improves SBC encoding performance when joint stereo is not used.
Benchmarked on Pentium-M:
== Before: ==
$ time ./sbcenc -b53 -s8 test.au > /dev/null
real 0m1.439s
user 0m1.336s
sys 0m0.104s
samples % image name symbol name
8642 33.7473 sbcenc sbc_pack_frame
5873 22.9342 sbcenc sbc_analyze_4b_8s_mmx
4435 17.3188 sbcenc sbc_calc_scalefactors
4285 16.7331 sbcenc sbc_calculate_bits
1942 7.5836 sbcenc sbc_enc_process_input_8s_be
322 1.2574 sbcenc sbc_encode
== After: ==
$ time ./sbcenc -b53 -s8 test.au > /dev/null
real 0m1.319s
user 0m1.220s
sys 0m0.084s
samples % image name symbol name
8706 37.9959 sbcenc sbc_pack_frame
5740 25.0513 sbcenc sbc_analyze_4b_8s_mmx
4307 18.7972 sbcenc sbc_calculate_bits
1937 8.4537 sbcenc sbc_enc_process_input_8s_be
1801 7.8602 sbcenc sbc_calc_scalefactors_mmx
307 1.3399 sbcenc sbc_encode
---
sbc/sbc_primitives_mmx.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 54 insertions(+), 0 deletions(-)
diff --git a/sbc/sbc_primitives_mmx.c b/sbc/sbc_primitives_mmx.c
index e6900bc..45c62ac 100644
--- a/sbc/sbc_primitives_mmx.c
+++ b/sbc/sbc_primitives_mmx.c
@@ -276,6 +276,59 @@ static inline void sbc_analyze_4b_8s_mmx(int16_t *x, int32_t *out,
asm volatile ("emms\n");
}
+static void sbc_calc_scalefactors_mmx(
+ int32_t sb_sample_f[16][2][8],
+ uint32_t scale_factor[2][8],
+ int blocks, int channels, int subbands)
+{
+ static const SBC_ALIGNED int32_t consts[2] = {
+ 1 << SCALE_OUT_BITS,
+ 1 << SCALE_OUT_BITS,
+ };
+ int ch, sb;
+ intptr_t blk;
+ for (ch = 0; ch < channels; ch++) {
+ for (sb = 0; sb < subbands; sb += 2) {
+ blk = (blocks - 1) * (((char *) &sb_sample_f[1][0][0] -
+ (char *) &sb_sample_f[0][0][0]));
+ asm volatile (
+ "movq (%4), %%mm0\n"
+ "1:\n"
+ "movq (%1, %0), %%mm1\n"
+ "pxor %%mm2, %%mm2\n"
+ "pcmpgtd %%mm2, %%mm1\n"
+ "paddd (%1, %0), %%mm1\n"
+ "pcmpgtd %%mm1, %%mm2\n"
+ "pxor %%mm2, %%mm1\n"
+
+ "por %%mm1, %%mm0\n"
+
+ "sub %2, %0\n"
+ "jns 1b\n"
+
+ "movd %%mm0, %k0\n"
+ "psrlq $32, %%mm0\n"
+ "bsrl %k0, %k0\n"
+ "subl %5, %k0\n"
+ "movl %k0, (%3)\n"
+
+ "movd %%mm0, %k0\n"
+ "bsrl %k0, %k0\n"
+ "subl %5, %k0\n"
+ "movl %k0, 4(%3)\n"
+ : "+r" (blk)
+ : "r" (&sb_sample_f[0][ch][sb]),
+ "i" ((char *) &sb_sample_f[1][0][0] -
+ (char *) &sb_sample_f[0][0][0]),
+ "r" (&scale_factor[ch][sb]),
+ "r" (&consts),
+ "i" (SCALE_OUT_BITS)
+ : "memory");
+ }
+ }
+ asm volatile ("emms\n");
+}
+
static int check_mmx_support(void)
{
#ifdef __amd64__
@@ -314,6 +367,7 @@ void sbc_init_primitives_mmx(struct sbc_encoder_state *state)
if (check_mmx_support()) {
state->sbc_analyze_4b_4s = sbc_analyze_4b_4s_mmx;
state->sbc_analyze_4b_8s = sbc_analyze_4b_8s_mmx;
+ state->sbc_calc_scalefactors = sbc_calc_scalefactors_mmx;
state->implementation_info = "MMX";
}
}
--
1.6.4.4
From: Siarhei Siamashka <[email protected]>
The code for scale factors calculation with joint stereo support has
been moved to a separate function. It can get platform-specific
SIMD optimizations later for best possible performance.
But even this change in the C code improves performance, because it
uses __builtin_clz() instead of loops, similar to what was done
to sbc_calc_scalefactors earlier. Also, technically it does loop
unrolling by processing two channels at once, which might be either
good or bad for performance (bad if register pressure increases
and more data is spilled to memory). But a benchmark on a 32-bit
x86 system (Pentium-M) shows that it got clearly faster:
$ time ./sbcenc.old -b53 -s8 -j test.au > /dev/null
real 0m1.868s
user 0m1.808s
sys 0m0.048s
$ time ./sbcenc.new -b53 -s8 -j test.au > /dev/null
real 0m1.742s
user 0m1.668s
sys 0m0.064s
---
sbc/sbc.c | 92 +++++++++++++-------------------------------------
sbc/sbc_primitives.c | 75 ++++++++++++++++++++++++++++++++++++++++
sbc/sbc_primitives.h | 4 ++
3 files changed, 103 insertions(+), 68 deletions(-)
diff --git a/sbc/sbc.c b/sbc/sbc.c
index 86399dd..7da7313 100644
--- a/sbc/sbc.c
+++ b/sbc/sbc.c
@@ -745,7 +745,8 @@ static int sbc_analyze_audio(struct sbc_encoder_state *state,
static SBC_ALWAYS_INLINE int sbc_pack_frame_internal(uint8_t *data,
struct sbc_frame *frame, size_t len,
- int frame_subbands, int frame_channels)
+ int frame_subbands, int frame_channels,
+ int joint)
{
/* Bitstream writer starts from the fourth byte */
uint8_t *data_ptr = data + 4;
@@ -802,63 +803,6 @@ static SBC_ALWAYS_INLINE int sbc_pack_frame_internal(uint8_t *data,
crc_pos = 16;
if (frame->mode == JOINT_STEREO) {
- /* like frame->sb_sample but joint stereo */
- int32_t sb_sample_j[16][2];
- /* scalefactor and scale_factor in joint case */
- uint32_t scalefactor_j[2];
- uint8_t scale_factor_j[2];
-
- uint8_t joint = 0;
- frame->joint = 0;
-
- for (sb = 0; sb < frame_subbands - 1; sb++) {
- scale_factor_j[0] = 0;
- scalefactor_j[0] = 2 << SCALE_OUT_BITS;
- scale_factor_j[1] = 0;
- scalefactor_j[1] = 2 << SCALE_OUT_BITS;
-
- for (blk = 0; blk < frame->blocks; blk++) {
- uint32_t tmp;
- /* Calculate joint stereo signal */
- sb_sample_j[blk][0] =
- ASR(frame->sb_sample_f[blk][0][sb], 1) +
- ASR(frame->sb_sample_f[blk][1][sb], 1);
- sb_sample_j[blk][1] =
- ASR(frame->sb_sample_f[blk][0][sb], 1) -
- ASR(frame->sb_sample_f[blk][1][sb], 1);
-
- /* calculate scale_factor_j and scalefactor_j for joint case */
- tmp = fabs(sb_sample_j[blk][0]);
- while (scalefactor_j[0] < tmp) {
- scale_factor_j[0]++;
- scalefactor_j[0] *= 2;
- }
- tmp = fabs(sb_sample_j[blk][1]);
- while (scalefactor_j[1] < tmp) {
- scale_factor_j[1]++;
- scalefactor_j[1] *= 2;
- }
- }
-
- /* decide whether to join this subband */
- if ((frame->scale_factor[0][sb] +
- frame->scale_factor[1][sb]) >
- (scale_factor_j[0] +
- scale_factor_j[1])) {
- /* use joint stereo for this subband */
- joint |= 1 << (frame_subbands - 1 - sb);
- frame->joint |= 1 << sb;
- frame->scale_factor[0][sb] = scale_factor_j[0];
- frame->scale_factor[1][sb] = scale_factor_j[1];
- for (blk = 0; blk < frame->blocks; blk++) {
- frame->sb_sample_f[blk][0][sb] =
- sb_sample_j[blk][0];
- frame->sb_sample_f[blk][1][sb] =
- sb_sample_j[blk][1];
- }
- }
- }
-
PUT_BITS(data_ptr, bits_cache, bits_count,
joint, frame_subbands);
crc_header[crc_pos >> 3] = joint;
@@ -916,18 +860,23 @@ static SBC_ALWAYS_INLINE int sbc_pack_frame_internal(uint8_t *data,
return data_ptr - data;
}
-static int sbc_pack_frame(uint8_t *data, struct sbc_frame *frame, size_t len)
+static int sbc_pack_frame(uint8_t *data, struct sbc_frame *frame, size_t len,
+ int joint)
{
if (frame->subbands == 4) {
if (frame->channels == 1)
- return sbc_pack_frame_internal(data, frame, len, 4, 1);
+ return sbc_pack_frame_internal(
+ data, frame, len, 4, 1, joint);
else
- return sbc_pack_frame_internal(data, frame, len, 4, 2);
+ return sbc_pack_frame_internal(
+ data, frame, len, 4, 2, joint);
} else {
if (frame->channels == 1)
- return sbc_pack_frame_internal(data, frame, len, 8, 1);
+ return sbc_pack_frame_internal(
+ data, frame, len, 8, 1, joint);
else
- return sbc_pack_frame_internal(data, frame, len, 8, 2);
+ return sbc_pack_frame_internal(
+ data, frame, len, 8, 2, joint);
}
}
@@ -1121,11 +1070,18 @@ ssize_t sbc_encode(sbc_t *sbc, const void *input, size_t input_len,
samples = sbc_analyze_audio(&priv->enc_state, &priv->frame);
- priv->enc_state.sbc_calc_scalefactors(
- priv->frame.sb_sample_f, priv->frame.scale_factor,
- priv->frame.blocks, priv->frame.channels, priv->frame.subbands);
-
- framelen = sbc_pack_frame(output, &priv->frame, output_len);
+ if (priv->frame.mode == JOINT_STEREO) {
+ int j = priv->enc_state.sbc_calc_scalefactors_j(
+ priv->frame.sb_sample_f, priv->frame.scale_factor,
+ priv->frame.blocks, priv->frame.subbands);
+ framelen = sbc_pack_frame(output, &priv->frame, output_len, j);
+ } else {
+ priv->enc_state.sbc_calc_scalefactors(
+ priv->frame.sb_sample_f, priv->frame.scale_factor,
+ priv->frame.blocks, priv->frame.channels,
+ priv->frame.subbands);
+ framelen = sbc_pack_frame(output, &priv->frame, output_len, 0);
+ }
if (written)
*written = framelen;
diff --git a/sbc/sbc_primitives.c b/sbc/sbc_primitives.c
index 41934d3..c73fb1c 100644
--- a/sbc/sbc_primitives.c
+++ b/sbc/sbc_primitives.c
@@ -440,6 +440,80 @@ static void sbc_calc_scalefactors(
}
}
+static int sbc_calc_scalefactors_j(
+ int32_t sb_sample_f[16][2][8],
+ uint32_t scale_factor[2][8],
+ int blocks, int subbands)
+{
+ int blk, joint = 0;
+ int32_t tmp0, tmp1;
+ uint32_t x, y;
+
+ /* last subband does not use joint stereo */
+ int sb = subbands - 1;
+ x = 1 << SCALE_OUT_BITS;
+ y = 1 << SCALE_OUT_BITS;
+ for (blk = 0; blk < blocks; blk++) {
+ tmp0 = fabs(sb_sample_f[blk][0][sb]);
+ tmp1 = fabs(sb_sample_f[blk][1][sb]);
+ if (tmp0 != 0)
+ x |= tmp0 - 1;
+ if (tmp1 != 0)
+ y |= tmp1 - 1;
+ }
+ scale_factor[0][sb] = (31 - SCALE_OUT_BITS) - sbc_clz(x);
+ scale_factor[1][sb] = (31 - SCALE_OUT_BITS) - sbc_clz(y);
+
+ /* the rest of subbands can use joint stereo */
+ while (--sb >= 0) {
+ int32_t sb_sample_j[16][2];
+ x = 1 << SCALE_OUT_BITS;
+ y = 1 << SCALE_OUT_BITS;
+ for (blk = 0; blk < blocks; blk++) {
+ tmp0 = sb_sample_f[blk][0][sb];
+ tmp1 = sb_sample_f[blk][1][sb];
+ sb_sample_j[blk][0] = ASR(tmp0, 1) + ASR(tmp1, 1);
+ sb_sample_j[blk][1] = ASR(tmp0, 1) - ASR(tmp1, 1);
+ tmp0 = fabs(tmp0);
+ tmp1 = fabs(tmp1);
+ if (tmp0 != 0)
+ x |= tmp0 - 1;
+ if (tmp1 != 0)
+ y |= tmp1 - 1;
+ }
+ scale_factor[0][sb] = (31 - SCALE_OUT_BITS) -
+ sbc_clz(x);
+ scale_factor[1][sb] = (31 - SCALE_OUT_BITS) -
+ sbc_clz(y);
+ x = 1 << SCALE_OUT_BITS;
+ y = 1 << SCALE_OUT_BITS;
+ for (blk = 0; blk < blocks; blk++) {
+ tmp0 = fabs(sb_sample_j[blk][0]);
+ tmp1 = fabs(sb_sample_j[blk][1]);
+ if (tmp0 != 0)
+ x |= tmp0 - 1;
+ if (tmp1 != 0)
+ y |= tmp1 - 1;
+ }
+ x = (31 - SCALE_OUT_BITS) - sbc_clz(x);
+ y = (31 - SCALE_OUT_BITS) - sbc_clz(y);
+
+ /* decide whether to use joint stereo for this subband */
+ if ((scale_factor[0][sb] + scale_factor[1][sb]) > x + y) {
+ joint |= 1 << (subbands - 1 - sb);
+ scale_factor[0][sb] = x;
+ scale_factor[1][sb] = y;
+ for (blk = 0; blk < blocks; blk++) {
+ sb_sample_f[blk][0][sb] = sb_sample_j[blk][0];
+ sb_sample_f[blk][1][sb] = sb_sample_j[blk][1];
+ }
+ }
+ }
+
+ /* bitmask with the information about subbands using joint stereo */
+ return joint;
+}
+
/*
* Detect CPU features and setup function pointers
*/
@@ -457,6 +531,7 @@ void sbc_init_primitives(struct sbc_encoder_state *state)
/* Default implementation for scale factors calculation */
state->sbc_calc_scalefactors = sbc_calc_scalefactors;
+ state->sbc_calc_scalefactors_j = sbc_calc_scalefactors_j;
state->implementation_info = "Generic C";
/* X86/AMD64 optimizations */
diff --git a/sbc/sbc_primitives.h b/sbc/sbc_primitives.h
index 3be02ed..3fec8d5 100644
--- a/sbc/sbc_primitives.h
+++ b/sbc/sbc_primitives.h
@@ -63,6 +63,10 @@ struct sbc_encoder_state {
void (*sbc_calc_scalefactors)(int32_t sb_sample_f[16][2][8],
uint32_t scale_factor[2][8],
int blocks, int channels, int subbands);
+ /* Scale factors calculation with joint stereo support */
+ int (*sbc_calc_scalefactors_j)(int32_t sb_sample_f[16][2][8],
+ uint32_t scale_factor[2][8],
+ int blocks, int subbands);
const char *implementation_info;
};
--
1.6.4.4