Skip to content

Commit bfd24af

Browse files
VSZSagosdahu
authored andcommitted
Arm: Speed up -1..1 soft clipping with Neon
If the signal exceeds -1..1 then, as error handling, the soft_clip function forces the signal back into -1..1. This is problematic since the search loop to find the next sample exceeding -1..1 is slow. If cheap on the current platform, while doing -2..2 hardclipping we can also detect if the signal never exceeds -1..1, avoiding the need for a second search loop. Change-Id: I7d751afc2d335765798ed7b48993c0d51ff843bd
1 parent df02d25 commit bfd24af

10 files changed

+256
-8
lines changed

celt/arch.h

+2
Original file line numberDiff line numberDiff line change
@@ -103,6 +103,8 @@ void celt_fatal(const char *str, const char *file, int line)
103103
#define MAX32(a,b) ((a) > (b) ? (a) : (b)) /**< Maximum 32-bit value. */
104104
#define IMIN(a,b) ((a) < (b) ? (a) : (b)) /**< Minimum int value. */
105105
#define IMAX(a,b) ((a) > (b) ? (a) : (b)) /**< Maximum int value. */
106+
#define FMIN(a,b) ((a) < (b) ? (a) : (b)) /**< Minimum float value. */
107+
#define FMAX(a,b) ((a) > (b) ? (a) : (b)) /**< Maximum float value. */
106108
#define UADD32(a,b) ((a)+(b))
107109
#define USUB32(a,b) ((a)-(b))
108110
#define MAXG(a,b) MAX32(a, b)

celt/arm/arm_celt_map.c

+8
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,14 @@ void (*const CELT_FLOAT2INT16_IMPL[OPUS_ARCHMASK+1])(const float * OPUS_RESTRICT
4646
celt_float2int16_neon,/* NEON */
4747
celt_float2int16_neon /* DOTPROD */
4848
};
49+
50+
int (*const OPUS_LIMIT2_CHECKWITHIN1_IMPL[OPUS_ARCHMASK+1])(float * samples, int cnt) = {
51+
opus_limit2_checkwithin1_c, /* ARMv4 */
52+
opus_limit2_checkwithin1_c, /* EDSP */
53+
opus_limit2_checkwithin1_c, /* Media */
54+
opus_limit2_checkwithin1_neon,/* NEON */
55+
opus_limit2_checkwithin1_neon /* DOTPROD */
56+
};
4957
# endif
5058
# endif
5159

celt/arm/celt_neon_intr.c

+77
Original file line numberDiff line numberDiff line change
@@ -86,8 +86,85 @@ void celt_float2int16_neon(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT
8686
out[i] = FLOAT2INT16(in[i]);
8787
}
8888
}
89+
90+
int opus_limit2_checkwithin1_neon(float *samples, int cnt)
91+
{
92+
const float hardclipMin = -2.0f;
93+
const float hardclipMax = 2.0f;
94+
95+
int i = 0;
96+
int exceeding1 = 0;
97+
int nextIndex = 0;
98+
99+
#if defined(__ARM_NEON)
100+
const int BLOCK_SIZE = 16;
101+
const int blockedSize = cnt / BLOCK_SIZE * BLOCK_SIZE;
102+
103+
float32x4_t min_all_0 = vdupq_n_f32(0.0f);
104+
float32x4_t min_all_1 = vdupq_n_f32(0.0f);
105+
float32x4_t max_all_0 = vdupq_n_f32(0.0f);
106+
float32x4_t max_all_1 = vdupq_n_f32(0.0f);
107+
108+
float max, min;
109+
110+
for (i = 0; i < blockedSize; i += BLOCK_SIZE)
111+
{
112+
const float32x4_t orig_a = vld1q_f32(&samples[i + 0]);
113+
const float32x4_t orig_b = vld1q_f32(&samples[i + 4]);
114+
const float32x4_t orig_c = vld1q_f32(&samples[i + 8]);
115+
const float32x4_t orig_d = vld1q_f32(&samples[i + 12]);
116+
max_all_0 = vmaxq_f32(max_all_0, vmaxq_f32(orig_a, orig_b));
117+
max_all_1 = vmaxq_f32(max_all_1, vmaxq_f32(orig_c, orig_d));
118+
min_all_0 = vminq_f32(min_all_0, vminq_f32(orig_a, orig_b));
119+
min_all_1 = vminq_f32(min_all_1, vminq_f32(orig_c, orig_d));
120+
}
121+
122+
max = vmaxvf(vmaxq_f32(max_all_0, max_all_1));
123+
min = vminvf(vminq_f32(min_all_0, min_all_1));
124+
125+
if (min < hardclipMin || max > hardclipMax)
126+
{
127+
const float32x4_t hardclipMinReg = vdupq_n_f32(hardclipMin);
128+
const float32x4_t hardclipMaxReg = vdupq_n_f32(hardclipMax);
129+
for (i = 0; i < blockedSize; i += BLOCK_SIZE)
130+
{
131+
const float32x4_t orig_a = vld1q_f32(&samples[i + 0]);
132+
const float32x4_t orig_b = vld1q_f32(&samples[i + 4]);
133+
const float32x4_t orig_c = vld1q_f32(&samples[i + 8]);
134+
const float32x4_t orig_d = vld1q_f32(&samples[i + 12]);
135+
const float32x4_t clipped_a = vminq_f32(hardclipMaxReg, vmaxq_f32(orig_a, hardclipMinReg));
136+
const float32x4_t clipped_b = vminq_f32(hardclipMaxReg, vmaxq_f32(orig_b, hardclipMinReg));
137+
const float32x4_t clipped_c = vminq_f32(hardclipMaxReg, vmaxq_f32(orig_c, hardclipMinReg));
138+
const float32x4_t clipped_d = vminq_f32(hardclipMaxReg, vmaxq_f32(orig_d, hardclipMinReg));
139+
vst1q_f32(&samples[i + 0], clipped_a);
140+
vst1q_f32(&samples[i + 4], clipped_b);
141+
vst1q_f32(&samples[i + 8], clipped_c);
142+
vst1q_f32(&samples[i + 12], clipped_d);
143+
}
144+
}
145+
146+
nextIndex = blockedSize;
147+
exceeding1 |= max > 1.0f || min < -1.0f;
148+
89149
#endif
90150

151+
for (i = nextIndex; i < cnt; i++)
152+
{
153+
const float origVal = samples[i];
154+
float clippedVal = origVal;
155+
clippedVal = MAX16(hardclipMin, clippedVal);
156+
clippedVal = MIN16(hardclipMax, clippedVal);
157+
samples[i] = clippedVal;
158+
159+
exceeding1 |= origVal > 1.0f || origVal < -1.0f;
160+
}
161+
162+
return !exceeding1;
163+
}
164+
165+
#endif
166+
167+
91168
#if defined(FIXED_POINT)
92169
#include <string.h>
93170

celt/arm/mathops_arm.h

+38
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,30 @@ static inline int32x4_t vroundf(float32x4_t x)
4646
# endif
4747
}
4848

49+
static inline float vminvf(float32x4_t a)
50+
{
51+
#if defined(__aarch64__)
52+
return vminvq_f32(a);
53+
#else
54+
float32x2_t xy = vmin_f32(vget_low_f32(a), vget_high_f32(a));
55+
float x = vget_lane_f32(xy, 0);
56+
float y = vget_lane_f32(xy, 1);
57+
return x < y ? x : y;
58+
#endif
59+
}
60+
61+
static inline float vmaxvf(float32x4_t a)
62+
{
63+
#if defined(__aarch64__)
64+
return vmaxvq_f32(a);
65+
#else
66+
float32x2_t xy = vmax_f32(vget_low_f32(a), vget_high_f32(a));
67+
float x = vget_lane_f32(xy, 0);
68+
float y = vget_lane_f32(xy, 1);
69+
return x > y ? x : y;
70+
#endif
71+
}
72+
4973
void celt_float2int16_neon(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT out, int cnt);
5074
# if defined(OPUS_HAVE_RTCD) && \
5175
(defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
@@ -60,6 +84,20 @@ extern void
6084
# define OVERRIDE_FLOAT2INT16 (1)
6185
# define celt_float2int16(in, out, cnt, arch) ((void)(arch), celt_float2int16_neon(in, out, cnt))
6286
# endif
87+
88+
int opus_limit2_checkwithin1_neon(float * samples, int cnt);
89+
# if defined(OPUS_HAVE_RTCD) && \
90+
(defined(OPUS_ARM_MAY_HAVE_NEON_INTR) && !defined(OPUS_ARM_PRESUME_NEON_INTR))
91+
extern int (*const OPUS_LIMIT2_CHECKWITHIN1_IMPL[OPUS_ARCHMASK+1])(float * samples, int cnt);
92+
93+
# define OVERRIDE_LIMIT2_CHECKWITHIN1 (1)
94+
# define opus_limit2_checkwithin1(samples, cnt, arch) \
95+
((*OPUS_LIMIT2_CHECKWITHIN1_IMPL[(arch)&OPUS_ARCHMASK])(samples, cnt))
96+
97+
# elif defined(OPUS_ARM_PRESUME_NEON_INTR)
98+
# define OVERRIDE_LIMIT2_CHECKWITHIN1 (1)
99+
# define opus_limit2_checkwithin1(samples, cnt, arch) ((void)(arch), opus_limit2_checkwithin1_neon(samples, cnt))
100+
# endif
63101
# endif
64102

65103
#endif /* MATHOPS_ARM_H */

celt/mathops.c

+20
Original file line numberDiff line numberDiff line change
@@ -229,4 +229,24 @@ void celt_float2int16_c(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT ou
229229
}
230230
}
231231

232+
int opus_limit2_checkwithin1_c(float * samples, int cnt)
233+
{
234+
int i;
235+
if (cnt <= 0)
236+
{
237+
return 1;
238+
}
239+
240+
for (i = 0; i < cnt; i++)
241+
{
242+
float clippedVal = samples[i];
243+
clippedVal = FMAX(-2.0f, clippedVal);
244+
clippedVal = FMIN(2.0f, clippedVal);
245+
samples[i] = clippedVal;
246+
}
247+
248+
/* C implementation can't provide quick hint. Assume it might exceed -1/+1. */
249+
return 0;
250+
}
251+
232252
#endif /* DISABLE_FLOAT_API */

celt/mathops.h

+6
Original file line numberDiff line numberDiff line change
@@ -490,6 +490,12 @@ void celt_float2int16_c(const float * OPUS_RESTRICT in, short * OPUS_RESTRICT ou
490490
#define celt_float2int16(in, out, cnt, arch) ((void)(arch), celt_float2int16_c(in, out, cnt))
491491
#endif
492492

493+
int opus_limit2_checkwithin1_c(float *samples, int cnt);
494+
495+
#ifndef OVERRIDE_LIMIT2_CHECKWITHIN1
496+
#define opus_limit2_checkwithin1(samples, cnt, arch) ((void)(arch), opus_limit2_checkwithin1_c(samples, cnt))
497+
#endif
498+
493499
#endif /* DISABLE_FLOAT_API */
494500

495501
#endif /* MATHOPS_H */

celt/tests/test_unit_mathops.c

+72
Original file line numberDiff line numberDiff line change
@@ -435,6 +435,77 @@ void testcelt_float2int16(int use_ref_impl, int buffer_size)
435435
#undef MAX_BUFFER_SIZE
436436
}
437437

438+
void testopus_limit2_checkwithin1(int use_ref_impl)
439+
{
440+
#define BUFFER_SIZE 37 /* strange float count to trigger residue loop of SIMD implementation */
441+
#define BYTE_COUNT (BUFFER_SIZE * sizeof(float))
442+
int i, within1;
443+
const int arch = opus_select_arch();
444+
445+
float pattern[BUFFER_SIZE], buffer[BUFFER_SIZE];
446+
447+
for (i = 0; i < BUFFER_SIZE; ++i)
448+
{
449+
pattern[i] = i % 2 ? -1.f : 1.f;
450+
}
451+
452+
/* All values within -1..1:
453+
Nothing changed. Return value is implementation-dependent (not expected to recognise nothing exceeds -1..1) */
454+
memcpy(buffer, pattern, BYTE_COUNT);
455+
within1 = use_ref_impl ? opus_limit2_checkwithin1_c(buffer, BUFFER_SIZE) : opus_limit2_checkwithin1(buffer, BUFFER_SIZE, arch);
456+
if (memcmp(buffer, pattern, BYTE_COUNT) != 0)
457+
{
458+
fprintf (stderr, "opus_limit2_checkwithin1() modified values not exceeding -1..1 (ref=%d)\n", use_ref_impl);
459+
ret = 1;
460+
}
461+
462+
/* One value exceeds -1..1, within -2..2:
463+
Values unchanged. Return value says not all values are within -1..1 */
464+
for (i = 0; i < BUFFER_SIZE; ++i)
465+
{
466+
const float replace_value = pattern[i] * 1.001f;
467+
468+
memcpy(buffer, pattern, BYTE_COUNT);
469+
buffer[i] = replace_value;
470+
within1 = use_ref_impl ? opus_limit2_checkwithin1_c(buffer, BUFFER_SIZE) : opus_limit2_checkwithin1(buffer, BUFFER_SIZE, arch);
471+
if (within1 || buffer[i] != replace_value)
472+
{
473+
fprintf (stderr, "opus_limit2_checkwithin1() handled value exceeding -1..1 erroneously (ref=%d, i=%d)\n", use_ref_impl, i);
474+
ret = 1;
475+
}
476+
buffer[i] = pattern[i];
477+
if (memcmp(buffer, pattern, BYTE_COUNT) != 0)
478+
{
479+
fprintf (stderr, "opus_limit2_checkwithin1() modified value within -2..2 (ref=%d, i=%d)\n", use_ref_impl, i);
480+
ret = 1;
481+
}
482+
}
483+
484+
/* One value exceeds -2..2:
485+
One value is hardclipped, others are unchanged. Return value says not all values are within -1..1 */
486+
for (i = 0; i < BUFFER_SIZE; ++i)
487+
{
488+
const float replace_value = pattern[i] * 2.1;
489+
490+
memcpy(buffer, pattern, BYTE_COUNT);
491+
buffer[i] = replace_value;
492+
within1 = use_ref_impl ? opus_limit2_checkwithin1_c(buffer, BUFFER_SIZE) : opus_limit2_checkwithin1(buffer, BUFFER_SIZE, arch);
493+
if (within1 || buffer[i] != (replace_value > 0.f ? 2.f : -2.f))
494+
{
495+
fprintf (stderr, "opus_limit2_checkwithin1() handled value exceeding -2..2 erroneously (ref=%d, i=%d)\n", use_ref_impl, i);
496+
ret = 1;
497+
}
498+
buffer[i] = pattern[i];
499+
if (memcmp(buffer, pattern, BYTE_COUNT) != 0)
500+
{
501+
fprintf (stderr, "opus_limit2_checkwithin1() modified value within -2..2 (ref=%d, i=%d)\n", use_ref_impl, i);
502+
ret = 1;
503+
}
504+
}
505+
#undef BUFFER_SIZE
506+
#undef BYTE_COUNT
507+
}
508+
438509
#endif
439510

440511
int main(void)
@@ -461,6 +532,7 @@ int main(void)
461532
testcelt_float2int16(use_ref_impl[i], 32);
462533
testcelt_float2int16(use_ref_impl[i], 127);
463534
testcelt_float2int16(use_ref_impl[i], 1031);
535+
testopus_limit2_checkwithin1(use_ref_impl[i]);
464536
}
465537
#endif
466538
return ret;

src/opus.c

+30-7
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,5 @@
11
/* Copyright (c) 2011 Xiph.Org Foundation, Skype Limited
2+
Copyright (c) 2024 Arm Limited
23
Written by Jean-Marc Valin and Koen Vos */
34
/*
45
Redistribution and use in source and binary forms, with or without
@@ -30,23 +31,29 @@
3031
#endif
3132

3233
#include "opus.h"
34+
#include "celt/mathops.h"
3335
#include "opus_private.h"
3436

3537
#ifndef DISABLE_FLOAT_API
36-
OPUS_EXPORT void opus_pcm_soft_clip(float *_x, int N, int C, float *declip_mem)
38+
39+
static void opus_pcm_soft_clip_impl(float *_x, int N, int C, float *declip_mem, int arch)
3740
{
3841
int c;
3942
int i;
4043
float *x;
44+
int all_within_neg1pos1;
4145

4246
if (C<1 || N<1 || !_x || !declip_mem) return;
4347

4448
/* First thing: saturate everything to +/- 2 which is the highest level our
4549
non-linearity can handle. At the point where the signal reaches +/-2,
4650
the derivative will be zero anyway, so this doesn't introduce any
47-
discontinuity in the derivative. */
48-
for (i=0;i<N*C;i++)
49-
_x[i] = MAX16(-2.f, MIN16(2.f, _x[i]));
51+
discontinuity in the derivative.
52+
53+
Implementation might provide a hint, if none of the samples exceed +/-1. */
54+
55+
all_within_neg1pos1 = opus_limit2_checkwithin1(_x, N*C, arch);
56+
5057
for (c=0;c<C;c++)
5158
{
5259
float a;
@@ -72,10 +79,15 @@ OPUS_EXPORT void opus_pcm_soft_clip(float *_x, int N, int C, float *declip_mem)
7279
float maxval;
7380
int special=0;
7481
int peak_pos;
75-
for (i=curr;i<N;i++)
82+
if (all_within_neg1pos1)
7683
{
77-
if (x[i*C]>1 || x[i*C]<-1)
78-
break;
84+
i = N;
85+
} else {
86+
for (i=curr;i<N;i++)
87+
{
88+
if (x[i*C]>1 || x[i*C]<-1)
89+
break;
90+
}
7991
}
8092
if (i==N)
8193
{
@@ -135,6 +147,17 @@ OPUS_EXPORT void opus_pcm_soft_clip(float *_x, int N, int C, float *declip_mem)
135147
declip_mem[c] = a;
136148
}
137149
}
150+
151+
void opus_pcm_soft_clip_with_arch(float *_x, int N, int C, float *declip_mem, int arch)
152+
{
153+
opus_pcm_soft_clip_impl(_x, N, C, declip_mem, arch);
154+
}
155+
156+
OPUS_EXPORT void opus_pcm_soft_clip(float *_x, int N, int C, float *declip_mem)
157+
{
158+
opus_pcm_soft_clip_impl(_x, N, C, declip_mem, 0);
159+
}
160+
138161
#endif
139162

140163
int encode_size(int size, unsigned char *data)

src/opus_decoder.c

+1-1
Original file line numberDiff line numberDiff line change
@@ -810,7 +810,7 @@ int opus_decode_native(OpusDecoder *st, const unsigned char *data,
810810
OPUS_PRINT_INT(nb_samples);
811811
#ifndef FIXED_POINT
812812
if (soft_clip)
813-
opus_pcm_soft_clip(pcm, nb_samples, st->channels, st->softclip_mem);
813+
opus_pcm_soft_clip_with_arch(pcm, nb_samples, st->channels, st->softclip_mem, st->arch);
814814
else
815815
st->softclip_mem[0]=st->softclip_mem[1]=0;
816816
#endif

src/opus_private.h

+2
Original file line numberDiff line numberDiff line change
@@ -177,6 +177,8 @@ void downmix_int(const void *_x, opus_val32 *sub, int subframe, int offset, int
177177
void downmix_int24(const void *_x, opus_val32 *sub, int subframe, int offset, int c1, int c2, int C);
178178
int is_digital_silence(const opus_res* pcm, int frame_size, int channels, int lsb_depth);
179179

180+
void opus_pcm_soft_clip_with_arch(float *_x, int N, int C, float *declip_mem, int arch);
181+
180182
int encode_size(int size, unsigned char *data);
181183

182184
opus_int32 frame_size_select(opus_int32 frame_size, int variable_duration, opus_int32 Fs);

0 commit comments

Comments
 (0)