/*
Copyright 2018 Google Inc. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
*/

// Prevent Visual Studio from complaining about std::copy_n.
#if defined(_WIN32)
#define _SCL_SECURE_NO_WARNINGS
#endif

#include "base/simd_utils.h"

#include <algorithm>
#include <limits>

#include "base/constants_and_types.h"
#include "base/logging.h"
#include "base/misc_math.h"
#include "base/simd_macros.h"


namespace vraudio {

namespace {

#ifdef SIMD_NEON
// Deinterleaving operates on 8 int16s at a time.
const size_t kSixteenBitSimdLength = SIMD_LENGTH * 2;
#endif  // SIMD_NEON

// Float format of max and min values storable in an int16_t, for clamping.
const float kInt16Max = static_cast<float>(0x7FFF);
const float kInt16Min = static_cast<float>(-0x7FFF);

// Conversion factors between float and int16_t (both directions).
const float kFloatFromInt16 = 1.0f / kInt16Max;
const float kInt16FromFloat = kInt16Max;

// Expected SIMD alignment in bytes.
const size_t kSimdSizeBytes = 16;

inline size_t GetNumChunks(size_t length) { return length / SIMD_LENGTH; }

inline size_t GetLeftoverSamples(size_t length) { return length % SIMD_LENGTH; }
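
// Example of the chunking arithmetic above (an illustration, assuming
// SIMD_LENGTH is 4, i.e. one 128-bit vector of floats as with SSE or NEON):
// a buffer of length 10 is processed as GetNumChunks(10) == 2 vector chunks
// covering samples [0, 8), while GetLeftoverSamples(10) == 2 samples are
// handled by the scalar tail loop at the end of each function below.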

template <typename T>
inline bool IsAlignedTemplated(const T* pointer) {
  return reinterpret_cast<uintptr_t>(pointer) % kSimdSizeBytes == 0;
}

#ifdef SIMD_DISABLED
// Calculates the approximate complex magnitude of z = real + i * imaginary.
inline void ComplexMagnitude(float real, float imaginary, float* output) {
  *output = real * real + imaginary * imaginary;
  // The value of |output| is not being recalculated, simply modified.
  *output = 1.0f / FastReciprocalSqrt(*output);
}
#endif  // defined(SIMD_DISABLED)

}  // namespace

bool IsAligned(const float* pointer) {
  return IsAlignedTemplated<float>(pointer);
}

bool IsAligned(const int16_t* pointer) {
  return IsAlignedTemplated<int16_t>(pointer);
}

size_t FindNextAlignedArrayIndex(size_t length, size_t type_size_bytes,
                                 size_t memory_alignment_bytes) {
  const size_t byte_length = type_size_bytes * length;
  const size_t unaligned_bytes = byte_length % memory_alignment_bytes;
  const size_t bytes_to_next_aligned =
      (unaligned_bytes == 0) ? 0 : memory_alignment_bytes - unaligned_bytes;
  return (byte_length + bytes_to_next_aligned) / type_size_bytes;
}
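
// Worked example for FindNextAlignedArrayIndex (illustrative only, not part
// of the original source): with 16-byte alignment,
// FindNextAlignedArrayIndex(5, sizeof(float), 16) computes byte_length = 20,
// unaligned_bytes = 4, bytes_to_next_aligned = 12, and returns
// (20 + 12) / 4 == 8, i.e. the next float index whose byte offset is a
// multiple of the requested alignment.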

void AddPointwise(size_t length, const float* input_a, const float* input_b,
                  float* output) {
  DCHECK(input_a);
  DCHECK(input_b);
  DCHECK(output);

  const SimdVector* input_a_vector =
      reinterpret_cast<const SimdVector*>(input_a);
  const SimdVector* input_b_vector =
      reinterpret_cast<const SimdVector*>(input_b);
  SimdVector* output_vector = reinterpret_cast<SimdVector*>(output);
#ifdef SIMD_SSE
  const size_t num_chunks = GetNumChunks(length);
  const bool inputs_aligned = IsAligned(input_a) && IsAligned(input_b);
  const bool output_aligned = IsAligned(output);
  if (inputs_aligned && output_aligned) {
    for (size_t i = 0; i < num_chunks; ++i) {
      output_vector[i] = SIMD_ADD(input_a_vector[i], input_b_vector[i]);
    }
  } else if (inputs_aligned) {
    for (size_t i = 0; i < num_chunks; ++i) {
      const SimdVector output_temp =
          SIMD_ADD(input_a_vector[i], input_b_vector[i]);
      _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp);
    }
  } else if (output_aligned) {
    for (size_t i = 0; i < num_chunks; ++i) {
      const SimdVector input_a_temp = _mm_loadu_ps(&input_a[i * SIMD_LENGTH]);
      const SimdVector input_b_temp = _mm_loadu_ps(&input_b[i * SIMD_LENGTH]);
      output_vector[i] = SIMD_ADD(input_a_temp, input_b_temp);
    }
  } else {
    for (size_t i = 0; i < num_chunks; ++i) {
      const SimdVector input_a_temp = _mm_loadu_ps(&input_a[i * SIMD_LENGTH]);
      const SimdVector input_b_temp = _mm_loadu_ps(&input_b[i * SIMD_LENGTH]);
      const SimdVector output_temp = SIMD_ADD(input_a_temp, input_b_temp);
      _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp);
    }
  }
#else
  for (size_t i = 0; i < GetNumChunks(length); ++i) {
    output_vector[i] = SIMD_ADD(input_a_vector[i], input_b_vector[i]);
  }
#endif  // SIMD_SSE

  // Add samples at the end that were missed by the SIMD chunking.
  const size_t leftover_samples = GetLeftoverSamples(length);
  DCHECK_GE(length, leftover_samples);
  for (size_t i = length - leftover_samples; i < length; ++i) {
    output[i] = input_a[i] + input_b[i];
  }
}
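
// Minimal usage sketch for AddPointwise (illustrative only; buffer names are
// hypothetical and not part of the library):
//
//   std::vector<float> dry(512), wet(512), mix(512);
//   AddPointwise(dry.size(), dry.data(), wet.data(), mix.data());
//
// The function accepts any pointer alignment; 16-byte aligned buffers simply
// take the fully aligned (fastest) branch above.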

void SubtractPointwise(size_t length, const float* input_a,
                       const float* input_b, float* output) {
  DCHECK(input_a);
  DCHECK(input_b);
  DCHECK(output);

  const SimdVector* input_a_vector =
      reinterpret_cast<const SimdVector*>(input_a);
  const SimdVector* input_b_vector =
      reinterpret_cast<const SimdVector*>(input_b);
  SimdVector* output_vector = reinterpret_cast<SimdVector*>(output);

#ifdef SIMD_SSE
  const size_t num_chunks = GetNumChunks(length);
  const bool inputs_aligned = IsAligned(input_a) && IsAligned(input_b);
  const bool output_aligned = IsAligned(output);
  if (inputs_aligned && output_aligned) {
    for (size_t i = 0; i < num_chunks; ++i) {
      output_vector[i] = SIMD_SUB(input_b_vector[i], input_a_vector[i]);
    }
  } else if (inputs_aligned) {
    for (size_t i = 0; i < num_chunks; ++i) {
      const SimdVector output_temp =
          SIMD_SUB(input_b_vector[i], input_a_vector[i]);
      _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp);
    }
  } else if (output_aligned) {
    for (size_t i = 0; i < num_chunks; ++i) {
      const SimdVector input_a_temp = _mm_loadu_ps(&input_a[i * SIMD_LENGTH]);
      const SimdVector input_b_temp = _mm_loadu_ps(&input_b[i * SIMD_LENGTH]);
      output_vector[i] = SIMD_SUB(input_b_temp, input_a_temp);
    }
  } else {
    for (size_t i = 0; i < num_chunks; ++i) {
      const SimdVector input_a_temp = _mm_loadu_ps(&input_a[i * SIMD_LENGTH]);
      const SimdVector input_b_temp = _mm_loadu_ps(&input_b[i * SIMD_LENGTH]);
      const SimdVector output_temp = SIMD_SUB(input_b_temp, input_a_temp);
      _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp);
    }
  }
#else
  for (size_t i = 0; i < GetNumChunks(length); ++i) {
    output_vector[i] = SIMD_SUB(input_b_vector[i], input_a_vector[i]);
  }
#endif  // SIMD_SSE

  // Subtract samples at the end that were missed by the SIMD chunking.
  const size_t leftover_samples = GetLeftoverSamples(length);
  DCHECK_GE(length, leftover_samples);
  for (size_t i = length - leftover_samples; i < length; ++i) {
    output[i] = input_b[i] - input_a[i];
  }
}
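
// Note on argument order: as the scalar tail loop above shows,
// SubtractPointwise computes output = input_b - input_a, i.e. the first input
// is the subtrahend. Illustrative call (hypothetical buffer names):
//
//   // residual[i] = recorded[i] - predicted[i]
//   SubtractPointwise(length, predicted, recorded, residual);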

void MultiplyPointwise(size_t length, const float* input_a,
                       const float* input_b, float* output) {
  DCHECK(input_a);
  DCHECK(input_b);
  DCHECK(output);

  const SimdVector* input_a_vector =
      reinterpret_cast<const SimdVector*>(input_a);
  const SimdVector* input_b_vector =
      reinterpret_cast<const SimdVector*>(input_b);
  SimdVector* output_vector = reinterpret_cast<SimdVector*>(output);

#ifdef SIMD_SSE
  const size_t num_chunks = GetNumChunks(length);
  const bool inputs_aligned = IsAligned(input_a) && IsAligned(input_b);
  const bool output_aligned = IsAligned(output);
  if (inputs_aligned && output_aligned) {
    for (size_t i = 0; i < num_chunks; ++i) {
      output_vector[i] = SIMD_MULTIPLY(input_a_vector[i], input_b_vector[i]);
    }
  } else if (inputs_aligned) {
    for (size_t i = 0; i < num_chunks; ++i) {
      const SimdVector output_temp =
          SIMD_MULTIPLY(input_a_vector[i], input_b_vector[i]);
      _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp);
    }
  } else if (output_aligned) {
    for (size_t i = 0; i < num_chunks; ++i) {
      const SimdVector input_a_temp = _mm_loadu_ps(&input_a[i * SIMD_LENGTH]);
      const SimdVector input_b_temp = _mm_loadu_ps(&input_b[i * SIMD_LENGTH]);
      output_vector[i] = SIMD_MULTIPLY(input_a_temp, input_b_temp);
    }
  } else {
    for (size_t i = 0; i < num_chunks; ++i) {
      const SimdVector input_a_temp = _mm_loadu_ps(&input_a[i * SIMD_LENGTH]);
      const SimdVector input_b_temp = _mm_loadu_ps(&input_b[i * SIMD_LENGTH]);
      const SimdVector output_temp = SIMD_MULTIPLY(input_a_temp, input_b_temp);
      _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp);
    }
  }
#else
  for (size_t i = 0; i < GetNumChunks(length); ++i) {
    output_vector[i] = SIMD_MULTIPLY(input_a_vector[i], input_b_vector[i]);
  }
#endif  // SIMD_SSE

  // Multiply samples at the end that were missed by the SIMD chunking.
  const size_t leftover_samples = GetLeftoverSamples(length);
  DCHECK_GE(length, leftover_samples);
  for (size_t i = length - leftover_samples; i < length; ++i) {
    output[i] = input_a[i] * input_b[i];
  }
}

void MultiplyAndAccumulatePointwise(size_t length, const float* input_a,
                                    const float* input_b, float* accumulator) {
  DCHECK(input_a);
  DCHECK(input_b);
  DCHECK(accumulator);

  const SimdVector* input_a_vector =
      reinterpret_cast<const SimdVector*>(input_a);
  const SimdVector* input_b_vector =
      reinterpret_cast<const SimdVector*>(input_b);
  SimdVector* accumulator_vector = reinterpret_cast<SimdVector*>(accumulator);

#ifdef SIMD_SSE
  const size_t num_chunks = GetNumChunks(length);
  const bool inputs_aligned = IsAligned(input_a) && IsAligned(input_b);
  const bool accumulator_aligned = IsAligned(accumulator);
  if (inputs_aligned && accumulator_aligned) {
    for (size_t i = 0; i < num_chunks; ++i) {
      accumulator_vector[i] = SIMD_MULTIPLY_ADD(
          input_a_vector[i], input_b_vector[i], accumulator_vector[i]);
    }
  } else if (inputs_aligned) {
    for (size_t i = 0; i < num_chunks; ++i) {
      SimdVector accumulator_temp = _mm_loadu_ps(&accumulator[i * SIMD_LENGTH]);
      accumulator_temp = SIMD_MULTIPLY_ADD(input_a_vector[i], input_b_vector[i],
                                           accumulator_temp);
      _mm_storeu_ps(&accumulator[i * SIMD_LENGTH], accumulator_temp);
    }
  } else if (accumulator_aligned) {
    for (size_t i = 0; i < num_chunks; ++i) {
      const SimdVector input_a_temp = _mm_loadu_ps(&input_a[i * SIMD_LENGTH]);
      const SimdVector input_b_temp = _mm_loadu_ps(&input_b[i * SIMD_LENGTH]);
      accumulator_vector[i] =
          SIMD_MULTIPLY_ADD(input_a_temp, input_b_temp, accumulator_vector[i]);
    }
  } else {
    for (size_t i = 0; i < num_chunks; ++i) {
      const SimdVector input_a_temp = _mm_loadu_ps(&input_a[i * SIMD_LENGTH]);
      const SimdVector input_b_temp = _mm_loadu_ps(&input_b[i * SIMD_LENGTH]);
      SimdVector accumulator_temp = _mm_loadu_ps(&accumulator[i * SIMD_LENGTH]);
      accumulator_temp =
          SIMD_MULTIPLY_ADD(input_a_temp, input_b_temp, accumulator_temp);
      _mm_storeu_ps(&accumulator[i * SIMD_LENGTH], accumulator_temp);
    }
  }
#else
  for (size_t i = 0; i < GetNumChunks(length); ++i) {
    accumulator_vector[i] = SIMD_MULTIPLY_ADD(
        input_a_vector[i], input_b_vector[i], accumulator_vector[i]);
  }
#endif  // SIMD_SSE

  // Multiply and accumulate samples at the end that were missed by the SIMD
  // chunking.
  const size_t leftover_samples = GetLeftoverSamples(length);
  DCHECK_GE(length, leftover_samples);
  for (size_t i = length - leftover_samples; i < length; ++i) {
    accumulator[i] += input_a[i] * input_b[i];
  }
}
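
// Minimal usage sketch (illustrative only, hypothetical buffer names):
// applying a per-sample envelope to a voice and mixing the result into a bus,
// a typical use of the pointwise multiply and multiply-accumulate forms above.
//
//   MultiplyPointwise(length, envelope, voice, shaped);           // shaped = envelope * voice
//   MultiplyAndAccumulatePointwise(length, envelope, voice, bus);  // bus += envelope * voice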

void ScalarMultiply(size_t length, float gain, const float* input,
                    float* output) {
  DCHECK(input);
  DCHECK(output);

  const SimdVector* input_vector = reinterpret_cast<const SimdVector*>(input);
  SimdVector* output_vector = reinterpret_cast<SimdVector*>(output);

  const SimdVector gain_vector = SIMD_LOAD_ONE_FLOAT(gain);
#ifdef SIMD_SSE
  const size_t num_chunks = GetNumChunks(length);
  const bool input_aligned = IsAligned(input);
  const bool output_aligned = IsAligned(output);
  if (input_aligned && output_aligned) {
    for (size_t i = 0; i < num_chunks; ++i) {
      output_vector[i] = SIMD_MULTIPLY(gain_vector, input_vector[i]);
    }
  } else if (input_aligned) {
    for (size_t i = 0; i < num_chunks; ++i) {
      const SimdVector output_temp =
          SIMD_MULTIPLY(gain_vector, input_vector[i]);
      _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp);
    }
  } else if (output_aligned) {
    for (size_t i = 0; i < num_chunks; ++i) {
      const SimdVector input_temp = _mm_loadu_ps(&input[i * SIMD_LENGTH]);
      output_vector[i] = SIMD_MULTIPLY(gain_vector, input_temp);
    }
  } else {
    for (size_t i = 0; i < num_chunks; ++i) {
      const SimdVector input_temp = _mm_loadu_ps(&input[i * SIMD_LENGTH]);
      const SimdVector output_temp = SIMD_MULTIPLY(gain_vector, input_temp);
      _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp);
    }
  }
#else
  for (size_t i = 0; i < GetNumChunks(length); ++i) {
    output_vector[i] = SIMD_MULTIPLY(gain_vector, input_vector[i]);
  }
#endif  // SIMD_SSE

  // Apply gain to samples at the end that were missed by the SIMD chunking.
  const size_t leftover_samples = GetLeftoverSamples(length);
  DCHECK_GE(length, leftover_samples);
  for (size_t i = length - leftover_samples; i < length; ++i) {
    output[i] = input[i] * gain;
  }
}

void ScalarMultiplyAndAccumulate(size_t length, float gain, const float* input,
                                 float* accumulator) {
  DCHECK(input);
  DCHECK(accumulator);

  const SimdVector* input_vector = reinterpret_cast<const SimdVector*>(input);
  SimdVector* accumulator_vector = reinterpret_cast<SimdVector*>(accumulator);

  const SimdVector gain_vector = SIMD_LOAD_ONE_FLOAT(gain);
#ifdef SIMD_SSE
  const size_t num_chunks = GetNumChunks(length);
  const bool input_aligned = IsAligned(input);
  const bool accumulator_aligned = IsAligned(accumulator);
  if (input_aligned && accumulator_aligned) {
    for (size_t i = 0; i < num_chunks; ++i) {
      accumulator_vector[i] = SIMD_MULTIPLY_ADD(gain_vector, input_vector[i],
                                                accumulator_vector[i]);
    }
  } else if (input_aligned) {
    for (size_t i = 0; i < num_chunks; ++i) {
      SimdVector accumulator_temp = _mm_loadu_ps(&accumulator[i * SIMD_LENGTH]);
      accumulator_temp =
          SIMD_MULTIPLY_ADD(gain_vector, input_vector[i], accumulator_temp);
      _mm_storeu_ps(&accumulator[i * SIMD_LENGTH], accumulator_temp);
    }
  } else if (accumulator_aligned) {
    for (size_t i = 0; i < num_chunks; ++i) {
      const SimdVector input_temp = _mm_loadu_ps(&input[i * SIMD_LENGTH]);
      accumulator_vector[i] =
          SIMD_MULTIPLY_ADD(gain_vector, input_temp, accumulator_vector[i]);
    }
  } else {
    for (size_t i = 0; i < num_chunks; ++i) {
      const SimdVector input_temp = _mm_loadu_ps(&input[i * SIMD_LENGTH]);
      SimdVector accumulator_temp = _mm_loadu_ps(&accumulator[i * SIMD_LENGTH]);
      accumulator_temp =
          SIMD_MULTIPLY_ADD(gain_vector, input_temp, accumulator_temp);
      _mm_storeu_ps(&accumulator[i * SIMD_LENGTH], accumulator_temp);
    }
  }
#else
  for (size_t i = 0; i < GetNumChunks(length); ++i) {
    accumulator_vector[i] =
        SIMD_MULTIPLY_ADD(gain_vector, input_vector[i], accumulator_vector[i]);
  }
#endif  // SIMD_SSE

  // Apply gain and accumulate to samples at the end that were missed by the
  // SIMD chunking.
  const size_t leftover_samples = GetLeftoverSamples(length);
  DCHECK_GE(length, leftover_samples);
  for (size_t i = length - leftover_samples; i < length; ++i) {
    accumulator[i] += input[i] * gain;
  }
}

void ReciprocalSqrt(size_t length, const float* input, float* output) {
  DCHECK(input);
  DCHECK(output);

#if !defined(SIMD_DISABLED)
  const SimdVector* input_vector = reinterpret_cast<const SimdVector*>(input);
  SimdVector* output_vector = reinterpret_cast<SimdVector*>(output);
#endif  // !defined(SIMD_DISABLED)

#ifdef SIMD_SSE
  const size_t num_chunks = GetNumChunks(length);
  const bool input_aligned = IsAligned(input);
  const bool output_aligned = IsAligned(output);
  if (input_aligned && output_aligned) {
    for (size_t i = 0; i < num_chunks; ++i) {
      output_vector[i] = SIMD_RECIPROCAL_SQRT(input_vector[i]);
    }
  } else if (input_aligned) {
    for (size_t i = 0; i < num_chunks; ++i) {
      const SimdVector output_temp = SIMD_RECIPROCAL_SQRT(input_vector[i]);
      _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp);
    }
  } else if (output_aligned) {
    for (size_t i = 0; i < num_chunks; ++i) {
      const SimdVector input_temp = _mm_loadu_ps(&input[i * SIMD_LENGTH]);
      output_vector[i] = SIMD_RECIPROCAL_SQRT(input_temp);
    }
  } else {
    for (size_t i = 0; i < num_chunks; ++i) {
      const SimdVector input_temp = _mm_loadu_ps(&input[i * SIMD_LENGTH]);
      const SimdVector output_temp = SIMD_RECIPROCAL_SQRT(input_temp);
      _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp);
    }
  }
#elif defined SIMD_NEON
  for (size_t i = 0; i < GetNumChunks(length); ++i) {
    output_vector[i] = SIMD_RECIPROCAL_SQRT(input_vector[i]);
  }
#endif  // SIMD_SSE

  // Apply to samples at the end that were missed by the SIMD chunking.
  const size_t leftover_samples = GetLeftoverSamples(length);
  DCHECK_GE(length, leftover_samples);
  for (size_t i = length - leftover_samples; i < length; ++i) {
    output[i] = FastReciprocalSqrt(input[i]);
  }
}
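
// Accuracy note (an observation, not a guarantee from the original authors):
// SIMD_RECIPROCAL_SQRT is expected to map to the approximate hardware
// reciprocal square root (e.g. rsqrtps on SSE, vrsqrte on NEON), and the
// scalar tail above uses FastReciprocalSqrt(), so the results are fast
// approximations of 1/sqrt(x) rather than fully precise values. This is
// generally sufficient for gain and normalization computations in audio.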

void Sqrt(size_t length, const float* input, float* output) {
  DCHECK(input);
  DCHECK(output);

#if !defined(SIMD_DISABLED)
  const SimdVector* input_vector = reinterpret_cast<const SimdVector*>(input);
  SimdVector* output_vector = reinterpret_cast<SimdVector*>(output);
#endif  // !defined(SIMD_DISABLED)

#ifdef SIMD_SSE
  const size_t num_chunks = GetNumChunks(length);
  const bool input_aligned = IsAligned(input);
  const bool output_aligned = IsAligned(output);
  if (input_aligned && output_aligned) {
    for (size_t i = 0; i < num_chunks; ++i) {
      output_vector[i] = SIMD_SQRT(input_vector[i]);
    }
  } else if (input_aligned) {
    for (size_t i = 0; i < num_chunks; ++i) {
      const SimdVector output_temp = SIMD_SQRT(input_vector[i]);
      _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp);
    }
  } else if (output_aligned) {
    for (size_t i = 0; i < num_chunks; ++i) {
      const SimdVector input_temp = _mm_loadu_ps(&input[i * SIMD_LENGTH]);
      output_vector[i] = SIMD_SQRT(input_temp);
    }
  } else {
    for (size_t i = 0; i < num_chunks; ++i) {
      const SimdVector input_temp = _mm_loadu_ps(&input[i * SIMD_LENGTH]);
      const SimdVector output_temp = SIMD_SQRT(input_temp);
      _mm_storeu_ps(&output[i * SIMD_LENGTH], output_temp);
    }
  }
#elif defined SIMD_NEON
  for (size_t i = 0; i < GetNumChunks(length); ++i) {
    // This should be faster than using a sqrt method: https://goo.gl/XRKwFp
    output_vector[i] = SIMD_SQRT(input_vector[i]);
  }
#endif  // SIMD_SSE

  // Apply to samples at the end that were missed by the SIMD chunking.
  const size_t leftover_samples = GetLeftoverSamples(length);
  DCHECK_GE(length, leftover_samples);
  for (size_t i = length - leftover_samples; i < length; ++i) {
    output[i] = 1.0f / FastReciprocalSqrt(input[i]);
  }
}

void ApproxComplexMagnitude(size_t length, const float* input, float* output) {
  DCHECK(input);
  DCHECK(output);

#if !defined(SIMD_DISABLED)
  const SimdVector* input_vector = reinterpret_cast<const SimdVector*>(input);
  SimdVector* output_vector = reinterpret_cast<SimdVector*>(output);
  const size_t num_chunks = GetNumChunks(length);
  const bool input_aligned = IsAligned(input);
  const bool output_aligned = IsAligned(output);
#endif  // !defined(SIMD_DISABLED)

#ifdef SIMD_SSE
  if (input_aligned && output_aligned) {
    for (size_t out_index = 0; out_index < num_chunks; ++out_index) {
      const size_t first_index = out_index * 2;
      const size_t second_index = first_index + 1;
      const SimdVector squared_1 =
          SIMD_MULTIPLY(input_vector[first_index], input_vector[first_index]);
      const SimdVector squared_2 =
          SIMD_MULTIPLY(input_vector[second_index], input_vector[second_index]);
      const SimdVector unshuffled_1 =
          _mm_shuffle_ps(squared_1, squared_2, _MM_SHUFFLE(2, 0, 2, 0));
      const SimdVector unshuffled_2 =
          _mm_shuffle_ps(squared_1, squared_2, _MM_SHUFFLE(3, 1, 3, 1));
      output_vector[out_index] = SIMD_ADD(unshuffled_1, unshuffled_2);
      output_vector[out_index] = SIMD_SQRT(output_vector[out_index]);
    }
  } else if (input_aligned) {
    for (size_t out_index = 0; out_index < num_chunks; ++out_index) {
      const size_t first_index = out_index * 2;
      const size_t second_index = first_index + 1;
      const SimdVector squared_1 =
          SIMD_MULTIPLY(input_vector[first_index], input_vector[first_index]);
      const SimdVector squared_2 =
          SIMD_MULTIPLY(input_vector[second_index], input_vector[second_index]);
      const SimdVector unshuffled_1 =
          _mm_shuffle_ps(squared_1, squared_2, _MM_SHUFFLE(2, 0, 2, 0));
      const SimdVector unshuffled_2 =
          _mm_shuffle_ps(squared_1, squared_2, _MM_SHUFFLE(3, 1, 3, 1));
      SimdVector output_temp = SIMD_ADD(unshuffled_1, unshuffled_2);
      output_temp = SIMD_SQRT(output_temp);
      _mm_storeu_ps(&output[out_index * SIMD_LENGTH], output_temp);
    }
  } else if (output_aligned) {
    for (size_t out_index = 0; out_index < num_chunks; ++out_index) {
      const size_t first_index = out_index * 2;
      const size_t second_index = first_index + 1;
      const SimdVector first_temp =
          _mm_loadu_ps(&input[first_index * SIMD_LENGTH]);
      const SimdVector second_temp =
          _mm_loadu_ps(&input[second_index * SIMD_LENGTH]);
      const SimdVector squared_1 = SIMD_MULTIPLY(first_temp, first_temp);
      const SimdVector squared_2 = SIMD_MULTIPLY(second_temp, second_temp);
      const SimdVector unshuffled_1 =
          _mm_shuffle_ps(squared_1, squared_2, _MM_SHUFFLE(2, 0, 2, 0));
      const SimdVector unshuffled_2 =
          _mm_shuffle_ps(squared_1, squared_2, _MM_SHUFFLE(3, 1, 3, 1));
      output_vector[out_index] = SIMD_ADD(unshuffled_1, unshuffled_2);
      output_vector[out_index] = SIMD_SQRT(output_vector[out_index]);
    }
  } else {
    for (size_t out_index = 0; out_index < num_chunks; ++out_index) {
      const size_t first_index = out_index * 2;
      const size_t second_index = first_index + 1;
      const SimdVector first_temp =
          _mm_loadu_ps(&input[first_index * SIMD_LENGTH]);
      const SimdVector second_temp =
          _mm_loadu_ps(&input[second_index * SIMD_LENGTH]);
      const SimdVector squared_1 = SIMD_MULTIPLY(first_temp, first_temp);
      const SimdVector squared_2 = SIMD_MULTIPLY(second_temp, second_temp);
      const SimdVector unshuffled_1 =
          _mm_shuffle_ps(squared_1, squared_2, _MM_SHUFFLE(2, 0, 2, 0));
      const SimdVector unshuffled_2 =
          _mm_shuffle_ps(squared_1, squared_2, _MM_SHUFFLE(3, 1, 3, 1));
      SimdVector output_temp = SIMD_ADD(unshuffled_1, unshuffled_2);
      output_temp = SIMD_SQRT(output_temp);
      _mm_storeu_ps(&output[out_index * SIMD_LENGTH], output_temp);
    }
  }
#elif defined SIMD_NEON
  if (input_aligned && output_aligned) {
    for (size_t out_index = 0; out_index < num_chunks; ++out_index) {
      const size_t first_index = out_index * 2;
      const size_t second_index = first_index + 1;
      const SimdVector squared_1 =
          SIMD_MULTIPLY(input_vector[first_index], input_vector[first_index]);
      const SimdVector squared_2 =
          SIMD_MULTIPLY(input_vector[second_index], input_vector[second_index]);
      const float32x4x2_t unshuffled = vuzpq_f32(squared_1, squared_2);
      output_vector[out_index] = SIMD_ADD(unshuffled.val[0], unshuffled.val[1]);
      output_vector[out_index] = SIMD_SQRT(output_vector[out_index]);
    }
  } else if (input_aligned) {
    for (size_t out_index = 0; out_index < num_chunks; ++out_index) {
      const size_t first_index = out_index * 2;
      const size_t second_index = first_index + 1;
      const SimdVector squared_1 =
          SIMD_MULTIPLY(input_vector[first_index], input_vector[first_index]);
      const SimdVector squared_2 =
          SIMD_MULTIPLY(input_vector[second_index], input_vector[second_index]);
      const float32x4x2_t unshuffled = vuzpq_f32(squared_1, squared_2);
      SimdVector output_temp = SIMD_ADD(unshuffled.val[0], unshuffled.val[1]);
      output_temp = SIMD_SQRT(output_temp);
      vst1q_f32(&output[out_index * SIMD_LENGTH], output_temp);
    }
  } else if (output_aligned) {
    for (size_t out_index = 0; out_index < num_chunks; ++out_index) {
      const size_t first_index = out_index * 2;
      const size_t second_index = first_index + 1;
      const SimdVector first_temp =
          vld1q_f32(&input[first_index * SIMD_LENGTH]);
      const SimdVector second_temp =
          vld1q_f32(&input[second_index * SIMD_LENGTH]);
      const SimdVector squared_1 = SIMD_MULTIPLY(first_temp, first_temp);
      const SimdVector squared_2 = SIMD_MULTIPLY(second_temp, second_temp);
      const float32x4x2_t unshuffled = vuzpq_f32(squared_1, squared_2);
      output_vector[out_index] = SIMD_ADD(unshuffled.val[0], unshuffled.val[1]);
      output_vector[out_index] = SIMD_SQRT(output_vector[out_index]);
    }
  } else {
    for (size_t out_index = 0; out_index < num_chunks; ++out_index) {
      const size_t first_index = out_index * 2;
      const size_t second_index = first_index + 1;
      const SimdVector first_temp =
          vld1q_f32(&input[first_index * SIMD_LENGTH]);
      const SimdVector second_temp =
          vld1q_f32(&input[second_index * SIMD_LENGTH]);
      const SimdVector squared_1 = SIMD_MULTIPLY(first_temp, first_temp);
      const SimdVector squared_2 = SIMD_MULTIPLY(second_temp, second_temp);
      const float32x4x2_t unshuffled = vuzpq_f32(squared_1, squared_2);
      SimdVector output_temp = SIMD_ADD(unshuffled.val[0], unshuffled.val[1]);
      output_temp = SIMD_SQRT(output_temp);
      vst1q_f32(&output[out_index * SIMD_LENGTH], output_temp);
    }
  }
#endif  // SIMD_SSE

  // Apply to samples at the end that were missed by the SIMD chunking.
  const size_t leftover_samples = GetLeftoverSamples(length);
  DCHECK_GE(length, leftover_samples);
  for (size_t i = length - leftover_samples; i < length; ++i) {
    const size_t real_index = i * 2;
    const size_t imag_index = real_index + 1;
    const float squared_sum = (input[real_index] * input[real_index]) +
                              (input[imag_index] * input[imag_index]);
    output[i] = 1.0f / FastReciprocalSqrt(squared_sum);
  }
}
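
// Data layout handled by ApproxComplexMagnitude (restating the code above
// with illustrative values): |input| holds interleaved complex bins
// [re0, im0, re1, im1, ...] of size 2 * length, and |output| receives one
// magnitude per bin. For example, an input pair [3.0f, 4.0f] yields an output
// of approximately 5.0f (approximate because the fast reciprocal square root
// is used).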

void ComplexInterleavedFormatFromMagnitudeAndSinCosPhase(
    size_t length, const float* magnitude, const float* cos_phase,
    const float* sin_phase, float* complex_interleaved_format_output) {
  size_t leftover_samples = 0;
#ifdef SIMD_NEON
  if (IsAligned(complex_interleaved_format_output) && IsAligned(cos_phase) &&
      IsAligned(sin_phase) && IsAligned(magnitude)) {
    const SimdVector* cos_vec = reinterpret_cast<const SimdVector*>(cos_phase);
    const SimdVector* sin_vec = reinterpret_cast<const SimdVector*>(sin_phase);
    const SimdVector* magnitude_vec =
        reinterpret_cast<const SimdVector*>(magnitude);

    const size_t num_chunks = GetNumChunks(length);
    float32x4x2_t interleaved_pair;

    SimdVector* interleaved_vec =
        reinterpret_cast<SimdVector*>(complex_interleaved_format_output);
    for (size_t i = 0, j = 0; j < num_chunks; ++i, j += 2) {
      interleaved_pair = vzipq_f32(cos_vec[i], sin_vec[i]);
      interleaved_vec[j] =
          SIMD_MULTIPLY(interleaved_pair.val[0], magnitude_vec[i]);
      interleaved_vec[j + 1] =
          SIMD_MULTIPLY(interleaved_pair.val[1], magnitude_vec[i]);
    }

    leftover_samples = GetLeftoverSamples(length);
  }
#endif  // SIMD_NEON
  DCHECK_EQ(leftover_samples % 2U, 0U);
  for (size_t i = leftover_samples, j = leftover_samples / 2; i < length;
       i += 2, ++j) {
    const size_t imaginary_offset = i + 1;
    complex_interleaved_format_output[i] = magnitude[j] * cos_phase[j];
    complex_interleaved_format_output[imaginary_offset] =
        magnitude[j] * sin_phase[j];
  }
}
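
// The function above reconstructs each complex bin from polar form:
// real = magnitude * cos(phase) and imaginary = magnitude * sin(phase),
// written back in the interleaved [re, im] layout. The cosine and sine values
// are supplied precomputed, which is why the NEON path only needs multiplies
// and an interleaving zip.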

void StereoFromMonoSimd(size_t length, const float* mono, float* left,
                        float* right) {
  ScalarMultiply(length, kInverseSqrtTwo, mono, left);
  std::copy_n(left, length, right);
}

void MonoFromStereoSimd(size_t length, const float* left, const float* right,
                        float* mono) {
  DCHECK(left);
  DCHECK(right);
  DCHECK(mono);

  const SimdVector* left_vector = reinterpret_cast<const SimdVector*>(left);
  const SimdVector* right_vector = reinterpret_cast<const SimdVector*>(right);
  SimdVector* mono_vector = reinterpret_cast<SimdVector*>(mono);

  const SimdVector inv_root_two_vec = SIMD_LOAD_ONE_FLOAT(kInverseSqrtTwo);
#ifdef SIMD_SSE
  const size_t num_chunks = GetNumChunks(length);
  const bool inputs_aligned = IsAligned(left) && IsAligned(right);
  const bool mono_aligned = IsAligned(mono);
  if (inputs_aligned && mono_aligned) {
    for (size_t i = 0; i < num_chunks; ++i) {
      mono_vector[i] = SIMD_MULTIPLY(inv_root_two_vec,
                                     SIMD_ADD(left_vector[i], right_vector[i]));
    }
  } else if (inputs_aligned) {
    for (size_t i = 0; i < num_chunks; ++i) {
      const SimdVector mono_temp = SIMD_MULTIPLY(
          inv_root_two_vec, SIMD_ADD(left_vector[i], right_vector[i]));
      _mm_storeu_ps(&mono[i * SIMD_LENGTH], mono_temp);
    }
  } else if (mono_aligned) {
    for (size_t i = 0; i < num_chunks; ++i) {
      const SimdVector left_temp = _mm_loadu_ps(&left[i * SIMD_LENGTH]);
      const SimdVector right_temp = _mm_loadu_ps(&right[i * SIMD_LENGTH]);
      mono_vector[i] =
          SIMD_MULTIPLY(inv_root_two_vec, SIMD_ADD(left_temp, right_temp));
    }
  } else {
    for (size_t i = 0; i < num_chunks; ++i) {
      const SimdVector left_temp = _mm_loadu_ps(&left[i * SIMD_LENGTH]);
      const SimdVector right_temp = _mm_loadu_ps(&right[i * SIMD_LENGTH]);
      const SimdVector mono_temp =
          SIMD_MULTIPLY(inv_root_two_vec, SIMD_ADD(left_temp, right_temp));
      _mm_storeu_ps(&mono[i * SIMD_LENGTH], mono_temp);
    }
  }
#else
  for (size_t i = 0; i < GetNumChunks(length); ++i) {
    mono_vector[i] = SIMD_MULTIPLY(inv_root_two_vec,
                                   SIMD_ADD(left_vector[i], right_vector[i]));
  }
#endif  // SIMD_SSE
  const size_t leftover_samples = GetLeftoverSamples(length);
  // Downmix samples at the end that were missed by the SIMD chunking.
  DCHECK_GE(length, leftover_samples);
  for (size_t i = length - leftover_samples; i < length; ++i) {
    mono[i] = kInverseSqrtTwo * (left[i] + right[i]);
  }
}

#ifdef SIMD_NEON

void Int16FromFloat(size_t length, const float* input, int16_t* output) {
  DCHECK(input);
  DCHECK(output);

  const SimdVector* input_vector = reinterpret_cast<const SimdVector*>(input);
  int16x4_t* output_vector = reinterpret_cast<int16x4_t*>(output);

  // A temporary 32 bit integer vector is needed as we only have intrinsics to
  // convert from 32 bit floats to 32 bit ints. The 32 bit ints are then
  // narrowed to 16 bit ints with saturation.
  int32x4_t temporary_wide_vector;
  SimdVector temporary_float_vector;

  const SimdVector scaling_vector = SIMD_LOAD_ONE_FLOAT(kInt16FromFloat);

  for (size_t i = 0; i < GetNumChunks(length); ++i) {
    temporary_float_vector = SIMD_MULTIPLY(scaling_vector, input_vector[i]);
    temporary_wide_vector = vcvtq_s32_f32(temporary_float_vector);
    output_vector[i] = vqmovn_s32(temporary_wide_vector);
  }

  // The remainder.
  const size_t leftover_samples = GetLeftoverSamples(length);
  DCHECK_GE(length, leftover_samples);
  float temp_float;
  for (size_t i = length - leftover_samples; i < length; ++i) {
    temp_float = input[i] * kInt16FromFloat;
    temp_float = std::min(kInt16Max, std::max(kInt16Min, temp_float));
    output[i] = static_cast<int16_t>(temp_float);
  }
}
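
// Note on clamping in the NEON path above: an explicit min/max clamp is only
// needed in the scalar remainder loop because vqmovn_s32 performs a saturating
// narrow from 32-bit to 16-bit integers, so out-of-range samples in the
// vectorized chunks are clamped by the instruction itself.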

void FloatFromInt16(size_t length, const int16_t* input, float* output) {
  DCHECK(input);
  DCHECK(output);

  size_t leftover_samples = length;
  const bool input_aligned = IsAligned(input);
  const bool output_aligned = IsAligned(output);
  if (input_aligned || output_aligned) {
    const int16x4_t* input_vector = reinterpret_cast<const int16x4_t*>(input);
    SimdVector* output_vector = reinterpret_cast<SimdVector*>(output);

    int16x4_t temporary_narrow_vector;
    SimdVector temporary_float_vector;
    int32x4_t temporary_wide_vector;
    const SimdVector scaling_vector = SIMD_LOAD_ONE_FLOAT(kFloatFromInt16);

    if (input_aligned && output_aligned) {
      for (size_t i = 0; i < GetNumChunks(length); ++i) {
        temporary_wide_vector = vmovl_s16(input_vector[i]);
        output_vector[i] = vcvtq_f32_s32(temporary_wide_vector);
        output_vector[i] = SIMD_MULTIPLY(scaling_vector, output_vector[i]);
      }
    } else if (input_aligned) {
      for (size_t i = 0; i < GetNumChunks(length); ++i) {
        temporary_wide_vector = vmovl_s16(input_vector[i]);
        temporary_float_vector = vcvtq_f32_s32(temporary_wide_vector);
        temporary_float_vector =
            SIMD_MULTIPLY(scaling_vector, temporary_float_vector);
        vst1q_f32(&output[i * SIMD_LENGTH], temporary_float_vector);
      }
    } else {
      for (size_t i = 0; i < GetNumChunks(length); ++i) {
        temporary_narrow_vector = vld1_s16(&input[i * SIMD_LENGTH]);
        temporary_wide_vector = vmovl_s16(temporary_narrow_vector);
        output_vector[i] = vcvtq_f32_s32(temporary_wide_vector);
        output_vector[i] = SIMD_MULTIPLY(scaling_vector, output_vector[i]);
      }
    }
    leftover_samples = GetLeftoverSamples(length);
  }

  // The remainder.
  for (size_t i = length - leftover_samples; i < length; ++i) {
    output[i] = static_cast<float>(input[i]) * kFloatFromInt16;
  }
}

#elif (defined SIMD_SSE && !defined(_MSC_VER))

void Int16FromFloat(size_t length, const float* input, int16_t* output) {
  DCHECK(input);
  DCHECK(output);

  size_t leftover_samples = length;
  const bool input_aligned = IsAligned(input);
  const bool output_aligned = IsAligned(output);
  if (output_aligned) {
    const SimdVector* input_vector = reinterpret_cast<const SimdVector*>(input);
    __m64* output_vector = reinterpret_cast<__m64*>(output);

    const SimdVector scaling_vector = SIMD_LOAD_ONE_FLOAT(kInt16FromFloat);
    const SimdVector min_vector = SIMD_LOAD_ONE_FLOAT(kInt16Min);
    const SimdVector max_vector = SIMD_LOAD_ONE_FLOAT(kInt16Max);

    SimdVector temporary_float_vector;

    if (input_aligned) {
      for (size_t i = 0; i < GetNumChunks(length); ++i) {
        temporary_float_vector = SIMD_MULTIPLY(scaling_vector, input_vector[i]);
        temporary_float_vector = _mm_max_ps(temporary_float_vector, min_vector);
        temporary_float_vector = _mm_min_ps(temporary_float_vector, max_vector);
        output_vector[i] = _mm_cvtps_pi16(temporary_float_vector);
      }
    } else {
      for (size_t i = 0; i < GetNumChunks(length); ++i) {
        temporary_float_vector = _mm_loadu_ps(&input[i * SIMD_LENGTH]);
        temporary_float_vector =
            SIMD_MULTIPLY(scaling_vector, temporary_float_vector);
        temporary_float_vector = _mm_max_ps(temporary_float_vector, min_vector);
        temporary_float_vector = _mm_min_ps(temporary_float_vector, max_vector);
        output_vector[i] = _mm_cvtps_pi16(temporary_float_vector);
      }
    }
    // There is no easy way to simply store the 16 bit ints, so we don't have
    // an |input_aligned| only case.
    leftover_samples = GetLeftoverSamples(length);
  }

  // The remainder.
  float temp_float;
  for (size_t i = length - leftover_samples; i < length; ++i) {
    temp_float = input[i] * kInt16FromFloat;
    temp_float = std::min(kInt16Max, std::max(kInt16Min, temp_float));
    output[i] = static_cast<int16_t>(temp_float);
  }
}

void FloatFromInt16(size_t length, const int16_t* input, float* output) {
  DCHECK(input);
  DCHECK(output);

  size_t leftover_samples = length;
  const bool input_aligned = IsAligned(input);
  const bool output_aligned = IsAligned(output);
  if (input_aligned) {
    SimdVector* output_vector = reinterpret_cast<SimdVector*>(output);
    const __m64* input_vector = reinterpret_cast<const __m64*>(input);

    const SimdVector scaling_vector = SIMD_LOAD_ONE_FLOAT(kFloatFromInt16);

    if (output_aligned) {
      for (size_t i = 0; i < GetNumChunks(length); ++i) {
        output_vector[i] = _mm_cvtpi16_ps(input_vector[i]);
        output_vector[i] = SIMD_MULTIPLY(scaling_vector, output_vector[i]);
      }
    } else {
      SimdVector temporary_float_vector;
      for (size_t i = 0; i < GetNumChunks(length); ++i) {
        temporary_float_vector = _mm_cvtpi16_ps(input_vector[i]);
        temporary_float_vector =
            SIMD_MULTIPLY(scaling_vector, temporary_float_vector);
        _mm_storeu_ps(&output[i * SIMD_LENGTH], temporary_float_vector);
      }
    }
    // There is no easy way to simply load the 16 bit ints, so we don't have
    // an |output_aligned| only case.
    leftover_samples = GetLeftoverSamples(length);
  }

  // The remainder.
  for (size_t i = length - leftover_samples; i < length; ++i) {
    output[i] = static_cast<float>(input[i]) * kFloatFromInt16;
  }
}

#else  // SIMD disabled or Windows build.

void Int16FromFloat(size_t length, const float* input, int16_t* output) {
  DCHECK(input);
  DCHECK(output);

  float temp_float;
  for (size_t i = 0; i < length; ++i) {
    temp_float = input[i] * kInt16FromFloat;
    temp_float = std::min(kInt16Max, std::max(kInt16Min, temp_float));
    output[i] = static_cast<int16_t>(temp_float);
  }
}

void FloatFromInt16(size_t length, const int16_t* input, float* output) {
  DCHECK(input);
  DCHECK(output);

  for (size_t i = 0; i < length; ++i) {
    output[i] = static_cast<float>(input[i]) * kFloatFromInt16;
  }
}

#endif  // SIMD_NEON

void InterleaveStereo(size_t length, const int16_t* channel_0,
                      const int16_t* channel_1, int16_t* interleaved_buffer) {
  DCHECK(interleaved_buffer);
  DCHECK(channel_0);
  DCHECK(channel_1);

  size_t leftover_samples = length;
#ifdef SIMD_NEON
  if (IsAligned(interleaved_buffer) && IsAligned(channel_0) &&
      IsAligned(channel_1)) {
    const int16x8_t* channel_0_vec =
        reinterpret_cast<const int16x8_t*>(channel_0);
    const int16x8_t* channel_1_vec =
        reinterpret_cast<const int16x8_t*>(channel_1);

    const size_t num_chunks = length / kSixteenBitSimdLength;
    int16x8x2_t interleaved_pair;

    int16x8_t* interleaved_vec =
        reinterpret_cast<int16x8_t*>(interleaved_buffer);
    for (size_t i = 0, j = 0; i < num_chunks; ++i, j += 2) {
      interleaved_pair = vzipq_s16(channel_0_vec[i], channel_1_vec[i]);
      interleaved_vec[j] = interleaved_pair.val[0];
      interleaved_vec[j + 1] = interleaved_pair.val[1];
    }

    leftover_samples = length % kSixteenBitSimdLength;
  }
#endif  // SIMD_NEON
  for (size_t i = length - leftover_samples; i < length; ++i) {
    const size_t interleaved_index = kNumStereoChannels * i;
    interleaved_buffer[interleaved_index] = channel_0[i];
    interleaved_buffer[interleaved_index + 1] = channel_1[i];
  }
}
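
// Interleaving example for the overload above (illustrative values):
// channel_0 = {L0, L1, L2, ...} and channel_1 = {R0, R1, R2, ...} produce
// interleaved_buffer = {L0, R0, L1, R1, L2, R2, ...}, the packed stereo frame
// layout expected by most audio device APIs.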

void InterleaveStereo(size_t length, const float* channel_0,
                      const float* channel_1, float* interleaved_buffer) {
  DCHECK(interleaved_buffer);
  DCHECK(channel_0);
  DCHECK(channel_1);

  size_t leftover_samples = length;
#ifdef SIMD_NEON
  if (IsAligned(interleaved_buffer) && IsAligned(channel_0) &&
      IsAligned(channel_1)) {
    const SimdVector* channel_0_vec =
        reinterpret_cast<const SimdVector*>(channel_0);
    const SimdVector* channel_1_vec =
        reinterpret_cast<const SimdVector*>(channel_1);

    const size_t num_chunks = GetNumChunks(length);
    float32x4x2_t interleaved_pair;

    SimdVector* interleaved_vec =
        reinterpret_cast<SimdVector*>(interleaved_buffer);
    for (size_t i = 0, j = 0; i < num_chunks; ++i, j += 2) {
      interleaved_pair = vzipq_f32(channel_0_vec[i], channel_1_vec[i]);
      interleaved_vec[j] = interleaved_pair.val[0];
      interleaved_vec[j + 1] = interleaved_pair.val[1];
    }

    leftover_samples = GetLeftoverSamples(length);
  }
#endif  // SIMD_NEON
  for (size_t i = length - leftover_samples; i < length; ++i) {
    const size_t interleaved_index = kNumStereoChannels * i;
    interleaved_buffer[interleaved_index] = channel_0[i];
    interleaved_buffer[interleaved_index + 1] = channel_1[i];
  }
}

void InterleaveStereo(size_t length, const float* channel_0,
                      const float* channel_1, int16_t* interleaved_buffer) {
  DCHECK(interleaved_buffer);
  DCHECK(channel_0);
  DCHECK(channel_1);

  size_t leftover_samples = length;
#ifdef SIMD_NEON
  if (IsAligned(interleaved_buffer) && IsAligned(channel_0) &&
      IsAligned(channel_1)) {
    const SimdVector* channel_0_vec =
        reinterpret_cast<const SimdVector*>(channel_0);
    const SimdVector* channel_1_vec =
        reinterpret_cast<const SimdVector*>(channel_1);

    const size_t num_chunks = GetNumChunks(length);
    float32x4x2_t interleaved_pair;
    int32x4_t temporary_wide_vector;

    const SimdVector scaling_vector = SIMD_LOAD_ONE_FLOAT(kInt16FromFloat);
    const SimdVector min_vector = SIMD_LOAD_ONE_FLOAT(kInt16Min);
    const SimdVector max_vector = SIMD_LOAD_ONE_FLOAT(kInt16Max);

    int16x4_t* interleaved_vec =
        reinterpret_cast<int16x4_t*>(interleaved_buffer);
    for (size_t i = 0; i < num_chunks; ++i) {
      const size_t interleaved_index = kNumStereoChannels * i;
      interleaved_pair = vzipq_f32(channel_0_vec[i], channel_1_vec[i]);
      interleaved_pair.val[0] =
          SIMD_MULTIPLY(scaling_vector, interleaved_pair.val[0]);
      interleaved_pair.val[0] = vmaxq_f32(interleaved_pair.val[0], min_vector);
      interleaved_pair.val[0] = vminq_f32(interleaved_pair.val[0], max_vector);
      temporary_wide_vector = vcvtq_s32_f32(interleaved_pair.val[0]);
      interleaved_vec[interleaved_index] = vqmovn_s32(temporary_wide_vector);
      interleaved_pair.val[1] =
          SIMD_MULTIPLY(scaling_vector, interleaved_pair.val[1]);
      interleaved_pair.val[1] = vmaxq_f32(interleaved_pair.val[1], min_vector);
      interleaved_pair.val[1] = vminq_f32(interleaved_pair.val[1], max_vector);
      temporary_wide_vector = vcvtq_s32_f32(interleaved_pair.val[1]);
      interleaved_vec[interleaved_index + 1] =
          vqmovn_s32(temporary_wide_vector);
    }

    leftover_samples = GetLeftoverSamples(length);
  }
#endif  // SIMD_NEON
  for (size_t i = length - leftover_samples; i < length; ++i) {
    const size_t interleaved_index = kNumStereoChannels * i;
    interleaved_buffer[interleaved_index] = static_cast<int16_t>(std::max(
        kInt16Min, std::min(kInt16Max, kInt16FromFloat * channel_0[i])));
    interleaved_buffer[interleaved_index + 1] = static_cast<int16_t>(std::max(
        kInt16Min, std::min(kInt16Max, kInt16FromFloat * channel_1[i])));
  }
}

void DeinterleaveStereo(size_t length, const int16_t* interleaved_buffer,
                        int16_t* channel_0, int16_t* channel_1) {
  DCHECK(interleaved_buffer);
  DCHECK(channel_0);
  DCHECK(channel_1);

  size_t leftover_samples = length;
#ifdef SIMD_NEON
  if (IsAligned(interleaved_buffer) && IsAligned(channel_0) &&
      IsAligned(channel_1)) {
    const size_t num_chunks = length / kSixteenBitSimdLength;
    leftover_samples = length % kSixteenBitSimdLength;
    int16x8_t* channel_0_vec = reinterpret_cast<int16x8_t*>(channel_0);
    int16x8_t* channel_1_vec = reinterpret_cast<int16x8_t*>(channel_1);
    int16x8x2_t deinterleaved_pair;
    const int16x8_t* interleaved_vec =
        reinterpret_cast<const int16x8_t*>(interleaved_buffer);
    for (size_t chunk = 0; chunk < num_chunks; ++chunk) {
      const size_t interleaved_index = chunk * kNumStereoChannels;
      deinterleaved_pair = vuzpq_s16(interleaved_vec[interleaved_index],
                                     interleaved_vec[interleaved_index + 1]);
      channel_0_vec[chunk] = deinterleaved_pair.val[0];
      channel_1_vec[chunk] = deinterleaved_pair.val[1];
    }
  }
#endif  // SIMD_NEON
  for (size_t i = length - leftover_samples; i < length; ++i) {
    const size_t interleaved_index = kNumStereoChannels * i;
    channel_0[i] = interleaved_buffer[interleaved_index];
    channel_1[i] = interleaved_buffer[interleaved_index + 1];
  }
}

void DeinterleaveStereo(size_t length, const float* interleaved_buffer,
                        float* channel_0, float* channel_1) {
  DCHECK(interleaved_buffer);
  DCHECK(channel_0);
  DCHECK(channel_1);

  size_t leftover_samples = length;
#ifdef SIMD_NEON
  if (IsAligned(interleaved_buffer) && IsAligned(channel_0) &&
      IsAligned(channel_1)) {
    const size_t num_chunks = GetNumChunks(length);
    leftover_samples = GetLeftoverSamples(length);
    SimdVector* channel_0_vec = reinterpret_cast<SimdVector*>(channel_0);
    SimdVector* channel_1_vec = reinterpret_cast<SimdVector*>(channel_1);
    float32x4x2_t deinterleaved_pair;

    const SimdVector* interleaved_vec =
        reinterpret_cast<const SimdVector*>(interleaved_buffer);
    for (size_t chunk = 0; chunk < num_chunks; ++chunk) {
      const size_t interleaved_index = chunk * kNumStereoChannels;
      deinterleaved_pair = vuzpq_f32(interleaved_vec[interleaved_index],
                                     interleaved_vec[interleaved_index + 1]);
      channel_0_vec[chunk] = deinterleaved_pair.val[0];
      channel_1_vec[chunk] = deinterleaved_pair.val[1];
    }
  }
#endif  // SIMD_NEON
  for (size_t i = length - leftover_samples; i < length; ++i) {
    const size_t interleaved_index = kNumStereoChannels * i;
    channel_0[i] = interleaved_buffer[interleaved_index];
    channel_1[i] = interleaved_buffer[interleaved_index + 1];
  }
}

void DeinterleaveStereo(size_t length, const int16_t* interleaved_buffer,
                        float* channel_0, float* channel_1) {
  DCHECK(interleaved_buffer);
  DCHECK(channel_0);
  DCHECK(channel_1);

  size_t leftover_samples = length;
#ifdef SIMD_NEON
  if (IsAligned(interleaved_buffer) && IsAligned(channel_0) &&
      IsAligned(channel_1)) {
    const size_t num_chunks = GetNumChunks(length);
    leftover_samples = GetLeftoverSamples(length);
    SimdVector* channel_0_vec = reinterpret_cast<SimdVector*>(channel_0);
    SimdVector* channel_1_vec = reinterpret_cast<SimdVector*>(channel_1);
    int16x4x2_t deinterleaved_pair;
    int32x4_t temporary_wide;
    const SimdVector scaling_vector = SIMD_LOAD_ONE_FLOAT(kFloatFromInt16);

    const int16x4_t* interleaved_vec =
        reinterpret_cast<const int16x4_t*>(interleaved_buffer);
    for (size_t chunk = 0; chunk < num_chunks; ++chunk) {
      const size_t interleaved_index = chunk * kNumStereoChannels;
      deinterleaved_pair = vuzp_s16(interleaved_vec[interleaved_index],
                                    interleaved_vec[interleaved_index + 1]);
      temporary_wide = vmovl_s16(deinterleaved_pair.val[0]);
      channel_0_vec[chunk] = vcvtq_f32_s32(temporary_wide);
      channel_0_vec[chunk] =
          SIMD_MULTIPLY(scaling_vector, channel_0_vec[chunk]);
      temporary_wide = vmovl_s16(deinterleaved_pair.val[1]);
      channel_1_vec[chunk] = vcvtq_f32_s32(temporary_wide);
      channel_1_vec[chunk] =
          SIMD_MULTIPLY(scaling_vector, channel_1_vec[chunk]);
    }
  }
#endif  // SIMD_NEON
  for (size_t i = length - leftover_samples; i < length; ++i) {
    const size_t interleaved_index = kNumStereoChannels * i;
    channel_0[i] = static_cast<float>(interleaved_buffer[interleaved_index]) *
                   kFloatFromInt16;
    channel_1[i] =
        static_cast<float>(interleaved_buffer[interleaved_index + 1]) *
        kFloatFromInt16;
  }
}

void InterleaveQuad(size_t length, const int16_t* channel_0,
                    const int16_t* channel_1, const int16_t* channel_2,
                    const int16_t* channel_3, int16_t* workspace,
                    int16_t* interleaved_buffer) {
#ifdef SIMD_NEON
  DCHECK(IsAligned(workspace));
  const size_t double_length = length * 2;
  int16_t* workspace_half_point =
      workspace + FindNextAlignedArrayIndex(double_length, sizeof(int16_t),
                                            kMemoryAlignmentBytes);
  InterleaveStereo(length, channel_0, channel_2, workspace);
  InterleaveStereo(length, channel_1, channel_3, workspace_half_point);
  InterleaveStereo(double_length, workspace, workspace_half_point,
                   interleaved_buffer);
#else
  for (size_t i = 0; i < length; ++i) {
    const size_t interleaved_index = kNumFirstOrderAmbisonicChannels * i;
    interleaved_buffer[interleaved_index] = channel_0[i];
    interleaved_buffer[interleaved_index + 1] = channel_1[i];
    interleaved_buffer[interleaved_index + 2] = channel_2[i];
    interleaved_buffer[interleaved_index + 3] = channel_3[i];
  }
#endif  // SIMD_NEON
}
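
// How the NEON path of the InterleaveQuad overloads works (a restatement of
// the calls above): channels are interleaved pairwise in two passes. First
// (channel_0, channel_2) and (channel_1, channel_3) are zipped into two
// stereo-interleaved scratch buffers of 2 * length samples each, then those
// two buffers are zipped again, yielding the final c0, c1, c2, c3 order per
// frame. The second scratch buffer starts at the next 16-byte aligned index
// after the first, so |workspace| must be aligned and large enough to hold
// both halves.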

void InterleaveQuad(size_t length, const float* channel_0,
                    const float* channel_1, const float* channel_2,
                    const float* channel_3, float* workspace,
                    float* interleaved_buffer) {
#ifdef SIMD_NEON
  DCHECK(IsAligned(workspace));
  const size_t double_length = length * 2;
  float* workspace_half_point =
      workspace + FindNextAlignedArrayIndex(double_length, sizeof(float),
                                            kMemoryAlignmentBytes);
  DCHECK(IsAligned(workspace_half_point));
  InterleaveStereo(length, channel_0, channel_2, workspace);
  InterleaveStereo(length, channel_1, channel_3, workspace_half_point);
  InterleaveStereo(double_length, workspace, workspace_half_point,
                   interleaved_buffer);
#else
  for (size_t i = 0; i < length; ++i) {
    const size_t interleaved_index = kNumFirstOrderAmbisonicChannels * i;
    interleaved_buffer[interleaved_index] = channel_0[i];
    interleaved_buffer[interleaved_index + 1] = channel_1[i];
    interleaved_buffer[interleaved_index + 2] = channel_2[i];
    interleaved_buffer[interleaved_index + 3] = channel_3[i];
  }
#endif  // SIMD_NEON
}

void DeinterleaveQuad(size_t length, const int16_t* interleaved_buffer,
                      int16_t* workspace, int16_t* channel_0,
                      int16_t* channel_1, int16_t* channel_2,
                      int16_t* channel_3) {
#ifdef SIMD_NEON
  DCHECK(IsAligned(workspace));
  const size_t double_length = length * 2;
  int16_t* workspace_half_point =
      workspace + FindNextAlignedArrayIndex(double_length, sizeof(int16_t),
                                            kMemoryAlignmentBytes);
  DCHECK(IsAligned(workspace_half_point));
  DeinterleaveStereo(double_length, interleaved_buffer, workspace,
                     workspace_half_point);
  DeinterleaveStereo(length, workspace, channel_0, channel_2);
  DeinterleaveStereo(length, workspace_half_point, channel_1, channel_3);
#else
  for (size_t i = 0; i < length; ++i) {
    const size_t interleaved_index = kNumFirstOrderAmbisonicChannels * i;
    channel_0[i] = interleaved_buffer[interleaved_index];
    channel_1[i] = interleaved_buffer[interleaved_index + 1];
    channel_2[i] = interleaved_buffer[interleaved_index + 2];
    channel_3[i] = interleaved_buffer[interleaved_index + 3];
  }
#endif  // SIMD_NEON
}

void DeinterleaveQuad(size_t length, const float* interleaved_buffer,
                      float* workspace, float* channel_0, float* channel_1,
                      float* channel_2, float* channel_3) {
#ifdef SIMD_NEON
  DCHECK(IsAligned(workspace));
  const size_t double_length = length * 2;
  float* workspace_half_point =
      workspace + FindNextAlignedArrayIndex(double_length, sizeof(float),
                                            kMemoryAlignmentBytes);
  DCHECK(IsAligned(workspace_half_point));
  DeinterleaveStereo(double_length, interleaved_buffer, workspace,
                     workspace_half_point);
  DeinterleaveStereo(length, workspace, channel_0, channel_2);
  DeinterleaveStereo(length, workspace_half_point, channel_1, channel_3);
#else
  for (size_t i = 0; i < length; ++i) {
    const size_t interleaved_index = kNumFirstOrderAmbisonicChannels * i;
    channel_0[i] = interleaved_buffer[interleaved_index];
    channel_1[i] = interleaved_buffer[interleaved_index + 1];
    channel_2[i] = interleaved_buffer[interleaved_index + 2];
    channel_3[i] = interleaved_buffer[interleaved_index + 3];
  }
#endif  // SIMD_NEON
}

}  // namespace vraudio
