1 | /* graphene-simd4f.h: SIMD wrappers and operations |
2 | * |
3 | * SPDX-License-Identifier: MIT |
4 | * |
5 | * Copyright 2014 Emmanuele Bassi |
6 | * |
7 | * Permission is hereby granted, free of charge, to any person obtaining a copy |
8 | * of this software and associated documentation files (the "Software"), to deal |
9 | * in the Software without restriction, including without limitation the rights |
10 | * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell |
11 | * copies of the Software, and to permit persons to whom the Software is |
12 | * furnished to do so, subject to the following conditions: |
13 | * |
14 | * The above copyright notice and this permission notice shall be included in |
15 | * all copies or substantial portions of the Software. |
16 | * |
17 | * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR |
18 | * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, |
19 | * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE |
20 | * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER |
21 | * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, |
22 | * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN |
23 | * THE SOFTWARE. |
24 | */ |
25 | |
26 | #pragma once |
27 | |
28 | #if !defined(GRAPHENE_H_INSIDE) && !defined(GRAPHENE_COMPILATION) |
29 | #error "Only graphene.h can be included directly." |
30 | #endif |
31 | |
32 | /* needed for memcpy() */ |
33 | #include <string.h> |
34 | #include <math.h> |
35 | #include <float.h> |
36 | |
37 | #include "graphene-config.h" |
38 | #include "graphene-macros.h" |
39 | #include "graphene-version-macros.h" |
40 | |
41 | GRAPHENE_BEGIN_DECLS |
42 | |
43 | /* Platform specific operations */ |
44 | |
/* Constructors: build a vector from four scalars (x, y, z, w order), from
 * all zeros, or from the first 4, 3 or 2 floats of the array @v. The inline
 * implementations in this header set the lanes not read from @v to 0.
 */
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_init (float x,
                                        float y,
                                        float z,
                                        float w);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_init_zero (void);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_init_4f (const float *v);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_init_3f (const float *v);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_init_2f (const float *v);

/* Copy the first 4, 3 or 2 lanes of @s into the float array @v. */
GRAPHENE_AVAILABLE_IN_1_0
void graphene_simd4f_dup_4f (const graphene_simd4f_t s,
                             float *v);
GRAPHENE_AVAILABLE_IN_1_0
void graphene_simd4f_dup_3f (const graphene_simd4f_t s,
                             float *v);
GRAPHENE_AVAILABLE_IN_1_0
void graphene_simd4f_dup_2f (const graphene_simd4f_t s,
                             float *v);
68 | |
/* Read one lane of @s. The _x/_y/_z/_w getters are implemented in this
 * header as graphene_simd4f_get() with index 0, 1, 2 and 3 respectively.
 */
GRAPHENE_AVAILABLE_IN_1_2
float graphene_simd4f_get (const graphene_simd4f_t s,
                           unsigned int i);
GRAPHENE_AVAILABLE_IN_1_0
float graphene_simd4f_get_x (const graphene_simd4f_t s);
GRAPHENE_AVAILABLE_IN_1_0
float graphene_simd4f_get_y (const graphene_simd4f_t s);
GRAPHENE_AVAILABLE_IN_1_0
float graphene_simd4f_get_z (const graphene_simd4f_t s);
GRAPHENE_AVAILABLE_IN_1_0
float graphene_simd4f_get_w (const graphene_simd4f_t s);

/* Broadcast: fill all four lanes with the scalar @v, or with the x, y, z
 * or w lane of @s.
 */
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_splat (float v);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_splat_x (const graphene_simd4f_t s);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_splat_y (const graphene_simd4f_t s);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_splat_z (const graphene_simd4f_t s);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_splat_w (const graphene_simd4f_t s);
91 | |
/* Lane-wise arithmetic on two vectors. */
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_add (const graphene_simd4f_t a,
                                       const graphene_simd4f_t b);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_sub (const graphene_simd4f_t a,
                                       const graphene_simd4f_t b);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_mul (const graphene_simd4f_t a,
                                       const graphene_simd4f_t b);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_div (const graphene_simd4f_t a,
                                       const graphene_simd4f_t b);

/* Lane-wise square root, reciprocal (1 / s) and reciprocal square root.
 * The SSE implementations in this header refine the hardware estimate
 * with one extra multiply/subtract step, so results are approximations.
 */
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_sqrt (const graphene_simd4f_t s);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_reciprocal (const graphene_simd4f_t s);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_rsqrt (const graphene_simd4f_t s);

/* Products over the x, y, z lanes only. dot3 returns the dot product
 * replicated in every lane; dot3_scalar returns it as a plain float.
 */
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_cross3 (const graphene_simd4f_t a,
                                          const graphene_simd4f_t b);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_dot3 (const graphene_simd4f_t a,
                                        const graphene_simd4f_t b);
GRAPHENE_AVAILABLE_IN_1_4
float graphene_simd4f_dot3_scalar (const graphene_simd4f_t a,
                                   const graphene_simd4f_t b);

/* Lane-wise minimum and maximum of two vectors. */
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_min (const graphene_simd4f_t a,
                                       const graphene_simd4f_t b);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_max (const graphene_simd4f_t a,
                                       const graphene_simd4f_t b);
128 | |
/* Lane rotations: the suffix names the source lanes that end up in the
 * x, y, z, w positions of the result (e.g. wxyz -> (s.w, s.x, s.y, s.z)).
 */
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_shuffle_wxyz (const graphene_simd4f_t s);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_shuffle_zwxy (const graphene_simd4f_t s);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_shuffle_yzwx (const graphene_simd4f_t s);

/* Copy of @s with the w lane (zero_w) or the z and w lanes (zero_zw)
 * set to 0.
 */
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_zero_w (const graphene_simd4f_t s);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_zero_zw (const graphene_simd4f_t s);

/* As implemented by the SSE code in this header:
 *   merge_high -> (a.z, a.w, b.z, b.w)
 *   merge_low  -> (a.x, a.y, b.x, b.y)
 *   merge_w    -> (s.x, s.y, s.z, v)
 */
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_merge_high (const graphene_simd4f_t a,
                                              const graphene_simd4f_t b);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_merge_low (const graphene_simd4f_t a,
                                             const graphene_simd4f_t b);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_merge_w (const graphene_simd4f_t s,
                                           float v);

/* Flip the sign bit of alternating lanes: _0101 negates y and w,
 * _1010 negates x and z.
 */
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_flip_sign_0101 (const graphene_simd4f_t s);
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_flip_sign_1010 (const graphene_simd4f_t s);
155 | |
/* Whole-vector comparisons returning a single boolean. In the SSE
 * implementations in this header: cmp_eq is true when no lane differs,
 * cmp_neq is true when at least one lane differs, and cmp_lt/le/ge/gt are
 * true only when the relation holds in all four lanes.
 */
GRAPHENE_AVAILABLE_IN_1_0
bool graphene_simd4f_cmp_eq (const graphene_simd4f_t a,
                             const graphene_simd4f_t b);
GRAPHENE_AVAILABLE_IN_1_0
bool graphene_simd4f_cmp_neq (const graphene_simd4f_t a,
                              const graphene_simd4f_t b);
GRAPHENE_AVAILABLE_IN_1_2
bool graphene_simd4f_cmp_lt (const graphene_simd4f_t a,
                             const graphene_simd4f_t b);
GRAPHENE_AVAILABLE_IN_1_2
bool graphene_simd4f_cmp_le (const graphene_simd4f_t a,
                             const graphene_simd4f_t b);
GRAPHENE_AVAILABLE_IN_1_2
bool graphene_simd4f_cmp_ge (const graphene_simd4f_t a,
                             const graphene_simd4f_t b);
GRAPHENE_AVAILABLE_IN_1_2
bool graphene_simd4f_cmp_gt (const graphene_simd4f_t a,
                             const graphene_simd4f_t b);

/* Negate every lane of @s (implemented by flipping the four sign bits). */
GRAPHENE_AVAILABLE_IN_1_0
graphene_simd4f_t graphene_simd4f_neg (const graphene_simd4f_t s);
176 | |
177 | #if !defined(__GI_SCANNER__) && defined(GRAPHENE_USE_SSE) |
178 | |
179 | /* SSE2 implementation of SIMD 4f */ |
180 | |
/* Union used to read a single lane without memcpy(): the vector is stored
 * through `s` and the individual lanes are read back through `f`.
 */
typedef union {
  graphene_simd4f_t s;
  float f[4];
} graphene_simd4f_union_t;
186 | |
187 | /* On GCC, we use __extension__ macros to avoid a static inline */ |
188 | # if defined(__GNUC__) |
189 | |
190 | /* Use GCC statement __extension__ to inline all these functions */ |
191 | |
/* Build a vector from four scalars, in x, y, z, w lane order. */
#  define graphene_simd4f_init(x,y,z,w) \
  (__extension__ ({ \
    (graphene_simd4f_t) { (x), (y), (z), (w) }; \
  }))

/* Vector with all four lanes set to 0. */
#  define graphene_simd4f_init_zero() \
  (__extension__ ({ \
    (graphene_simd4f_t) _mm_setzero_ps(); \
  }))

/* Load four consecutive floats from @v using an unaligned load. */
#  define graphene_simd4f_init_4f(v) \
  (__extension__ ({ \
    (graphene_simd4f_t) _mm_loadu_ps (v); \
  }))

/* Load x, y, z from @v and set w to 0.
 * @v is bound to a local first so it is evaluated exactly once; an
 * argument with side effects (e.g. `p++`) behaves as in a function call.
 */
#  define graphene_simd4f_init_3f(v) \
  (__extension__ ({ \
    const float *__init3_src = (v); \
    (graphene_simd4f_t) { __init3_src[0], __init3_src[1], __init3_src[2], 0.f }; \
  }))

/* Load x, y from @v and set z, w to 0; @v is evaluated exactly once. */
#  define graphene_simd4f_init_2f(v) \
  (__extension__ ({ \
    const float *__init2_src = (v); \
    (graphene_simd4f_t) { __init2_src[0], __init2_src[1], 0.f, 0.f }; \
  }))
216 | |
/* Store all four lanes of @s into @v using an unaligned store. */
#  define graphene_simd4f_dup_4f(s,v) \
  (__extension__ ({ \
    _mm_storeu_ps ((v), (s)); \
  }))

/* Copy the x, y, z lanes of @s into @v.
 * @s is copied into a local before its address is taken, so any vector
 * expression (not only an lvalue) can be passed, as with a real function.
 */
#  define graphene_simd4f_dup_3f(s,v) \
  (__extension__ ({ \
    const graphene_simd4f_t __dup3_src = (s); \
    memcpy ((v), &__dup3_src, sizeof (float) * 3); \
  }))

/* Copy the x, y lanes of @s into @v; accepts any vector expression. */
#  define graphene_simd4f_dup_2f(s,v) \
  (__extension__ ({ \
    const graphene_simd4f_t __dup2_src = (s); \
    memcpy ((v), &__dup2_src, sizeof (float) * 2); \
  }))
231 | |
232 | # define graphene_simd4f_get(s,i) \ |
233 | (__extension__ ({ \ |
234 | graphene_simd4f_union_t __u = { (s) }; \ |
235 | (float) __u.f[(i)]; \ |
236 | })) |
237 | |
238 | # define graphene_simd4f_get_x(s) graphene_simd4f_get (s, 0) |
239 | # define graphene_simd4f_get_y(s) graphene_simd4f_get (s, 1) |
240 | # define graphene_simd4f_get_z(s) graphene_simd4f_get (s, 2) |
241 | # define graphene_simd4f_get_w(s) graphene_simd4f_get (s, 3) |
242 | |
/* Fill all four lanes with the scalar @v. */
#  define graphene_simd4f_splat(v) \
  (__extension__ ({ (graphene_simd4f_t) _mm_set1_ps ((v)); }))

/* Fill all four lanes with lane 0 (x) of @s. */
#  define graphene_simd4f_splat_x(s) \
  (__extension__ ({ (graphene_simd4f_t) _mm_shuffle_ps ((s), (s), _MM_SHUFFLE (0, 0, 0, 0)); }))

/* Fill all four lanes with lane 1 (y) of @s. */
#  define graphene_simd4f_splat_y(s) \
  (__extension__ ({ (graphene_simd4f_t) _mm_shuffle_ps ((s), (s), _MM_SHUFFLE (1, 1, 1, 1)); }))

/* Fill all four lanes with lane 2 (z) of @s. */
#  define graphene_simd4f_splat_z(s) \
  (__extension__ ({ (graphene_simd4f_t) _mm_shuffle_ps ((s), (s), _MM_SHUFFLE (2, 2, 2, 2)); }))

/* Fill all four lanes with lane 3 (w) of @s. */
#  define graphene_simd4f_splat_w(s) \
  (__extension__ ({ (graphene_simd4f_t) _mm_shuffle_ps ((s), (s), _MM_SHUFFLE (3, 3, 3, 3)); }))
267 | |
/* Lane-wise a + b. */
#  define graphene_simd4f_add(a,b) \
  (__extension__ ({ (graphene_simd4f_t) _mm_add_ps ((a), (b)); }))

/* Lane-wise a - b. */
#  define graphene_simd4f_sub(a,b) \
  (__extension__ ({ (graphene_simd4f_t) _mm_sub_ps ((a), (b)); }))

/* Lane-wise a * b. */
#  define graphene_simd4f_mul(a,b) \
  (__extension__ ({ (graphene_simd4f_t) _mm_mul_ps ((a), (b)); }))

/* Lane-wise a / b. */
#  define graphene_simd4f_div(a,b) \
  (__extension__ ({ (graphene_simd4f_t) _mm_div_ps ((a), (b)); }))

/* Lane-wise square root of @s. */
#  define graphene_simd4f_sqrt(s) \
  (__extension__ ({ (graphene_simd4f_t) _mm_sqrt_ps ((s)); }))
292 | |
/* Lane-wise approximate 1 / v.
 *
 * Starts from the _mm_rcp_ps() estimate x0 and refines it with one step
 * x1 = x0 * (2 - v * x0). @v is bound to a local first so it is evaluated
 * exactly once (it used to be expanded twice).
 *
 * NOTE(review): a lane equal to 0 presumably produces NaN here (0 * inf in
 * the refinement step) while the GCC vector fallback returns 0 - confirm
 * which behaviour callers rely on.
 */
#  define graphene_simd4f_reciprocal(v) \
  (__extension__ ({ \
    const graphene_simd4f_t __rcp_v = (v); \
    const graphene_simd4f_t __rcp_two = graphene_simd4f_init (2.0f, 2.0f, 2.0f, 2.0f); \
    const graphene_simd4f_t __rcp_est = _mm_rcp_ps (__rcp_v); \
    graphene_simd4f_mul (__rcp_est, \
                         graphene_simd4f_sub (__rcp_two, \
                                              graphene_simd4f_mul (__rcp_v, __rcp_est))); \
  }))

/* Lane-wise approximate 1 / sqrt (v).
 *
 * Starts from the _mm_rsqrt_ps() estimate x0 and refines it with one step
 * x1 = (x0 * 0.5) * (3 - x0 * (v * x0)). @v is evaluated exactly once.
 */
#  define graphene_simd4f_rsqrt(v) \
  (__extension__ ({ \
    const graphene_simd4f_t __rsqrt_v = (v); \
    const graphene_simd4f_t __rsqrt_half = graphene_simd4f_init (0.5f, 0.5f, 0.5f, 0.5f); \
    const graphene_simd4f_t __rsqrt_three = graphene_simd4f_init (3.0f, 3.0f, 3.0f, 3.0f); \
    const graphene_simd4f_t __rsqrt_est = _mm_rsqrt_ps (__rsqrt_v); \
    graphene_simd4f_mul (graphene_simd4f_mul (__rsqrt_est, __rsqrt_half), \
                         graphene_simd4f_sub (__rsqrt_three, \
                                              graphene_simd4f_mul (__rsqrt_est, \
                                                                   graphene_simd4f_mul (__rsqrt_v, __rsqrt_est)))); \
  }))
309 | |
/* Cross product of the x, y, z lanes of @a and @b.
 *
 * Computed as a.yzx * b.zxy - a.zxy * b.yzx; the w lane of each shuffle
 * keeps the original w, so the w lane of the result is a.w * b.w - a.w * b.w.
 * @a and @b are bound to locals first so each is evaluated exactly once
 * (they used to be expanded twice).
 */
#  define graphene_simd4f_cross3(a,b) \
  (__extension__ ({ \
    const graphene_simd4f_t __cross_a = (a); \
    const graphene_simd4f_t __cross_b = (b); \
    const graphene_simd4f_t __a_yzx = _mm_shuffle_ps (__cross_a, __cross_a, _MM_SHUFFLE (3, 0, 2, 1)); \
    const graphene_simd4f_t __a_zxy = _mm_shuffle_ps (__cross_a, __cross_a, _MM_SHUFFLE (3, 1, 0, 2)); \
    const graphene_simd4f_t __b_yzx = _mm_shuffle_ps (__cross_b, __cross_b, _MM_SHUFFLE (3, 0, 2, 1)); \
    const graphene_simd4f_t __b_zxy = _mm_shuffle_ps (__cross_b, __cross_b, _MM_SHUFFLE (3, 1, 0, 2)); \
    (graphene_simd4f_t) _mm_sub_ps (_mm_mul_ps (__a_yzx, __b_zxy), _mm_mul_ps (__a_zxy, __b_yzx)); \
  }))
318 | |
/* Dot product of the x, y, z lanes of @a and @b, replicated in all four
 * lanes of the result.
 */
#  if defined(GRAPHENE_USE_SSE4_1)
/* SSE4.1: a single _mm_dp_ps(); mask 0x7f multiplies lanes 0-2 only and
 * writes the sum to every lane.
 */
#   define graphene_simd4f_dot3(a,b) \
  (__extension__ ({ \
    (graphene_simd4f_t) _mm_dp_ps ((a), (b), 0x7f); \
  }))
#  else
/* Fallback: multiply lane-wise, clear the w product with a bit mask, then
 * add horizontally (high pair onto low pair, then lane 1 onto lane 0) and
 * broadcast lane 0 to every lane.
 */
#   define graphene_simd4f_dot3(a,b) \
  (__extension__ ({ \
    const unsigned int __mask_bits[] GRAPHENE_ALIGN16 = { 0xffffffff, 0xffffffff, 0xffffffff, 0 }; \
    const graphene_simd4f_t __mask = _mm_load_ps ((const float *) __mask_bits); \
    const graphene_simd4f_t __m = _mm_mul_ps ((a), (b)); \
    const graphene_simd4f_t __s0 = _mm_and_ps (__m, __mask); \
    const graphene_simd4f_t __s1 = _mm_add_ps (__s0, _mm_movehl_ps (__s0, __s0)); \
    const graphene_simd4f_t __s2 = _mm_add_ss (__s1, _mm_shuffle_ps (__s1, __s1, 1)); \
    (graphene_simd4f_t) _mm_shuffle_ps (__s2, __s2, 0); \
  }))
#  endif

/* Same dot product, returned as a float read from lane 0 of
 * graphene_simd4f_dot3().
 */
#  define graphene_simd4f_dot3_scalar(a,b) \
  (__extension__ ({ \
    float __res; \
    _mm_store_ss (&__res, graphene_simd4f_dot3 (a, b)); \
    __res; \
  }))
343 | |
/* Lane-wise minimum of @a and @b. */
#  define graphene_simd4f_min(a,b) \
  (__extension__ ({ (graphene_simd4f_t) _mm_min_ps ((a), (b)); }))

/* Lane-wise maximum of @a and @b. */
#  define graphene_simd4f_max(a,b) \
  (__extension__ ({ (graphene_simd4f_t) _mm_max_ps ((a), (b)); }))

/* Rotate lanes: result is (s.w, s.x, s.y, s.z). */
#  define graphene_simd4f_shuffle_wxyz(s) \
  (__extension__ ({ (graphene_simd4f_t) _mm_shuffle_ps ((s), (s), _MM_SHUFFLE (2, 1, 0, 3)); }))

/* Swap halves: result is (s.z, s.w, s.x, s.y). */
#  define graphene_simd4f_shuffle_zwxy(s) \
  (__extension__ ({ (graphene_simd4f_t) _mm_shuffle_ps ((s), (s), _MM_SHUFFLE (1, 0, 3, 2)); }))

/* Rotate lanes: result is (s.y, s.z, s.w, s.x). */
#  define graphene_simd4f_shuffle_yzwx(s) \
  (__extension__ ({ (graphene_simd4f_t) _mm_shuffle_ps ((s), (s), _MM_SHUFFLE (0, 3, 2, 1)); }))
368 | |
/* Copy of @v with the w lane set to 0: (v.x, v.y, v.z, 0).
 * @v is bound to a local so it is evaluated exactly once (it used to be
 * expanded twice).
 */
#  define graphene_simd4f_zero_w(v) \
  (__extension__ ({ \
    const graphene_simd4f_t __zw_v = (v); \
    const graphene_simd4f_t __zw_hi = _mm_unpackhi_ps (__zw_v, _mm_setzero_ps ()); \
    (graphene_simd4f_t) _mm_movelh_ps (__zw_v, __zw_hi); \
  }))

/* Copy of @v with the z and w lanes set to 0: (v.x, v.y, 0, 0). */
#  define graphene_simd4f_zero_zw(v) \
  (__extension__ ({ \
    (graphene_simd4f_t) _mm_movelh_ps ((v), _mm_setzero_ps ()); \
  }))

/* Copy of @s with the w lane replaced by the scalar @v: (s.x, s.y, s.z, v).
 * @s is evaluated exactly once (it used to be expanded twice).
 */
#  define graphene_simd4f_merge_w(s,v) \
  (__extension__ ({ \
    const graphene_simd4f_t __mw_s = (s); \
    const graphene_simd4f_t __mw_hi = _mm_unpackhi_ps (__mw_s, _mm_set1_ps ((v))); \
    (graphene_simd4f_t) _mm_movelh_ps (__mw_s, __mw_hi); \
  }))

/* High halves of both inputs: (a.z, a.w, b.z, b.w). */
#  define graphene_simd4f_merge_high(a,b) \
  (__extension__ ({ \
    (graphene_simd4f_t) _mm_movehl_ps ((b), (a)); \
  }))

/* Low halves of both inputs: (a.x, a.y, b.x, b.y). */
#  define graphene_simd4f_merge_low(a,b) \
  (__extension__ ({ \
    (graphene_simd4f_t) _mm_movelh_ps ((a), (b)); \
  }))
395 | |
396 | typedef GRAPHENE_ALIGN16 union { |
397 | unsigned int ui[4]; |
398 | float f[4]; |
399 | } graphene_simd4f_uif_t; |
400 | |
401 | # define graphene_simd4f_flip_sign_0101(v) \ |
402 | (__extension__ ({ \ |
403 | const graphene_simd4f_uif_t __pnpn = { { \ |
404 | 0x00000000, \ |
405 | 0x80000000, \ |
406 | 0x00000000, \ |
407 | 0x80000000 \ |
408 | } }; \ |
409 | (graphene_simd4f_t) _mm_xor_ps ((v), _mm_load_ps (__pnpn.f)); \ |
410 | })) |
411 | |
412 | # define graphene_simd4f_flip_sign_1010(v) \ |
413 | (__extension__ ({ \ |
414 | const graphene_simd4f_uif_t __npnp = { { \ |
415 | 0x80000000, \ |
416 | 0x00000000, \ |
417 | 0x80000000, \ |
418 | 0x00000000, \ |
419 | } }; \ |
420 | (graphene_simd4f_t) _mm_xor_ps ((v), _mm_load_ps (__npnp.f)); \ |
421 | })) |
422 | |
/* Whole-vector comparisons. Each one runs the lane-wise SSE comparison and
 * collapses the resulting mask to one bit per byte with
 * _mm_movemask_epi8(): 0 means no lane matched, 0xffff means all did.
 */

/* True when no lane of @a differs from the same lane of @b. */
#  define graphene_simd4f_cmp_eq(a,b) \
  (__extension__ ({ \
    (bool) (_mm_movemask_epi8 ((__m128i) _mm_cmpneq_ps ((a), (b))) == 0); \
  }))

/* True when at least one lane of @a differs from @b. */
#  define graphene_simd4f_cmp_neq(a,b) \
  (__extension__ ({ \
    (bool) (_mm_movemask_epi8 ((__m128i) _mm_cmpneq_ps ((a), (b))) != 0); \
  }))

/* True when a < b holds in all four lanes. */
#  define graphene_simd4f_cmp_lt(a,b) \
  (__extension__ ({ \
    (bool) (_mm_movemask_epi8 ((__m128i) _mm_cmplt_ps ((a), (b))) == 0xffff); \
  }))

/* True when a <= b holds in all four lanes. */
#  define graphene_simd4f_cmp_le(a,b) \
  (__extension__ ({ \
    (bool) (_mm_movemask_epi8 ((__m128i) _mm_cmple_ps ((a), (b))) == 0xffff); \
  }))

/* True when a >= b holds in all four lanes. */
#  define graphene_simd4f_cmp_ge(a,b) \
  (__extension__ ({ \
    (bool) (_mm_movemask_epi8 ((__m128i) _mm_cmpge_ps ((a), (b))) == 0xffff); \
  }))

/* True when a > b holds in all four lanes. */
#  define graphene_simd4f_cmp_gt(a,b) \
  (__extension__ ({ \
    (bool) (_mm_movemask_epi8 ((__m128i) _mm_cmpgt_ps ((a), (b))) == 0xffff); \
  }))
458 | |
/* Negate every lane of @s by XOR-ing all four sign bits. */
#  define graphene_simd4f_neg(s) \
  (__extension__ ({ \
    const graphene_simd4f_uif_t __sign_all = { \
      .ui = { 0x80000000, 0x80000000, 0x80000000, 0x80000000 } \
    }; \
    (graphene_simd4f_t) _mm_xor_ps ((s), _mm_load_ps (__sign_all.f)); \
  }))
469 | |
470 | /* On MSVC, we use static inlines */ |
471 | # elif defined (_MSC_VER) /* Visual Studio SSE intrinsics */ |
472 | |
473 | /* Use static inline to inline all these functions */ |
474 | |
475 | #define graphene_simd4f_init(x,y,z,w) _simd4f_init(x,y,z,w) |
476 | |
477 | static inline graphene_simd4f_t |
478 | _simd4f_init (float x, float y, float z, float w) |
479 | { |
480 | graphene_simd4f_t __s = { x, y, z, w }; |
481 | return __s; |
482 | } |
483 | |
484 | #define graphene_simd4f_init_zero() \ |
485 | _mm_setzero_ps() |
486 | |
487 | #define graphene_simd4f_init_4f(v) \ |
488 | _mm_loadu_ps(v) |
489 | |
490 | #define graphene_simd4f_init_3f(v) \ |
491 | graphene_simd4f_init (v[0], v[1], v[2], 0.f) |
492 | |
493 | #define graphene_simd4f_init_2f(v) \ |
494 | graphene_simd4f_init (v[0], v[1], 0.f, 0.f) |
495 | |
496 | #define graphene_simd4f_dup_4f(s,v) \ |
497 | _mm_storeu_ps (v, s) |
498 | |
499 | #define graphene_simd4f_dup_3f(s,v) \ |
500 | memcpy (v, &s, sizeof (float) * 3) |
501 | |
502 | #define graphene_simd4f_dup_2f(s,v) \ |
503 | memcpy (v, &s, sizeof (float) * 2) |
504 | |
505 | #define graphene_simd4f_get(s,i) _simd4f_get_xyzw(s, i) |
506 | #define graphene_simd4f_get_x(s) _simd4f_get_xyzw(s, 0) |
507 | #define graphene_simd4f_get_y(s) _simd4f_get_xyzw(s, 1) |
508 | #define graphene_simd4f_get_z(s) _simd4f_get_xyzw(s, 2) |
509 | #define graphene_simd4f_get_w(s) _simd4f_get_xyzw(s, 3) |
510 | |
511 | static inline float |
512 | _simd4f_get_xyzw (graphene_simd4f_t s, int mode) |
513 | { |
514 | /* mode: get_x=0 |
515 | get_y=1 |
516 | get_z=2 |
517 | get_w=3 */ |
518 | |
519 | graphene_simd4f_union_t u; |
520 | u.s = s; |
521 | return u.f[mode]; |
522 | } |
523 | |
/* Broadcasts: fill all four lanes with a scalar, or with one lane of @s. */
#define graphene_simd4f_splat(v)        _mm_set1_ps ((v))
#define graphene_simd4f_splat_x(s)      _mm_shuffle_ps ((s), (s), _MM_SHUFFLE (0, 0, 0, 0))
#define graphene_simd4f_splat_y(s)      _mm_shuffle_ps ((s), (s), _MM_SHUFFLE (1, 1, 1, 1))
#define graphene_simd4f_splat_z(s)      _mm_shuffle_ps ((s), (s), _MM_SHUFFLE (2, 2, 2, 2))
#define graphene_simd4f_splat_w(s)      _mm_shuffle_ps ((s), (s), _MM_SHUFFLE (3, 3, 3, 3))

/* Lane-wise arithmetic, mapped directly onto the SSE intrinsics. */
#define graphene_simd4f_add(a,b)        _mm_add_ps ((a), (b))
#define graphene_simd4f_sub(a,b)        _mm_sub_ps ((a), (b))
#define graphene_simd4f_mul(a,b)        _mm_mul_ps ((a), (b))
#define graphene_simd4f_div(a,b)        _mm_div_ps ((a), (b))

/* Lane-wise square root. */
#define graphene_simd4f_sqrt(s)         _mm_sqrt_ps ((s))
553 | |
554 | #define graphene_simd4f_reciprocal(v) _simd4f_reciprocal(v) |
555 | |
556 | static inline graphene_simd4f_t |
557 | _simd4f_reciprocal(const graphene_simd4f_t v) |
558 | { |
559 | const graphene_simd4f_t __two = graphene_simd4f_init (2.0f, 2.0f, 2.0f, 2.0f); |
560 | graphene_simd4f_t __s = _mm_rcp_ps (v); |
561 | return graphene_simd4f_mul (__s, |
562 | graphene_simd4f_sub (__two, |
563 | graphene_simd4f_mul (v, __s))); |
564 | } |
565 | |
566 | #define graphene_simd4f_rsqrt(v) _simd4f_rsqrt(v) |
567 | |
568 | static inline graphene_simd4f_t |
569 | _simd4f_rsqrt(const graphene_simd4f_t v) |
570 | { |
571 | const graphene_simd4f_t __half = graphene_simd4f_init (0.5f, 0.5f, 0.5f, 0.5f); |
572 | const graphene_simd4f_t __three = graphene_simd4f_init (3.0f, 3.0f, 3.0f, 3.0f); |
573 | graphene_simd4f_t __s = _mm_rsqrt_ps (v); |
574 | return graphene_simd4f_mul (graphene_simd4f_mul (__s, __half), |
575 | graphene_simd4f_sub (__three, |
576 | graphene_simd4f_mul (__s, graphene_simd4f_mul (v, __s)))); |
577 | } |
578 | |
579 | #define graphene_simd4f_cross3(a,b) \ |
580 | _simd4f_cross3(a,b) |
581 | |
582 | static inline graphene_simd4f_t |
583 | _simd4f_cross3 (const graphene_simd4f_t a, |
584 | const graphene_simd4f_t b) |
585 | { |
586 | const graphene_simd4f_t __a_yzx = _mm_shuffle_ps (a, a, _MM_SHUFFLE (3, 0, 2, 1)); |
587 | const graphene_simd4f_t __a_zxy = _mm_shuffle_ps (a, a, _MM_SHUFFLE (3, 1, 0, 2)); |
588 | const graphene_simd4f_t __b_yzx = _mm_shuffle_ps (b, b, _MM_SHUFFLE (3, 0, 2, 1)); |
589 | const graphene_simd4f_t __b_zxy = _mm_shuffle_ps (b, b, _MM_SHUFFLE (3, 1, 0, 2)); |
590 | |
591 | return _mm_sub_ps (_mm_mul_ps (__a_yzx, __b_zxy), _mm_mul_ps (__a_zxy, __b_yzx)); |
592 | } |
593 | |
#define graphene_simd4f_dot3(a,b) \
  _simd4f_dot3(a,b)

/* Dot product of the x, y, z lanes of @a and @b, replicated in all four
 * lanes of the returned vector.
 */
static inline graphene_simd4f_t
_simd4f_dot3 (const graphene_simd4f_t a,
              const graphene_simd4f_t b)
{
#if defined(GRAPHENE_USE_SSE4_1)
  /* Mask 0x7f: multiply lanes 0-2 only, write the sum to every lane. */
  return _mm_dp_ps (a, b, 0x7f);
#else
  /* Bit mask that keeps the x, y, z products and clears the w product. */
  GRAPHENE_ALIGN16 const unsigned int __mask_bits[] = { 0xffffffff, 0xffffffff, 0xffffffff, 0 };
  const graphene_simd4f_t __mask = _mm_load_ps ((const float *) __mask_bits);
  const graphene_simd4f_t __m = _mm_mul_ps ((a), (b));
  const graphene_simd4f_t __s0 = _mm_and_ps (__m, __mask);
  /* Horizontal add: fold the high pair onto the low pair, then lane 1
   * onto lane 0. */
  const graphene_simd4f_t __s1 = _mm_add_ps (__s0, _mm_movehl_ps (__s0, __s0));
  const graphene_simd4f_t __s2 = _mm_add_ss (__s1, _mm_shuffle_ps (__s1, __s1, 1));

  /* Broadcast lane 0 (the sum) to all four lanes. */
  return _mm_shuffle_ps (__s2, __s2, 0);
#endif
}

#define graphene_simd4f_dot3_scalar(a,b) \
  _simd4f_dot3_scalar(a,b)

/* Same dot product, returned as a float read from lane 0. */
static inline float
_simd4f_dot3_scalar (const graphene_simd4f_t a,
                     const graphene_simd4f_t b)
{
  float __res;
  _mm_store_ss (&__res, graphene_simd4f_dot3 (a, b));
  return __res;
}
626 | |
627 | #define graphene_simd4f_min(a,b) \ |
628 | _mm_min_ps (a, b) |
629 | |
630 | #define graphene_simd4f_max(a,b) \ |
631 | _mm_max_ps (a, b) |
632 | |
633 | |
634 | #define graphene_simd4f_shuffle_wxyz(v) \ |
635 | _mm_shuffle_ps (v, v, _MM_SHUFFLE (2, 1, 0, 3)) |
636 | |
637 | #define graphene_simd4f_shuffle_zwxy(v) \ |
638 | _mm_shuffle_ps (v, v, _MM_SHUFFLE (1, 0, 3, 2)) |
639 | |
640 | #define graphene_simd4f_shuffle_yzwx(v) \ |
641 | _mm_shuffle_ps (v, v, _MM_SHUFFLE (0, 3, 2, 1)) |
642 | |
643 | #define graphene_simd4f_zero_w(v) \ |
644 | _mm_movelh_ps (v, _mm_unpackhi_ps (v, _mm_setzero_ps ())) |
645 | |
646 | #define graphene_simd4f_zero_zw(v) \ |
647 | _mm_movelh_ps (v, _mm_setzero_ps ()) |
648 | |
649 | #define graphene_simd4f_merge_w(s,v) \ |
650 | _mm_movelh_ps (s, _mm_unpackhi_ps (s, _mm_set1_ps (v))) |
651 | |
652 | #define graphene_simd4f_merge_high(a,b) \ |
653 | _mm_movehl_ps (b, a) |
654 | |
655 | #define graphene_simd4f_merge_low(a,b) \ |
656 | _mm_movelh_ps (a, b) |
657 | |
658 | typedef GRAPHENE_ALIGN16 union { |
659 | unsigned int ui[4]; |
660 | float f[4]; |
661 | } graphene_simd4f_uif_t; |
662 | |
663 | #define graphene_simd4f_flip_sign_0101(v) _simd4f_flip_sign_0101(v) |
664 | |
665 | static inline graphene_simd4f_t |
666 | _simd4f_flip_sign_0101 (const graphene_simd4f_t v) |
667 | { |
668 | const graphene_simd4f_uif_t __pnpn = { { |
669 | 0x00000000, |
670 | 0x80000000, |
671 | 0x00000000, |
672 | 0x80000000 |
673 | } }; |
674 | |
675 | return _mm_xor_ps (v, _mm_load_ps (__pnpn.f)); |
676 | } |
677 | |
678 | #define graphene_simd4f_flip_sign_1010(v) _simd4f_flip_sign_1010(v) |
679 | |
680 | static inline graphene_simd4f_t |
681 | _simd4f_flip_sign_1010(const graphene_simd4f_t v) |
682 | { |
683 | const graphene_simd4f_uif_t __npnp = { { |
684 | 0x80000000, |
685 | 0x00000000, |
686 | 0x80000000, |
687 | 0x00000000, |
688 | } }; |
689 | |
690 | return _mm_xor_ps (v, _mm_load_ps (__npnp.f)); |
691 | } |
692 | |
693 | #define graphene_simd4f_cmp_eq(a,b) _simd4f_cmp_eq(a,b) |
694 | |
695 | static inline bool |
696 | _simd4f_cmp_eq (const graphene_simd4f_t a, |
697 | const graphene_simd4f_t b) |
698 | { |
699 | __m128i __res = _mm_castps_si128 (_mm_cmpneq_ps (a, b)); |
700 | return (_mm_movemask_epi8 (__res) == 0); |
701 | } |
702 | |
703 | #define graphene_simd4f_cmp_neq(a,b) _simd4f_cmp_neq(a,b) |
704 | |
705 | static inline bool |
706 | _simd4f_cmp_neq (const graphene_simd4f_t a, |
707 | const graphene_simd4f_t b) |
708 | { |
709 | __m128i __res = _mm_castps_si128 (_mm_cmpneq_ps (a, b)); |
710 | return (_mm_movemask_epi8 (__res) != 0); |
711 | } |
712 | |
713 | #define graphene_simd4f_cmp_lt(a,b) _simd4f_cmp_lt(a,b) |
714 | |
715 | static inline bool |
716 | _simd4f_cmp_lt (const graphene_simd4f_t a, |
717 | const graphene_simd4f_t b) |
718 | { |
719 | __m128i __res = _mm_castps_si128 (_mm_cmplt_ps (a, b)); |
720 | return (_mm_movemask_epi8 (__res) == 0xffff); |
721 | } |
722 | |
723 | #define graphene_simd4f_cmp_le(a,b) _simd4f_cmp_le(a,b) |
724 | |
725 | static inline bool |
726 | _simd4f_cmp_le (const graphene_simd4f_t a, |
727 | const graphene_simd4f_t b) |
728 | { |
729 | __m128i __res = _mm_castps_si128 (_mm_cmple_ps (a, b)); |
730 | return (_mm_movemask_epi8 (__res) == 0xffff); |
731 | } |
732 | |
733 | #define graphene_simd4f_cmp_ge(a,b) _simd4f_cmp_ge(a,b) |
734 | |
735 | static inline bool |
736 | _simd4f_cmp_ge (const graphene_simd4f_t a, |
737 | const graphene_simd4f_t b) |
738 | { |
739 | __m128i __res = _mm_castps_si128 (_mm_cmpge_ps (a, b)); |
740 | return (_mm_movemask_epi8 (__res) == 0xffff); |
741 | } |
742 | |
743 | #define graphene_simd4f_cmp_gt(a,b) _simd4f_cmp_gt(a,b) |
744 | |
745 | static inline bool |
746 | _simd4f_cmp_gt (const graphene_simd4f_t a, |
747 | const graphene_simd4f_t b) |
748 | { |
749 | __m128i __res = _mm_castps_si128 (_mm_cmpgt_ps (a, b)); |
750 | return (_mm_movemask_epi8 (__res) == 0xffff); |
751 | } |
752 | |
753 | #define graphene_simd4f_neg(s) _simd4f_neg(s) |
754 | |
755 | static inline graphene_simd4f_t |
756 | _simd4f_neg (const graphene_simd4f_t s) |
757 | { |
758 | const graphene_simd4f_uif_t __mask = { { |
759 | 0x80000000, |
760 | 0x80000000, |
761 | 0x80000000, |
762 | 0x80000000, |
763 | } }; |
764 | |
765 | return _mm_xor_ps (s, _mm_load_ps (__mask.f)); |
766 | } |
767 | |
768 | #else /* SSE intrinsics-not GCC or Visual Studio */ |
769 | |
770 | # error "Need GCC-compatible or Visual Studio compiler for SSE extensions." |
771 | |
772 | /* Use static inline to inline all these functions */ |
773 | |
774 | # endif /* !__GNUC__ && !_MSC_VER */ |
775 | |
776 | #elif !defined(__GI_SCANNER__) && defined(GRAPHENE_USE_GCC) |
777 | |
778 | /* GCC vector intrinsic implementation of SIMD 4f */ |
779 | |
/* Four-lane integer vector (16 bytes); the shuffle macros in this section
 * use it as the lane-index mask passed to __builtin_shuffle().
 */
typedef int graphene_simd4i_t __attribute__((vector_size (16)));
781 | |
/* Build a vector from four scalars, in x, y, z, w lane order. */
# define graphene_simd4f_init(x,y,z,w) \
  (__extension__ ({ \
    (graphene_simd4f_t) { (x), (y), (z), (w) }; \
  }))

/* Vector with all four lanes set to 0. */
# define graphene_simd4f_init_zero() \
  (__extension__ ({ \
    (graphene_simd4f_t) { 0.f, 0.f, 0.f, 0.f }; \
  }))

/* The array constructors bind @v to a local pointer first, so the argument
 * is evaluated exactly once; an argument with side effects (e.g. `p++`)
 * behaves as in a function call instead of being expanded once per lane.
 */

/* Load four consecutive floats from @v. */
# define graphene_simd4f_init_4f(v) \
  (__extension__ ({ \
    const float *__init4_src = (v); \
    (graphene_simd4f_t) { __init4_src[0], __init4_src[1], __init4_src[2], __init4_src[3] }; \
  }))

/* Load x, y, z from @v and set w to 0. */
# define graphene_simd4f_init_3f(v) \
  (__extension__ ({ \
    const float *__init3_src = (v); \
    (graphene_simd4f_t) { __init3_src[0], __init3_src[1], __init3_src[2], 0.f }; \
  }))

/* Load x, y from @v and set z, w to 0. */
# define graphene_simd4f_init_2f(v) \
  (__extension__ ({ \
    const float *__init2_src = (v); \
    (graphene_simd4f_t) { __init2_src[0], __init2_src[1], 0.f, 0.f }; \
  }))
806 | |
/* Copy the first 4, 3 or 2 lanes of @s into the float array @v.
 *
 * @s is copied into a local before its address is taken, so any vector
 * expression (not only an lvalue) can be passed, as with a real function.
 */
# define graphene_simd4f_dup_4f(s,v) \
  (__extension__ ({ \
    const graphene_simd4f_t __dup4_src = (s); \
    memcpy ((v), &__dup4_src, sizeof (float) * 4); \
  }))

# define graphene_simd4f_dup_3f(s,v) \
  (__extension__ ({ \
    const graphene_simd4f_t __dup3_src = (s); \
    memcpy ((v), &__dup3_src, sizeof (float) * 3); \
  }))

# define graphene_simd4f_dup_2f(s,v) \
  (__extension__ ({ \
    const graphene_simd4f_t __dup2_src = (s); \
    memcpy ((v), &__dup2_src, sizeof (float) * 2); \
  }))
821 | |
/* Read lane @i of @s using GCC vector subscripting. */
# define graphene_simd4f_get(s,i) \
  (__extension__ ({ \
    (float) (s)[(i)]; \
  }))

/* Named accessors: x, y, z, w are lanes 0, 1, 2, 3. */
# define graphene_simd4f_get_x(s)       graphene_simd4f_get ((s), 0)
# define graphene_simd4f_get_y(s)       graphene_simd4f_get ((s), 1)
# define graphene_simd4f_get_z(s)       graphene_simd4f_get ((s), 2)
# define graphene_simd4f_get_w(s)       graphene_simd4f_get ((s), 3)
827 | |
/* Fill all four lanes with the scalar @v.
 * @v is stored in a local first so it is evaluated exactly once; it used
 * to be expanded once per lane, so an argument with side effects could
 * produce four different lane values.
 */
# define graphene_simd4f_splat(v) \
  (__extension__ ({ \
    const float __splat_val = (v); \
    (graphene_simd4f_t) { __splat_val, __splat_val, __splat_val, __splat_val }; \
  }))

/* Fill all four lanes with lane 0 (x) of @v. */
# define graphene_simd4f_splat_x(v) \
  (__extension__ ({ \
    const float __splat_val = (float) (v)[0]; \
    (graphene_simd4f_t) { __splat_val, __splat_val, __splat_val, __splat_val }; \
  }))

/* Fill all four lanes with lane 1 (y) of @v. */
# define graphene_simd4f_splat_y(v) \
  (__extension__ ({ \
    const float __splat_val = (float) (v)[1]; \
    (graphene_simd4f_t) { __splat_val, __splat_val, __splat_val, __splat_val }; \
  }))

/* Fill all four lanes with lane 2 (z) of @v. */
# define graphene_simd4f_splat_z(v) \
  (__extension__ ({ \
    const float __splat_val = (float) (v)[2]; \
    (graphene_simd4f_t) { __splat_val, __splat_val, __splat_val, __splat_val }; \
  }))

/* Fill all four lanes with lane 3 (w) of @v. */
# define graphene_simd4f_splat_w(v) \
  (__extension__ ({ \
    const float __splat_val = (float) (v)[3]; \
    (graphene_simd4f_t) { __splat_val, __splat_val, __splat_val, __splat_val }; \
  }))
856 | |
/* The three macros below bind @v to a local first so the argument is
 * evaluated exactly once; it used to be expanded up to eight times.
 */

/* Lane-wise 1 / v; a lane equal to 0 yields 0 instead of dividing. */
# define graphene_simd4f_reciprocal(v) \
  (__extension__ ({ \
    const graphene_simd4f_t __rcp_v = (v); \
    (graphene_simd4f_t) { \
      __rcp_v[0] != 0.f ? 1.f / __rcp_v[0] : 0.f, \
      __rcp_v[1] != 0.f ? 1.f / __rcp_v[1] : 0.f, \
      __rcp_v[2] != 0.f ? 1.f / __rcp_v[2] : 0.f, \
      __rcp_v[3] != 0.f ? 1.f / __rcp_v[3] : 0.f, \
    }; \
  }))

/* Lane-wise square root, computed with sqrtf(). */
# define graphene_simd4f_sqrt(v) \
  (__extension__ ({ \
    const graphene_simd4f_t __sqrt_v = (v); \
    (graphene_simd4f_t) { \
      sqrtf (__sqrt_v[0]), \
      sqrtf (__sqrt_v[1]), \
      sqrtf (__sqrt_v[2]), \
      sqrtf (__sqrt_v[3]), \
    }; \
  }))

/* Lane-wise 1 / sqrt (v); a lane equal to 0 yields 0 instead of dividing. */
# define graphene_simd4f_rsqrt(v) \
  (__extension__ ({ \
    const graphene_simd4f_t __rsqrt_v = (v); \
    (graphene_simd4f_t) { \
      __rsqrt_v[0] != 0.f ? 1.f / sqrtf (__rsqrt_v[0]) : 0.f, \
      __rsqrt_v[1] != 0.f ? 1.f / sqrtf (__rsqrt_v[1]) : 0.f, \
      __rsqrt_v[2] != 0.f ? 1.f / sqrtf (__rsqrt_v[2]) : 0.f, \
      __rsqrt_v[3] != 0.f ? 1.f / sqrtf (__rsqrt_v[3]) : 0.f, \
    }; \
  }))
886 | |
/* Lane-wise arithmetic using the GCC vector operators. */
# define graphene_simd4f_add(a,b) \
  (__extension__ ({ \
    (graphene_simd4f_t) ((a) + (b)); \
  }))

# define graphene_simd4f_sub(a,b) \
  (__extension__ ({ \
    (graphene_simd4f_t) ((a) - (b)); \
  }))

# define graphene_simd4f_mul(a,b) \
  (__extension__ ({ \
    (graphene_simd4f_t) ((a) * (b)); \
  }))

# define graphene_simd4f_div(a,b) \
  (__extension__ ({ \
    (graphene_simd4f_t) ((a) / (b)); \
  }))
891 | |
/* 3-component cross product of @a and @b.
 *
 * Only lanes 0..2 of the operands are read; the W lane of the result
 * is set to 0.  Both operands are evaluated once.
 */
# define graphene_simd4f_cross3(a,b) \
  (__extension__ ({ \
    const graphene_simd4f_t __a = (a); \
    const graphene_simd4f_t __b = (b); \
    const float __cross_x = __a[1] * __b[2] - __a[2] * __b[1]; \
    const float __cross_y = __a[2] * __b[0] - __a[0] * __b[2]; \
    const float __cross_z = __a[0] * __b[1] - __a[1] * __b[0]; \
    graphene_simd4f_init (__cross_x, __cross_y, __cross_z, 0.f); \
  }))
901 | |
/* 3-component dot product of @a and @b (lane 3 is not read),
 * replicated into all four lanes of the result.
 */
# define graphene_simd4f_dot3(a,b) \
  (__extension__ ({ \
    const graphene_simd4f_t __a = (a); \
    const graphene_simd4f_t __b = (b); \
    const float __dot3_sum = __a[0] * __b[0] + __a[1] * __b[1] + __a[2] * __b[2]; \
    graphene_simd4f_init (__dot3_sum, __dot3_sum, __dot3_sum, __dot3_sum); \
  }))

/* Same product as graphene_simd4f_dot3(), returned as the scalar read
 * back from the X lane.
 */
# define graphene_simd4f_dot3_scalar(a,b) \
  (__extension__ ({ \
    const graphene_simd4f_t __dot3_vec = graphene_simd4f_dot3 (a, b); \
    graphene_simd4f_get_x (__dot3_vec); \
  }))
914 | |
/* Lane-wise minimum: each lane takes @a's value when it compares
 * strictly less than @b's, otherwise @b's.
 */
# define graphene_simd4f_min(a,b) \
  (__extension__ ({ \
    const graphene_simd4f_t __a = (a); \
    const graphene_simd4f_t __b = (b); \
    const float __min_x = __a[0] < __b[0] ? __a[0] : __b[0]; \
    const float __min_y = __a[1] < __b[1] ? __a[1] : __b[1]; \
    const float __min_z = __a[2] < __b[2] ? __a[2] : __b[2]; \
    const float __min_w = __a[3] < __b[3] ? __a[3] : __b[3]; \
    graphene_simd4f_init (__min_x, __min_y, __min_z, __min_w); \
  }))

/* Lane-wise maximum: each lane takes @a's value when it compares
 * strictly greater than @b's, otherwise @b's.
 */
# define graphene_simd4f_max(a,b) \
  (__extension__ ({ \
    const graphene_simd4f_t __a = (a); \
    const graphene_simd4f_t __b = (b); \
    const float __max_x = __a[0] > __b[0] ? __a[0] : __b[0]; \
    const float __max_y = __a[1] > __b[1] ? __a[1] : __b[1]; \
    const float __max_z = __a[2] > __b[2] ? __a[2] : __b[2]; \
    const float __max_w = __a[3] > __b[3] ? __a[3] : __b[3]; \
    graphene_simd4f_init (__max_x, __max_y, __max_z, __max_w); \
  }))
934 | |
/* Lane permutations of a single vector, done with __builtin_shuffle().
 * Each selector entry is the index of the source lane to copy.
 */

/* Result lanes are (w, x, y, z) of @v. */
# define graphene_simd4f_shuffle_wxyz(v) \
  (__extension__ ({ \
    const graphene_simd4i_t __sel_wxyz = { 3, 0, 1, 2 }; \
    (graphene_simd4f_t) __builtin_shuffle ((v), __sel_wxyz); \
  }))

/* Result lanes are (z, w, x, y) of @v. */
# define graphene_simd4f_shuffle_zwxy(v) \
  (__extension__ ({ \
    const graphene_simd4i_t __sel_zwxy = { 2, 3, 0, 1 }; \
    (graphene_simd4f_t) __builtin_shuffle ((v), __sel_zwxy); \
  }))

/* Result lanes are (y, z, w, x) of @v. */
# define graphene_simd4f_shuffle_yzwx(v) \
  (__extension__ ({ \
    const graphene_simd4i_t __sel_yzwx = { 1, 2, 3, 0 }; \
    (graphene_simd4f_t) __builtin_shuffle ((v), __sel_yzwx); \
  }))
952 | |
/* Two-vector shuffles: selector indices 0..3 pick a lane of the first
 * operand, indices 4..7 pick a lane of the second operand.
 */

/* Copy of @v with the W lane replaced by lane 0 of a zero vector. */
# define graphene_simd4f_zero_w(v) \
  (__extension__ ({ \
    const graphene_simd4i_t __sel_xyz0 = { 0, 1, 2, 4 }; \
    (graphene_simd4f_t) __builtin_shuffle ((v), graphene_simd4f_init_zero (), __sel_xyz0); \
  }))

/* Copy of @v with the Z and W lanes replaced by lane 0 of a zero vector. */
# define graphene_simd4f_zero_zw(v) \
  (__extension__ ({ \
    const graphene_simd4i_t __sel_xy00 = { 0, 1, 4, 4 }; \
    (graphene_simd4f_t) __builtin_shuffle ((v), graphene_simd4f_init_zero (), __sel_xy00); \
  }))

/* Copy of @s whose W lane is taken from a vector built by splatting
 * the scalar @v.
 */
# define graphene_simd4f_merge_w(s,v) \
  (__extension__ ({ \
    const graphene_simd4i_t __sel_xyzv = { 0, 1, 2, 4 }; \
    (graphene_simd4f_t) __builtin_shuffle ((s), graphene_simd4f_splat ((v)), __sel_xyzv); \
  }))
970 | |
/* Result lanes are (a.z, a.w, b.z, b.w): selector indices 2 and 3 pick
 * from @a, indices 6 and 7 pick lanes 2 and 3 of @b.
 */
# define graphene_simd4f_merge_high(a,b) \
  (__extension__ ({ \
    const graphene_simd4i_t __sel_high = { 2, 3, 6, 7 }; \
    (graphene_simd4f_t) __builtin_shuffle ((a), (b), __sel_high); \
  }))

/* Result lanes are (a.x, a.y, b.x, b.y): selector indices 0 and 1 pick
 * from @a, indices 4 and 5 pick lanes 0 and 1 of @b.
 */
# define graphene_simd4f_merge_low(a,b) \
  (__extension__ ({ \
    const graphene_simd4i_t __sel_low = { 0, 1, 4, 5 }; \
    (graphene_simd4f_t) __builtin_shuffle ((a), (b), __sel_low); \
  }))
982 | |
/* Negate the Y and W lanes of @v; X and Z are passed through. */
# define graphene_simd4f_flip_sign_0101(v) \
  (__extension__ ({ \
    const graphene_simd4f_t __v = (v); \
    const float __neg_y = -__v[1]; \
    const float __neg_w = -__v[3]; \
    graphene_simd4f_init (__v[0], __neg_y, __v[2], __neg_w); \
  }))

/* Negate the X and Z lanes of @v; Y and W are passed through. */
# define graphene_simd4f_flip_sign_1010(v) \
  (__extension__ ({ \
    const graphene_simd4f_t __v = (v); \
    const float __neg_x = -__v[0]; \
    const float __neg_z = -__v[2]; \
    graphene_simd4f_init (__neg_x, __v[1], __neg_z, __v[3]); \
  }))
994 | |
/* Vector comparisons reduced to a single boolean.
 *
 * The vector comparison operator produces one integer per lane that is
 * non-zero when the relation holds for that lane.  Each macro below is
 * true only when the relation holds for all four lanes, i.e. when no
 * lane of the comparison result is zero.
 */

/* True when every lane of @a equals the matching lane of @b. */
# define graphene_simd4f_cmp_eq(a,b) \
  (__extension__ ({ \
    const graphene_simd4i_t __eq_lanes = (a) == (b); \
    (bool) !(__eq_lanes[0] == 0 || \
             __eq_lanes[1] == 0 || \
             __eq_lanes[2] == 0 || \
             __eq_lanes[3] == 0); \
  }))

/* Logical negation of graphene_simd4f_cmp_eq(): true when at least one
 * lane differs.
 */
# define graphene_simd4f_cmp_neq(a,b) (!graphene_simd4f_cmp_eq (a,b))

/* True when every lane of @a is less than the matching lane of @b. */
# define graphene_simd4f_cmp_lt(a,b) \
  (__extension__ ({ \
    const graphene_simd4i_t __lt_lanes = (a) < (b); \
    (bool) !(__lt_lanes[0] == 0 || \
             __lt_lanes[1] == 0 || \
             __lt_lanes[2] == 0 || \
             __lt_lanes[3] == 0); \
  }))

/* True when every lane of @a is less than or equal to the matching
 * lane of @b.
 */
# define graphene_simd4f_cmp_le(a,b) \
  (__extension__ ({ \
    const graphene_simd4i_t __le_lanes = (a) <= (b); \
    (bool) !(__le_lanes[0] == 0 || \
             __le_lanes[1] == 0 || \
             __le_lanes[2] == 0 || \
             __le_lanes[3] == 0); \
  }))

/* True when every lane of @a is greater than or equal to the matching
 * lane of @b.
 */
# define graphene_simd4f_cmp_ge(a,b) \
  (__extension__ ({ \
    const graphene_simd4i_t __ge_lanes = (a) >= (b); \
    (bool) !(__ge_lanes[0] == 0 || \
             __ge_lanes[1] == 0 || \
             __ge_lanes[2] == 0 || \
             __ge_lanes[3] == 0); \
  }))

/* True when every lane of @a is greater than the matching lane of @b. */
# define graphene_simd4f_cmp_gt(a,b) \
  (__extension__ ({ \
    const graphene_simd4i_t __gt_lanes = (a) > (b); \
    (bool) !(__gt_lanes[0] == 0 || \
             __gt_lanes[1] == 0 || \
             __gt_lanes[2] == 0 || \
             __gt_lanes[3] == 0); \
  }))
1041 | |
/* Negate every lane of @s by multiplying it with a vector of -1.f. */
# define graphene_simd4f_neg(s) \
  (__extension__ ({ \
    const graphene_simd4f_t __neg_in = (s); \
    graphene_simd4f_mul (__neg_in, graphene_simd4f_splat (-1.f)); \
  }))
1048 | |
1049 | #elif !defined(__GI_SCANNER__) && defined(GRAPHENE_USE_ARM_NEON) |
1050 | |
1051 | /* ARM Neon implementation of SIMD4f */ |
1052 | |
/* Union type used for single lane reading without memcpy.
 *
 * The vector member is declared first, so a positional brace
 * initializer fills @s; individual lanes are then read back through @f.
 */
typedef union {
  graphene_simd4f_t s; /* the whole four-lane vector */
  float f[4];          /* the same storage viewed as four floats */
} graphene_simd4f_union_t;

/* NEON has optimised 2-lanes vectors we can use */
typedef float32x2_t graphene_simd2f_t;
1061 | |
1062 | #ifdef __GNUC__ |
/* Build a vector from four scalars: the values are written to a
 * temporary float32_t[4] array which is then loaded with vld1q_f32().
 *
 * NOTE: the temporary is named __v and is in scope while the arguments
 * are expanded, so an argument expression must not refer to a caller
 * variable that is also called __v.
 */
# define graphene_simd4f_init(x,y,z,w) \
  (__extension__ ({ \
    const float32_t __v[4] = { (x), (y), (z), (w) }; \
    (graphene_simd4f_t) vld1q_f32 (__v); \
  }))

/* Vector with all four lanes set to 0.f. */
# define graphene_simd4f_init_zero() \
  (__extension__ ({ \
    (graphene_simd4f_t) vdupq_n_f32 (0.f); \
  }))

/* Load four consecutive floats starting at @v; the pointer is cast to
 * `const float32_t *` before the load.
 */
# define graphene_simd4f_init_4f(v) \
  (__extension__ ({ \
    const float32_t *__v32 = (const float32_t *) (v); \
    (graphene_simd4f_t) vld1q_f32 (__v32); \
  }))
1079 | |
/* Build a vector from the first three floats at @v; the W lane is 0.f.
 *
 * @v is parenthesised before it is indexed, so that a pointer
 * expression such as `p + 1` indexes the intended elements instead of
 * being parsed as `p + 1[0]`.  Note that @v is still expanded three
 * times, so it should not have side effects.
 */
# define graphene_simd4f_init_3f(v) \
  (__extension__ ({ \
    graphene_simd4f_init ((v)[0], (v)[1], (v)[2], 0.f); \
  }))
1084 | |
/* Load two consecutive floats from @v into the X and Y lanes; the Z and
 * W lanes are filled with 0.f.
 */
# define graphene_simd4f_init_2f(v) \
  (__extension__ ({ \
    const float32_t *__v32 = (const float32_t *) (v); \
    const graphene_simd2f_t __xy_pair = vld1_f32 (__v32); \
    (graphene_simd4f_t) vcombine_f32 (__xy_pair, vdup_n_f32 (0.f)); \
  }))
1093 | |
/* Store all four lanes of @s to the float array at @v. */
# define graphene_simd4f_dup_4f(s,v) \
  (__extension__ ({ vst1q_f32 ((float32_t *) (v), (s)); }))

/* Store lanes 0, 1 and 2 of @s to v[0], v[1] and v[2], one lane at a
 * time.  @s is expanded once per stored lane.
 */
# define graphene_simd4f_dup_3f(s,v) \
  (__extension__ ({ \
    float *__v = (v); \
    vst1q_lane_f32 (__v + 0, (s), 0); \
    vst1q_lane_f32 (__v + 1, (s), 1); \
    vst1q_lane_f32 (__v + 2, (s), 2); \
  }))

/* Store the low half of @s (lanes 0 and 1) to the float array at @v. */
# define graphene_simd4f_dup_2f(s,v) \
  (__extension__ ({ \
    const graphene_simd2f_t __xy_pair = vget_low_f32 ((s)); \
    vst1_f32 ((float32_t *) (v), __xy_pair); \
  }))
1112 | |
/* Read lane @i of @s as a float (via vgetq_lane_f32()). */
# define graphene_simd4f_get(s,i) \
  (__extension__ ({ (float) vgetq_lane_f32 ((s), (i)); }))

/* Vector with all four lanes set to the scalar @v. */
# define graphene_simd4f_splat(v) \
  (__extension__ ({ (graphene_simd4f_t) vdupq_n_f32 ((v)); }))

/* Broadcast the X lane of @s into all four lanes. */
# define graphene_simd4f_splat_x(s) \
  (__extension__ ({ \
    const float __splat_x_lane = graphene_simd4f_get_x ((s)); \
    graphene_simd4f_splat (__splat_x_lane); \
  }))

/* Broadcast the Y lane of @s into all four lanes. */
# define graphene_simd4f_splat_y(s) \
  (__extension__ ({ \
    const float __splat_y_lane = graphene_simd4f_get_y ((s)); \
    graphene_simd4f_splat (__splat_y_lane); \
  }))

/* Broadcast the Z lane of @s into all four lanes. */
# define graphene_simd4f_splat_z(s) \
  (__extension__ ({ \
    const float __splat_z_lane = graphene_simd4f_get_z ((s)); \
    graphene_simd4f_splat (__splat_z_lane); \
  }))

/* Broadcast the W lane of @s into all four lanes. */
# define graphene_simd4f_splat_w(s) \
  (__extension__ ({ \
    const float __splat_w_lane = graphene_simd4f_get_w ((s)); \
    graphene_simd4f_splat (__splat_w_lane); \
  }))
1142 | |
/* Per-lane reciprocal of @s.
 *
 * Starts from the vrecpeq_f32() estimate and refines it twice, each
 * time multiplying the current estimate by vrecpsq_f32 (estimate, s).
 *
 * @s is bound to a local first so that it is evaluated exactly once,
 * although it is needed by the estimate and by both refinement steps.
 */
# define graphene_simd4f_reciprocal(s) \
  (__extension__ ({ \
    const graphene_simd4f_t __recip_s = (s); \
    graphene_simd4f_t __est = vrecpeq_f32 (__recip_s); \
    __est = vmulq_f32 (vrecpsq_f32 (__est, __recip_s), __est); \
    (graphene_simd4f_t) vmulq_f32 (vrecpsq_f32 (__est, __recip_s), __est); \
  }))
1149 | |
/* Lane-wise @a + @b. */
# define graphene_simd4f_add(a,b) \
  (__extension__ ({ (graphene_simd4f_t) vaddq_f32 ((a), (b)); }))

/* Lane-wise @a - @b. */
# define graphene_simd4f_sub(a,b) \
  (__extension__ ({ (graphene_simd4f_t) vsubq_f32 ((a), (b)); }))

/* Lane-wise @a * @b. */
# define graphene_simd4f_mul(a,b) \
  (__extension__ ({ (graphene_simd4f_t) vmulq_f32 ((a), (b)); }))

/* Lane-wise division, implemented as @a multiplied by the result of
 * graphene_simd4f_reciprocal() applied to @b.
 */
# define graphene_simd4f_div(a,b) \
  (__extension__ ({ \
    const graphene_simd4f_t __rec = graphene_simd4f_reciprocal ((b)); \
    (graphene_simd4f_t) vmulq_f32 ((a), __rec); \
  }))
1170 | |
/* One refinement step for a reciprocal square root estimate:
 * returns estimate * vrsqrtsq_f32 (estimate * v, estimate).
 *
 * Internal helper: @estimate is expanded more than once, so callers
 * should pass plain variables.
 */
# define _simd4f_rsqrt_iter(v,estimate) \
  (__extension__ ({ \
    const graphene_simd4f_t __est1 = vmulq_f32 ((estimate), (v)); \
    (graphene_simd4f_t) vmulq_f32 ((estimate), vrsqrtsq_f32 (__est1, (estimate))); \
  }))

/* Per-lane reciprocal square root of @s.
 *
 * Starts from the vrsqrteq_f32() estimate and applies three
 * _simd4f_rsqrt_iter() refinement steps.
 *
 * @s is bound to a local first so that it is evaluated exactly once,
 * although the estimate and every refinement step need it.
 */
# define graphene_simd4f_rsqrt(s) \
  (__extension__ ({ \
    const graphene_simd4f_t __rsqrt_s = (s); \
    graphene_simd4f_t __estimate = vrsqrteq_f32 (__rsqrt_s); \
    __estimate = _simd4f_rsqrt_iter (__rsqrt_s, __estimate); \
    __estimate = _simd4f_rsqrt_iter (__rsqrt_s, __estimate); \
    _simd4f_rsqrt_iter (__rsqrt_s, __estimate); \
  }))
1184 | |
/* Per-lane square root of @s, computed as the reciprocal of the
 * reciprocal square root.
 *
 * vtstq_u32 (bits, bits) yields an all-ones lane wherever the input
 * lane has any bit set, and an all-zero lane otherwise; AND-ing it with
 * the result forces lanes whose input bit pattern is all zeros (+0.0f)
 * to 0.  NOTE(review): presumably this masks out the non-finite value
 * the reciprocal chain would otherwise produce for a zero input.
 *
 * @s is bound to a local first so that it is evaluated exactly once.
 */
# define graphene_simd4f_sqrt(s) \
  (__extension__ ({ \
    const graphene_simd4f_t __sqrt_s = (s); \
    graphene_simd4f_t __rsq = graphene_simd4f_rsqrt (__sqrt_s); \
    graphene_simd4f_t __rrsq = graphene_simd4f_reciprocal (__rsq); \
    uint32x4_t __tmp = vreinterpretq_u32_f32 (__sqrt_s); \
    (graphene_simd4f_t) vreinterpretq_f32_u32 (vandq_u32 (vtstq_u32 (__tmp, __tmp), vreinterpretq_u32_f32 (__rrsq))); \
  }))
1192 | |
/* 3-component cross product of @a and @b; the W lane of the result is 0.
 *
 * How the lanes move:
 *  - __a_yzx / __b_yzx hold the operands rotated to (y, z, x, y);
 *  - __s3 = __b_yzx * __a - __a_yzx * __b, whose first three lanes are
 *    the cross product in (z, x, y) order;
 *  - the same rotation is applied to __s3 to bring it to (x, y, z, ...);
 *  - the final AND with __mask (all bits set for lanes 0..2, none for
 *    lane 3) clears the W lane.
 */
# define graphene_simd4f_cross3(a,b) \
  (__extension__ ({ \
    const uint32_t __mask_bits[] = { 0xffffffff, 0xffffffff, 0xffffffff, 0 }; \
    const int32x4_t __mask = vld1q_s32 ((const int32_t *) __mask_bits); \
    const graphene_simd4f_t __a = (a), __b = (b); \
    const graphene_simd2f_t __a_low = vget_low_f32 (__a); \
    const graphene_simd2f_t __b_low = vget_low_f32 (__b); \
    const graphene_simd4f_t __a_yzx = vcombine_f32 (vext_f32 (__a_low, vget_high_f32 (__a), 1), __a_low); \
    const graphene_simd4f_t __b_yzx = vcombine_f32 (vext_f32 (__b_low, vget_high_f32 (__b), 1), __b_low); \
    graphene_simd4f_t __s3 = graphene_simd4f_sub (graphene_simd4f_mul (__b_yzx, __a), \
                                                  graphene_simd4f_mul (__a_yzx, __b)); \
    graphene_simd2f_t __s3_low = vget_low_f32 (__s3); \
    __s3 = vcombine_f32 (vext_f32 (__s3_low, vget_high_f32 (__s3), 1), __s3_low); \
    (graphene_simd4f_t) vandq_s32 ((int32x4_t) __s3, __mask); \
  }))
1208 | |
/* 3-component dot product of @a and @b, replicated into all four lanes. */
# define graphene_simd4f_dot3(a,b) \
  (__extension__ ({ \
    graphene_simd4f_splat (graphene_simd4f_dot3_scalar (a, b)); \
  }))

/* 3-component dot product of @a and @b as a scalar.
 *
 * The lane-wise product is reduced in two steps: a pairwise add of the
 * low half gives x + y, then adding the high half puts x + y + z in
 * lane 0, which is the lane returned.
 */
# define graphene_simd4f_dot3_scalar(a,b) \
  (__extension__ ({ \
    const graphene_simd4f_t __m = graphene_simd4f_mul (a, b); \
    const graphene_simd2f_t __m_low = vget_low_f32 (__m); \
    const graphene_simd2f_t __sum_xy = vpadd_f32 (__m_low, __m_low); \
    const graphene_simd2f_t __sum_xyz = vadd_f32 (__sum_xy, vget_high_f32 (__m)); \
    (float) vget_lane_f32 (__sum_xyz, 0); \
  }))
1220 | |
/* Lane-wise minimum of @a and @b (vminq_f32()). */
# define graphene_simd4f_min(a,b) \
  (__extension__ ({ (graphene_simd4f_t) vminq_f32 ((a), (b)); }))

/* Lane-wise maximum of @a and @b (vmaxq_f32()). */
# define graphene_simd4f_max(a,b) \
  (__extension__ ({ (graphene_simd4f_t) vmaxq_f32 ((a), (b)); }))
1230 | |
/* Lane permutations of a single vector.  The vector is stored in a
 * graphene_simd4f_union_t so its lanes can be read as floats, and a new
 * vector is built from them in the requested order.
 */

/* Result lanes are (w, x, y, z) of @v. */
# define graphene_simd4f_shuffle_wxyz(v) \
  (__extension__ ({ \
    graphene_simd4f_union_t __u; \
    __u.s = (v); \
    graphene_simd4f_init (__u.f[3], __u.f[0], __u.f[1], __u.f[2]); \
  }))

/* Result lanes are (z, w, x, y) of @v. */
# define graphene_simd4f_shuffle_zwxy(v) \
  (__extension__ ({ \
    graphene_simd4f_union_t __u; \
    __u.s = (v); \
    graphene_simd4f_init (__u.f[2], __u.f[3], __u.f[0], __u.f[1]); \
  }))

/* Result lanes are (y, z, w, x) of @v. */
# define graphene_simd4f_shuffle_yzwx(v) \
  (__extension__ ({ \
    graphene_simd4f_union_t __u; \
    __u.s = (v); \
    graphene_simd4f_init (__u.f[1], __u.f[2], __u.f[3], __u.f[0]); \
  }))
1248 | |
/* Copy of @v with the W lane set to 0.f. */
# define graphene_simd4f_zero_w(v) \
  (__extension__ ({ \
    graphene_simd4f_union_t __u; \
    __u.s = (v); \
    graphene_simd4f_init (__u.f[0], __u.f[1], __u.f[2], 0.f); \
  }))

/* Copy of @v with the Z and W lanes set to 0.f. */
# define graphene_simd4f_zero_zw(v) \
  (__extension__ ({ \
    graphene_simd4f_union_t __u; \
    __u.s = (v); \
    graphene_simd4f_init (__u.f[0], __u.f[1], 0.f, 0.f); \
  }))

/* Copy of @s with the W lane replaced by the scalar @v. */
# define graphene_simd4f_merge_w(s,v) \
  (__extension__ ({ \
    graphene_simd4f_union_t __u; \
    __u.s = (s); \
    graphene_simd4f_init (__u.f[0], __u.f[1], __u.f[2], (v)); \
  }))
1266 | |
/* Result lanes are (a.z, a.w, b.z, b.w). */
# define graphene_simd4f_merge_high(a,b) \
  (__extension__ ({ \
    graphene_simd4f_union_t __u_a; \
    graphene_simd4f_union_t __u_b; \
    __u_a.s = (a); \
    __u_b.s = (b); \
    graphene_simd4f_init (__u_a.f[2], __u_a.f[3], __u_b.f[2], __u_b.f[3]); \
  }))

/* Result lanes are (a.x, a.y, b.x, b.y). */
# define graphene_simd4f_merge_low(a,b) \
  (__extension__ ({ \
    graphene_simd4f_union_t __u_a; \
    graphene_simd4f_union_t __u_b; \
    __u_a.s = (a); \
    __u_b.s = (b); \
    graphene_simd4f_init (__u_a.f[0], __u_a.f[1], __u_b.f[0], __u_b.f[1]); \
  }))
1280 | |
/* Flip the sign of the Y and W lanes of @s by XOR-ing their bit
 * patterns with 0x80000000; the X and Z lanes are XOR-ed with 0 and so
 * stay unchanged.
 */
# define graphene_simd4f_flip_sign_0101(s) \
  (__extension__ ({ \
    const unsigned int __sign_bits_0101[4] = { 0x00000000, 0x80000000, 0x00000000, 0x80000000 }; \
    const uint32x4_t __flip_0101 = vld1q_u32 (__sign_bits_0101); \
    const uint32x4_t __bits_0101 = vreinterpretq_u32_f32 ((s)); \
    (graphene_simd4f_t) vreinterpretq_f32_u32 (veorq_u32 (__bits_0101, __flip_0101)); \
  }))

/* Flip the sign of the X and Z lanes of @s by XOR-ing their bit
 * patterns with 0x80000000; the Y and W lanes are XOR-ed with 0 and so
 * stay unchanged.
 */
# define graphene_simd4f_flip_sign_1010(s) \
  (__extension__ ({ \
    const unsigned int __sign_bits_1010[4] = { 0x80000000, 0x00000000, 0x80000000, 0x00000000 }; \
    const uint32x4_t __flip_1010 = vld1q_u32 (__sign_bits_1010); \
    const uint32x4_t __bits_1010 = vreinterpretq_u32_f32 ((s)); \
    (graphene_simd4f_t) vreinterpretq_f32_u32 (veorq_u32 (__bits_1010, __flip_1010)); \
  }))
1304 | |
/* Vector comparisons reduced to a single boolean.
 *
 * Each NEON comparison produces one 32-bit lane per element; a lane is
 * tested against zero to see whether the relation failed there.  Except
 * for cmp_neq, a macro is true only when no lane is zero.
 */

/* True when every lane of @a equals the matching lane of @b. */
# define graphene_simd4f_cmp_eq(a,b) \
  (__extension__ ({ \
    const uint32x4_t __eq_lanes = vceqq_f32 ((a), (b)); \
    (bool) !(vgetq_lane_u32 (__eq_lanes, 0) == 0 || \
             vgetq_lane_u32 (__eq_lanes, 1) == 0 || \
             vgetq_lane_u32 (__eq_lanes, 2) == 0 || \
             vgetq_lane_u32 (__eq_lanes, 3) == 0); \
  }))

/* True when at least one lane of @a differs from the matching lane of @b. */
# define graphene_simd4f_cmp_neq(a,b) \
  (__extension__ ({ \
    const uint32x4_t __neq_lanes = vceqq_f32 ((a), (b)); \
    (bool) !(vgetq_lane_u32 (__neq_lanes, 0) != 0 && \
             vgetq_lane_u32 (__neq_lanes, 1) != 0 && \
             vgetq_lane_u32 (__neq_lanes, 2) != 0 && \
             vgetq_lane_u32 (__neq_lanes, 3) != 0); \
  }))

/* True when every lane of @a is less than the matching lane of @b. */
# define graphene_simd4f_cmp_lt(a,b) \
  (__extension__ ({ \
    const uint32x4_t __lt_lanes = vcltq_f32 ((a), (b)); \
    (bool) !(vgetq_lane_u32 (__lt_lanes, 0) == 0 || \
             vgetq_lane_u32 (__lt_lanes, 1) == 0 || \
             vgetq_lane_u32 (__lt_lanes, 2) == 0 || \
             vgetq_lane_u32 (__lt_lanes, 3) == 0); \
  }))

/* True when every lane of @a is less than or equal to the matching
 * lane of @b.
 */
# define graphene_simd4f_cmp_le(a,b) \
  (__extension__ ({ \
    const uint32x4_t __le_lanes = vcleq_f32 ((a), (b)); \
    (bool) !(vgetq_lane_u32 (__le_lanes, 0) == 0 || \
             vgetq_lane_u32 (__le_lanes, 1) == 0 || \
             vgetq_lane_u32 (__le_lanes, 2) == 0 || \
             vgetq_lane_u32 (__le_lanes, 3) == 0); \
  }))

/* True when every lane of @a is greater than or equal to the matching
 * lane of @b.
 */
# define graphene_simd4f_cmp_ge(a,b) \
  (__extension__ ({ \
    const uint32x4_t __ge_lanes = vcgeq_f32 ((a), (b)); \
    (bool) !(vgetq_lane_u32 (__ge_lanes, 0) == 0 || \
             vgetq_lane_u32 (__ge_lanes, 1) == 0 || \
             vgetq_lane_u32 (__ge_lanes, 2) == 0 || \
             vgetq_lane_u32 (__ge_lanes, 3) == 0); \
  }))

/* True when every lane of @a is greater than the matching lane of @b. */
# define graphene_simd4f_cmp_gt(a,b) \
  (__extension__ ({ \
    const uint32x4_t __gt_lanes = vcgtq_f32 ((a), (b)); \
    (bool) !(vgetq_lane_u32 (__gt_lanes, 0) == 0 || \
             vgetq_lane_u32 (__gt_lanes, 1) == 0 || \
             vgetq_lane_u32 (__gt_lanes, 2) == 0 || \
             vgetq_lane_u32 (__gt_lanes, 3) == 0); \
  }))
1358 | |
/* Negate every lane of @s by XOR-ing its bit pattern with 0x80000000. */
# define graphene_simd4f_neg(s) \
  (__extension__ ({ \
    const uint32x4_t __sign_all = vdupq_n_u32 (0x80000000); \
    (graphene_simd4f_t) vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 ((s)), __sign_all)); \
  }))
1370 | |
1371 | #elif defined _MSC_VER /* Visual Studio ARM */ |
1372 | |
1373 | # define graphene_simd4f_init(x,y,z,w) _simd4f_init(x,y,z,w) |
1374 | static inline graphene_simd4f_t |
1375 | _simd4f_init (float x, float y, float z, float w) |
1376 | { |
1377 | const float32_t __v[4] = { (x), (y), (z), (w) }; |
1378 | return vld1q_f32 (__v); |
1379 | } |
1380 | |
1381 | # define graphene_simd4f_init_zero() vdupq_n_f32 (0.f) |
1382 | |
1383 | # define graphene_simd4f_init_4f(v) vld1q_f32 (v) |
1384 | |
/* Build a vector from the first three floats at @v; the W lane is 0.f.
 * @v is parenthesised before it is indexed, so that a pointer
 * expression such as `p + 1` indexes the intended elements.  @v is
 * expanded three times, so it should not have side effects.
 */
# define graphene_simd4f_init_3f(v) graphene_simd4f_init ((v)[0], (v)[1], (v)[2], 0.f)
1386 | |
1387 | # define graphene_simd4f_init_2f(v) _simd4f_init_2f(v) |
1388 | static inline graphene_simd4f_t |
1389 | _simd4f_init_2f (const float *v) |
1390 | { |
1391 | const float32_t *__v32 = (const float32_t *) (v); |
1392 | const graphene_simd2f_t __low = vld1_f32 (__v32); |
1393 | const float32_t __zero = 0; |
1394 | const graphene_simd2f_t __high = vld1_dup_f32 (&__zero); |
1395 | return vcombine_f32 (__low, __high); |
1396 | } |
1397 | |
1398 | # define graphene_simd4f_dup_4f(s,v) vst1q_f32 ((float32_t *) (v), (s)) |
1399 | |
1400 | # define graphene_simd4f_dup_3f(s,v) _simd4f_dup_3f(s,v) |
1401 | static inline |
1402 | void _simd4f_dup_3f (const graphene_simd4f_t s, |
1403 | float *v) |
1404 | { |
1405 | float *__v = (v); |
1406 | vst1q_lane_f32 (__v++, (s), 0); |
1407 | vst1q_lane_f32 (__v++, (s), 1); |
1408 | vst1q_lane_f32 (__v, (s), 2); |
1409 | } |
1410 | |
1411 | # define graphene_simd4f_dup_2f(s,v) vst1_f32 (v, vget_low_f32 (s)) |
1412 | |
1413 | # define graphene_simd4f_get(s,i) vgetq_lane_f32 ((s), (i)) |
1414 | |
/* Vector with all four lanes set to the scalar @v. */
# define graphene_simd4f_splat(v) \
  vdupq_n_f32 ((v))

/* Broadcast the X lane of @s into all four lanes. */
# define graphene_simd4f_splat_x(s) \
  graphene_simd4f_splat (graphene_simd4f_get_x ((s)))

/* Broadcast the Y lane of @s into all four lanes. */
# define graphene_simd4f_splat_y(s) \
  graphene_simd4f_splat (graphene_simd4f_get_y ((s)))

/* Broadcast the Z lane of @s into all four lanes. */
# define graphene_simd4f_splat_z(s) \
  graphene_simd4f_splat (graphene_simd4f_get_z ((s)))

/* Broadcast the W lane of @s into all four lanes. */
# define graphene_simd4f_splat_w(s) \
  graphene_simd4f_splat (graphene_simd4f_get_w ((s)))
1424 | |
1425 | # define graphene_simd4f_reciprocal(s) _simd4f_reciprocal(s) |
1426 | static inline graphene_simd4f_t |
1427 | _simd4f_reciprocal (const graphene_simd4f_t s) |
1428 | { |
1429 | graphene_simd4f_t __est = vrecpeq_f32 ((s)); |
1430 | __est = vmulq_f32 (vrecpsq_f32 (__est, (s)), __est); |
1431 | return vmulq_f32 (vrecpsq_f32 (__est, (s)), __est); |
1432 | } |
1433 | |
/* Lane-wise @a + @b. */
# define graphene_simd4f_add(a,b) \
  vaddq_f32 ((a), (b))

/* Lane-wise @a - @b. */
# define graphene_simd4f_sub(a,b) \
  vsubq_f32 ((a), (b))

/* Lane-wise @a * @b. */
# define graphene_simd4f_mul(a,b) \
  vmulq_f32 ((a), (b))

/* Lane-wise division, implemented as @a multiplied by the result of
 * graphene_simd4f_reciprocal() applied to @b.
 */
# define graphene_simd4f_div(a,b) \
  vmulq_f32 (a, graphene_simd4f_reciprocal (b))
1441 | |
1442 | static inline graphene_simd4f_t |
1443 | _simd4f_rsqrt_iter (const graphene_simd4f_t v, |
1444 | const graphene_simd4f_t estimate) |
1445 | { |
1446 | const graphene_simd4f_t __est1 = vmulq_f32 ((estimate), (v)); |
1447 | return vmulq_f32 ((estimate), vrsqrtsq_f32 (__est1, (estimate))); |
1448 | } |
1449 | |
1450 | # define graphene_simd4f_rsqrt(s) _simd4f_rsqrt(s) |
1451 | static inline graphene_simd4f_t |
1452 | _simd4f_rsqrt (const graphene_simd4f_t s) |
1453 | { |
1454 | graphene_simd4f_t __estimate = vrsqrteq_f32 ((s)); |
1455 | __estimate = _simd4f_rsqrt_iter ((s), __estimate); |
1456 | __estimate = _simd4f_rsqrt_iter ((s), __estimate); |
1457 | return _simd4f_rsqrt_iter ((s), __estimate); |
1458 | } |
1459 | |
1460 | # define graphene_simd4f_sqrt(s) _simd4f_sqrt(s) |
1461 | static inline graphene_simd4f_t |
1462 | _simd4f_sqrt (const graphene_simd4f_t s) |
1463 | { |
1464 | graphene_simd4f_t __rsq = graphene_simd4f_rsqrt ((s)); |
1465 | graphene_simd4f_t __rrsq = graphene_simd4f_reciprocal (__rsq); |
1466 | uint32x4_t __tmp = vreinterpretq_u32_f32 ((s)); \ |
1467 | return vreinterpretq_f32_u32 (vandq_u32 (vtstq_u32 (__tmp, __tmp), vreinterpretq_u32_f32 (__rrsq))); |
1468 | } |
1469 | |
/* 3-component cross product of @a and @b; the W lane of the result is 0.
 *
 * How the lanes move:
 *  - __a_yzx / __b_yzx hold the operands rotated to (y, z, x, y);
 *  - __s3 = __b_yzx * __a - __a_yzx * __b, whose first three lanes are
 *    the cross product in (z, x, y) order;
 *  - the same rotation is applied to __s3 to bring it to (x, y, z, ...);
 *  - the final AND with __mask (all bits set for lanes 0..2, none for
 *    lane 3) clears the W lane.
 */
# define graphene_simd4f_cross3(a,b) _simd4f_cross3(a,b)
static inline graphene_simd4f_t
_simd4f_cross3 (const graphene_simd4f_t a,
                const graphene_simd4f_t b)
{
  const uint32_t __mask_bits[] = { 0xffffffff, 0xffffffff, 0xffffffff, 0 };
  const int32x4_t __mask = vld1q_s32 ((const int32_t *) __mask_bits);
  const graphene_simd4f_t __a = (a), __b = (b);
  const graphene_simd2f_t __a_low = vget_low_f32 (__a);
  const graphene_simd2f_t __b_low = vget_low_f32 (__b);
  const graphene_simd4f_t __a_yzx = vcombine_f32 (vext_f32 (__a_low, vget_high_f32 (__a), 1), __a_low);
  const graphene_simd4f_t __b_yzx = vcombine_f32 (vext_f32 (__b_low, vget_high_f32 (__b), 1), __b_low);
  graphene_simd4f_t __s3 = graphene_simd4f_sub (graphene_simd4f_mul (__b_yzx, __a),
                                                graphene_simd4f_mul (__a_yzx, __b));
  graphene_simd2f_t __s3_low = vget_low_f32 (__s3);
  __s3 = vcombine_f32 (vext_f32 (__s3_low, vget_high_f32 (__s3), 1), __s3_low);
  return vandq_s32 (__s3, __mask);
}
1488 | |
1489 | # define graphene_simd4f_dot3(a,b) graphene_simd4f_splat (graphene_simd4f_dot3_scalar (a, b)) |
1490 | |
1491 | # define graphene_simd4f_dot3_scalar(a,b) _simd4f_dot3_scalar(a,b) |
1492 | static inline float |
1493 | _simd4f_dot3_scalar (const graphene_simd4f_t a, |
1494 | const graphene_simd4f_t b) |
1495 | { |
1496 | const graphene_simd4f_t __m = graphene_simd4f_mul (a, b); |
1497 | const graphene_simd2f_t __s1 = vpadd_f32 (vget_low_f32 (__m), vget_low_f32 (__m)); |
1498 | return vget_lane_f32 (vadd_f32 (__s1, vget_high_f32 (__m)), 0); |
1499 | } |
1500 | |
/* Lane-wise minimum of @a and @b (vminq_f32()). */
# define graphene_simd4f_min(a,b) \
  vminq_f32 ((a), (b))

/* Lane-wise maximum of @a and @b (vmaxq_f32()). */
# define graphene_simd4f_max(a,b) \
  vmaxq_f32 (a, b)
1504 | |
1505 | # define graphene_simd4f_shuffle_wxyz(v) _simd4f_shuffle_wxyz(v) |
1506 | static inline graphene_simd4f_t |
1507 | _simd4f_shuffle_wxyz (const graphene_simd4f_t v) |
1508 | { |
1509 | graphene_simd4f_union_t __u = { (v) }; |
1510 | return graphene_simd4f_init (__u.f[3], __u.f[0], __u.f[1], __u.f[2]); |
1511 | } |
1512 | |
1513 | # define graphene_simd4f_shuffle_zwxy(v) _simd4f_shuffle_zwxy(v) |
1514 | static inline graphene_simd4f_t |
1515 | _simd4f_shuffle_zwxy (const graphene_simd4f_t v) |
1516 | { |
1517 | graphene_simd4f_union_t __u = { (v) }; |
1518 | return graphene_simd4f_init (__u.f[2], __u.f[3], __u.f[0], __u.f[1]); |
1519 | } |
1520 | |
1521 | # define graphene_simd4f_shuffle_yzwx(v) _simd4f_shuffle_yzwx(v) |
1522 | static inline graphene_simd4f_t |
1523 | _simd4f_shuffle_yzwx (const graphene_simd4f_t v) |
1524 | { |
1525 | graphene_simd4f_union_t __u = { (v) }; |
1526 | return graphene_simd4f_init (__u.f[1], __u.f[2], __u.f[3], __u.f[0]); |
1527 | } |
1528 | |
1529 | # define graphene_simd4f_zero_w(v) _simd4f_zero_w(v) |
1530 | static inline graphene_simd4f_t |
1531 | _simd4f_zero_w (const graphene_simd4f_t v) |
1532 | { |
1533 | graphene_simd4f_union_t __u = { (v) }; |
1534 | return graphene_simd4f_init (__u.f[0], __u.f[1], __u.f[2], 0.f); |
1535 | } |
1536 | |
1537 | # define graphene_simd4f_zero_zw(v) _simd4f_zero_zw(v) |
1538 | static inline graphene_simd4f_t |
1539 | _simd4f_zero_zw (const graphene_simd4f_t v) |
1540 | { |
1541 | graphene_simd4f_union_t __u = { (v) }; |
1542 | return graphene_simd4f_init (__u.f[0], __u.f[1], 0.f, 0.f); |
1543 | } |
1544 | |
1545 | # define graphene_simd4f_merge_w(s,v) _simd4f_merge_w(s,v) |
1546 | static inline graphene_simd4f_t |
1547 | _simd4f_merge_w (const graphene_simd4f_t s, |
1548 | float v) |
1549 | { |
1550 | graphene_simd4f_union_t __u = { (s) }; |
1551 | return graphene_simd4f_init (__u.f[0], __u.f[1], __u.f[2], (v)); |
1552 | } |
1553 | |
1554 | # define graphene_simd4f_merge_high(a,b) _simd4f_merge_high(a,b) |
1555 | static inline graphene_simd4f_t |
1556 | _simd4f_merge_high (const graphene_simd4f_t a, |
1557 | const graphene_simd4f_t b) |
1558 | { |
1559 | graphene_simd4f_union_t __u_a = { (a) }; |
1560 | graphene_simd4f_union_t __u_b = { (b) }; |
1561 | return graphene_simd4f_init (__u_a.f[2], __u_a.f[3], __u_b.f[2], __u_b.f[3]); |
1562 | } |
1563 | |
1564 | # define graphene_simd4f_merge_low(a,b) _simd4f_merge_low(a,b) |
1565 | static inline graphene_simd4f_t |
1566 | _simd4f_merge_low (const graphene_simd4f_t a, |
1567 | const graphene_simd4f_t b) |
1568 | { |
1569 | graphene_simd4f_union_t __u_a = { (a) }; |
1570 | graphene_simd4f_union_t __u_b = { (b) }; |
1571 | return graphene_simd4f_init (__u_a.f[0], __u_a.f[1], __u_b.f[0], __u_b.f[1]); |
1572 | } |
1573 | |
1574 | |
1575 | # define graphene_simd4f_flip_sign_0101(s) _simd4f_flip_sign_0101(s) |
1576 | static inline graphene_simd4f_t |
1577 | _simd4f_flip_sign_0101 (const graphene_simd4f_t s) |
1578 | { |
1579 | const unsigned int __upnpn[4] = { |
1580 | 0x00000000, |
1581 | 0x80000000, |
1582 | 0x00000000, |
1583 | 0x80000000 |
1584 | }; |
1585 | const uint32x4_t __pnpn = vld1q_u32 (__upnpn); |
1586 | return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 ((s)), __pnpn)); |
1587 | } |
1588 | |
1589 | # define graphene_simd4f_flip_sign_1010(s) _simd4f_flip_sign_1010(s) |
1590 | static inline graphene_simd4f_t |
1591 | _simd4f_flip_sign_1010 (const graphene_simd4f_t s) |
1592 | { |
1593 | const unsigned int __unpnp[4] = { |
1594 | 0x80000000, |
1595 | 0x00000000, |
1596 | 0x80000000, |
1597 | 0x00000000 |
1598 | }; |
1599 | |
1600 | const uint32x4_t __npnp = vld1q_u32 (__unpnp); |
1601 | return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 ((s)), __npnp)); |
1602 | } |
1603 | |
1604 | # define graphene_simd4f_cmp_eq(a,b) _simd4f_cmp_eq(a,b) |
1605 | static inline bool |
1606 | _simd4f_cmp_eq (const graphene_simd4f_t a, |
1607 | const graphene_simd4f_t b) |
1608 | { |
1609 | const uint32x4_t __mask = vceqq_f32 ((a), (b)); |
1610 | return (vgetq_lane_u32 (__mask, 0) != 0 && |
1611 | vgetq_lane_u32 (__mask, 1) != 0 && |
1612 | vgetq_lane_u32 (__mask, 2) != 0 && |
1613 | vgetq_lane_u32 (__mask, 3) != 0); |
1614 | } |
1615 | |
1616 | # define graphene_simd4f_cmp_neq(a,b) _simd4f_cmp_neq(a,b) |
1617 | static inline bool |
1618 | _simd4f_cmp_neq (const graphene_simd4f_t a, |
1619 | const graphene_simd4f_t b) |
1620 | { |
1621 | const uint32x4_t __mask = vceqq_f32 ((a), (b)); |
1622 | return (vgetq_lane_u32 (__mask, 0) == 0 || |
1623 | vgetq_lane_u32 (__mask, 1) == 0 || |
1624 | vgetq_lane_u32 (__mask, 2) == 0 || |
1625 | vgetq_lane_u32 (__mask, 3) == 0); |
1626 | } |
1627 | |
1628 | # define graphene_simd4f_cmp_lt(a,b) _simd4f_cmp_lt(a,b) |
1629 | static inline bool |
1630 | _simd4f_cmp_lt (const graphene_simd4f_t a, |
1631 | const graphene_simd4f_t b) |
1632 | { |
1633 | const uint32x4_t __mask = vcltq_f32 ((a), (b)); |
1634 | return (vgetq_lane_u32 (__mask, 0) != 0 && |
1635 | vgetq_lane_u32 (__mask, 1) != 0 && |
1636 | vgetq_lane_u32 (__mask, 2) != 0 && |
1637 | vgetq_lane_u32 (__mask, 3) != 0); |
1638 | } |
1639 | |
1640 | # define graphene_simd4f_cmp_le(a,b) _simd4f_cmp_le(a,b) |
1641 | static inline bool |
1642 | _simd4f_cmp_le (const graphene_simd4f_t a, |
1643 | const graphene_simd4f_t b) |
1644 | { |
1645 | const uint32x4_t __mask = vcleq_f32 ((a), (b)); |
1646 | return (vgetq_lane_u32 (__mask, 0) != 0 && |
1647 | vgetq_lane_u32 (__mask, 1) != 0 && |
1648 | vgetq_lane_u32 (__mask, 2) != 0 && |
1649 | vgetq_lane_u32 (__mask, 3) != 0); |
1650 | } |
1651 | |
1652 | # define graphene_simd4f_cmp_ge(a,b) _simd4f_cmp_ge(a,b) |
1653 | static inline bool |
1654 | _simd4f_cmp_ge (const graphene_simd4f_t a, |
1655 | const graphene_simd4f_t b) |
1656 | { |
1657 | const uint32x4_t __mask = vcgeq_f32 ((a), (b)); |
1658 | return (vgetq_lane_u32 (__mask, 0) != 0 && |
1659 | vgetq_lane_u32 (__mask, 1) != 0 && |
1660 | vgetq_lane_u32 (__mask, 2) != 0 && |
1661 | vgetq_lane_u32 (__mask, 3) != 0); |
1662 | } |
1663 | |
1664 | # define graphene_simd4f_cmp_gt(a,b) _simd4f_cmp_gt(a,b) |
1665 | static inline bool |
1666 | _simd4f_cmp_gt (const graphene_simd4f_t a, |
1667 | const graphene_simd4f_t b) |
1668 | { |
1669 | const uint32x4_t __mask = vcgtq_f32 ((a), (b)); |
1670 | return (vgetq_lane_u32 (__mask, 0) != 0 && |
1671 | vgetq_lane_u32 (__mask, 1) != 0 && |
1672 | vgetq_lane_u32 (__mask, 2) != 0 && |
1673 | vgetq_lane_u32 (__mask, 3) != 0); |
1674 | } |
1675 | |
1676 | # define graphene_simd4f_neg(s) _simd4f_neg(s) |
1677 | static inline graphene_simd4f_t |
1678 | _simd4f_neg (const graphene_simd4f_t s) |
1679 | { |
1680 | const unsigned int __umask[4] = { |
1681 | 0x80000000, |
1682 | 0x80000000, |
1683 | 0x80000000, |
1684 | 0x80000000 |
1685 | }; |
1686 | const uint32x4_t __mask = vld1q_u32 (__umask); |
1687 | return vreinterpretq_f32_u32 (veorq_u32 (vreinterpretq_u32_f32 ((s)), __mask)); |
1688 | } |
1689 | |
#else /* ARM NEON intrinsics: neither a GCC-compatible compiler nor Visual Studio */
1691 | |
1692 | # error "Need GCC-compatible or Visual Studio compiler for ARM NEON extensions." |
1693 | |
1694 | /* Use static inline to inline all these functions */ |
1695 | |
1696 | # endif /* !__GNUC__ && !_MSC_VER */ |
1697 | |
1698 | /* macros that are not compiler-dependent */ |
/* Named accessors for lanes 0..3, built on graphene_simd4f_get(). */
# define graphene_simd4f_get_x(s) \
  graphene_simd4f_get (s, 0)
# define graphene_simd4f_get_y(s) \
  graphene_simd4f_get (s, 1)
# define graphene_simd4f_get_z(s) \
  graphene_simd4f_get (s, 2)
# define graphene_simd4f_get_w(s) \
  graphene_simd4f_get (s, 3)
1703 | |
1704 | #elif defined(__GI_SCANNER__) || defined(GRAPHENE_USE_SCALAR) |
1705 | |
/* Fallback implementation using scalar types.
 *
 * Every macro below expands to a call to the function that has the same
 * name: the preprocessor does not expand a macro again inside its own
 * expansion, so the inner identifier resolves to the real function.
 * The macros only add parentheses around their arguments and, for the
 * pointer-taking variants, a cast to the expected float pointer type.
 */

/* Initialization */
#define graphene_simd4f_init(x,y,z,w) (graphene_simd4f_init ((x), (y), (z), (w)))
#define graphene_simd4f_init_zero() (graphene_simd4f_init_zero ())
#define graphene_simd4f_init_4f(v) (graphene_simd4f_init_4f ((const float *) (v)))
#define graphene_simd4f_init_3f(v) (graphene_simd4f_init_3f ((const float *) (v)))
#define graphene_simd4f_init_2f(v) (graphene_simd4f_init_2f ((const float *) (v)))

/* Copy into a float array */
#define graphene_simd4f_dup_4f(s,v) (graphene_simd4f_dup_4f ((s), (float *) (v)))
#define graphene_simd4f_dup_3f(s,v) (graphene_simd4f_dup_3f ((s), (float *) (v)))
#define graphene_simd4f_dup_2f(s,v) (graphene_simd4f_dup_2f ((s), (float *) (v)))

/* Component access */
#define graphene_simd4f_get(s,i) (graphene_simd4f_get ((s), (i)))
#define graphene_simd4f_get_x(s) (graphene_simd4f_get_x ((s)))
#define graphene_simd4f_get_y(s) (graphene_simd4f_get_y ((s)))
#define graphene_simd4f_get_z(s) (graphene_simd4f_get_z ((s)))
#define graphene_simd4f_get_w(s) (graphene_simd4f_get_w ((s)))

/* Splatting */
#define graphene_simd4f_splat(v) (graphene_simd4f_splat ((v)))
#define graphene_simd4f_splat_x(s) (graphene_simd4f_splat_x ((s)))
#define graphene_simd4f_splat_y(s) (graphene_simd4f_splat_y ((s)))
#define graphene_simd4f_splat_z(s) (graphene_simd4f_splat_z ((s)))
#define graphene_simd4f_splat_w(s) (graphene_simd4f_splat_w ((s)))

/* Arithmetic */
#define graphene_simd4f_add(a,b) (graphene_simd4f_add ((a), (b)))
#define graphene_simd4f_sub(a,b) (graphene_simd4f_sub ((a), (b)))
#define graphene_simd4f_mul(a,b) (graphene_simd4f_mul ((a), (b)))
#define graphene_simd4f_div(a,b) (graphene_simd4f_div ((a), (b)))
#define graphene_simd4f_sqrt(s) (graphene_simd4f_sqrt ((s)))
#define graphene_simd4f_rsqrt(s) (graphene_simd4f_rsqrt ((s)))
#define graphene_simd4f_reciprocal(s) (graphene_simd4f_reciprocal ((s)))

/* Products and component-wise extrema */
#define graphene_simd4f_cross3(a,b) (graphene_simd4f_cross3 ((a), (b)))
#define graphene_simd4f_dot3(a,b) (graphene_simd4f_dot3 ((a), (b)))
#define graphene_simd4f_dot3_scalar(a,b) (graphene_simd4f_dot3_scalar ((a), (b)))
#define graphene_simd4f_min(a,b) (graphene_simd4f_min ((a), (b)))
#define graphene_simd4f_max(a,b) (graphene_simd4f_max ((a), (b)))

/* Shuffles and sign flips */
#define graphene_simd4f_shuffle_wxyz(s) (graphene_simd4f_shuffle_wxyz ((s)))
#define graphene_simd4f_shuffle_zwxy(s) (graphene_simd4f_shuffle_zwxy ((s)))
#define graphene_simd4f_shuffle_yzwx(s) (graphene_simd4f_shuffle_yzwx ((s)))
#define graphene_simd4f_flip_sign_0101(s) (graphene_simd4f_flip_sign_0101 ((s)))
#define graphene_simd4f_flip_sign_1010(s) (graphene_simd4f_flip_sign_1010 ((s)))

/* Zeroing and merging */
#define graphene_simd4f_zero_w(v) (graphene_simd4f_zero_w ((v)))
#define graphene_simd4f_zero_zw(v) (graphene_simd4f_zero_zw ((v)))
#define graphene_simd4f_merge_w(s,v) (graphene_simd4f_merge_w ((s), (v)))
#define graphene_simd4f_merge_high(a,b) (graphene_simd4f_merge_high ((a), (b)))
#define graphene_simd4f_merge_low(a,b) (graphene_simd4f_merge_low ((a), (b)))

/* Comparisons */
#define graphene_simd4f_cmp_eq(a,b) (graphene_simd4f_cmp_eq ((a), (b)))
#define graphene_simd4f_cmp_neq(a,b) (graphene_simd4f_cmp_neq ((a), (b)))
#define graphene_simd4f_cmp_lt(a,b) (graphene_simd4f_cmp_lt ((a), (b)))
#define graphene_simd4f_cmp_le(a,b) (graphene_simd4f_cmp_le ((a), (b)))
#define graphene_simd4f_cmp_ge(a,b) (graphene_simd4f_cmp_ge ((a), (b)))
#define graphene_simd4f_cmp_gt(a,b) (graphene_simd4f_cmp_gt ((a), (b)))

/* Negation */
#define graphene_simd4f_neg(s) (graphene_simd4f_neg ((s)))
1802 | |
1803 | #else |
1804 | # error "Unsupported simd4f implementation." |
1805 | #endif |
1806 | |
1807 | /* Generic operations, inlined */ |
1808 | |
1809 | /** |
1810 | * graphene_simd4f_madd: |
1811 | * @m1: a #graphene_simd4f_t |
1812 | * @m2: a #graphene_simd4f_t |
1813 | * @a: a #graphene_simd4f_t |
1814 | * |
1815 | * Adds @a to the product of @m1 and @m2. |
1816 | * |
1817 | * Returns: the result vector |
1818 | * |
1819 | * Since: 1.0 |
1820 | */ |
1821 | static inline graphene_simd4f_t |
1822 | graphene_simd4f_madd (const graphene_simd4f_t m1, |
1823 | const graphene_simd4f_t m2, |
1824 | const graphene_simd4f_t a) |
1825 | { |
1826 | return graphene_simd4f_add (graphene_simd4f_mul (m1, m2), a); |
1827 | } |
1828 | |
1829 | /** |
1830 | * graphene_simd4f_sum: |
1831 | * @v: a #graphene_simd4f_t |
1832 | * |
1833 | * Sums all components of the given vector. |
1834 | * |
1835 | * Returns: a vector with all components set to be the |
1836 | * sum of the passed #graphene_simd4f_t |
1837 | * |
1838 | * Since: 1.0 |
1839 | */ |
1840 | static inline graphene_simd4f_t |
1841 | graphene_simd4f_sum (const graphene_simd4f_t v) |
1842 | { |
1843 | const graphene_simd4f_t x = graphene_simd4f_splat_x (v); |
1844 | const graphene_simd4f_t y = graphene_simd4f_splat_y (v); |
1845 | const graphene_simd4f_t z = graphene_simd4f_splat_z (v); |
1846 | const graphene_simd4f_t w = graphene_simd4f_splat_w (v); |
1847 | |
1848 | return graphene_simd4f_add (graphene_simd4f_add (x, y), |
1849 | graphene_simd4f_add (z, w)); |
1850 | } |
1851 | |
1852 | /** |
1853 | * graphene_simd4f_sum_scalar: |
1854 | * @v: a #graphene_simd4f_t |
1855 | * |
1856 | * Sums all the components of the given vector. |
1857 | * |
1858 | * Returns: a scalar value with the sum of the components |
1859 | * of the given #graphene_simd4f_t |
1860 | * |
1861 | * Since: 1.0 |
1862 | */ |
1863 | static inline float |
1864 | graphene_simd4f_sum_scalar (const graphene_simd4f_t v) |
1865 | { |
1866 | return graphene_simd4f_get_x (graphene_simd4f_sum (v)); |
1867 | } |
1868 | |
1869 | /** |
1870 | * graphene_simd4f_dot4: |
1871 | * @a: a #graphene_simd4f_t |
1872 | * @b: a #graphene_simd4f_t |
1873 | * |
1874 | * Computes the dot product of all the components of the two |
1875 | * given #graphene_simd4f_t. |
1876 | * |
1877 | * Returns: a vector whose components are all set to be the |
1878 | * dot product of the components of the two operands |
1879 | * |
1880 | * Since: 1.0 |
1881 | */ |
1882 | static inline graphene_simd4f_t |
1883 | graphene_simd4f_dot4 (const graphene_simd4f_t a, |
1884 | const graphene_simd4f_t b) |
1885 | { |
1886 | return graphene_simd4f_sum (graphene_simd4f_mul (a, b)); |
1887 | } |
1888 | |
1889 | /** |
1890 | * graphene_simd4f_dot2: |
1891 | * @a: a #graphene_simd4f_t |
1892 | * @b: a #graphene_simd4f_t |
1893 | * |
1894 | * Computes the dot product of the first two components of the |
1895 | * two given #graphene_simd4f_t. |
1896 | * |
1897 | * Returns: a vector whose components are all set to the |
1898 | * dot product of the components of the two operands |
1899 | * |
1900 | * Since: 1.0 |
1901 | */ |
1902 | static inline graphene_simd4f_t |
1903 | graphene_simd4f_dot2 (const graphene_simd4f_t a, |
1904 | const graphene_simd4f_t b) |
1905 | { |
1906 | const graphene_simd4f_t m = graphene_simd4f_mul (a, b); |
1907 | const graphene_simd4f_t x = graphene_simd4f_splat_x (m); |
1908 | const graphene_simd4f_t y = graphene_simd4f_splat_y (m); |
1909 | |
1910 | return graphene_simd4f_add (x, y); |
1911 | } |
1912 | |
1913 | /** |
1914 | * graphene_simd4f_length4: |
1915 | * @v: a #graphene_simd4f_t |
1916 | * |
1917 | * Computes the length of the given #graphene_simd4f_t vector, |
1918 | * using all four of its components. |
1919 | * |
1920 | * Returns: the length vector |
1921 | * |
1922 | * Since: 1.0 |
1923 | */ |
1924 | static inline graphene_simd4f_t |
1925 | graphene_simd4f_length4 (const graphene_simd4f_t v) |
1926 | { |
1927 | return graphene_simd4f_sqrt (graphene_simd4f_dot4 (v, v)); |
1928 | } |
1929 | |
1930 | /** |
1931 | * graphene_simd4f_length3: |
1932 | * @v: a #graphene_simd4f_t |
1933 | * |
1934 | * Computes the length of the given #graphene_simd4f_t vector, |
1935 | * using the first three of its components. |
1936 | * |
1937 | * Returns: the length vector |
1938 | * |
1939 | * Since: 1.0 |
1940 | */ |
1941 | static inline graphene_simd4f_t |
1942 | graphene_simd4f_length3 (const graphene_simd4f_t v) |
1943 | { |
1944 | return graphene_simd4f_sqrt (graphene_simd4f_dot3 (v, v)); |
1945 | } |
1946 | |
1947 | /** |
1948 | * graphene_simd4f_length2: |
1949 | * @v: a #graphene_simd4f_t |
1950 | * |
1951 | * Computes the length of the given #graphene_simd4f_t vector, |
1952 | * using the first two of its components. |
1953 | * |
1954 | * Returns: the length vector |
1955 | * |
1956 | * Since: 1.0 |
1957 | */ |
1958 | static inline graphene_simd4f_t |
1959 | graphene_simd4f_length2 (const graphene_simd4f_t v) |
1960 | { |
1961 | return graphene_simd4f_sqrt (graphene_simd4f_dot2 (v, v)); |
1962 | } |
1963 | |
1964 | /** |
1965 | * graphene_simd4f_normalize4: |
1966 | * @v: a #graphene_simd4f_t |
1967 | * |
1968 | * Computes the normalization of the given #graphene_simd4f_t vector, |
1969 | * using all of its components. |
1970 | * |
1971 | * Returns: the normalized vector |
1972 | * |
1973 | * Since: 1.0 |
1974 | */ |
1975 | static inline graphene_simd4f_t |
1976 | graphene_simd4f_normalize4 (const graphene_simd4f_t v) |
1977 | { |
1978 | graphene_simd4f_t invlen = graphene_simd4f_rsqrt (graphene_simd4f_dot4 (v, v)); |
1979 | return graphene_simd4f_mul (v, invlen); |
1980 | } |
1981 | |
1982 | /** |
1983 | * graphene_simd4f_normalize3: |
1984 | * @v: a #graphene_simd4f_t |
1985 | * |
1986 | * Computes the normalization of the given #graphene_simd4f_t vector, |
1987 | * using the first three of its components. |
1988 | * |
1989 | * Returns: the normalized vector |
1990 | * |
1991 | * Since: 1.0 |
1992 | */ |
1993 | static inline graphene_simd4f_t |
1994 | graphene_simd4f_normalize3 (const graphene_simd4f_t v) |
1995 | { |
1996 | graphene_simd4f_t invlen = graphene_simd4f_rsqrt (graphene_simd4f_dot3 (v, v)); |
1997 | return graphene_simd4f_mul (v, invlen); |
1998 | } |
1999 | |
2000 | /** |
2001 | * graphene_simd4f_normalize2: |
2002 | * @v: a #graphene_simd4f_t |
2003 | * |
2004 | * Computes the normalization of the given #graphene_simd4f_t vector, |
2005 | * using the first two of its components. |
2006 | * |
2007 | * Returns: the normalized vector |
2008 | * |
2009 | * Since: 1.0 |
2010 | */ |
2011 | static inline graphene_simd4f_t |
2012 | graphene_simd4f_normalize2 (const graphene_simd4f_t v) |
2013 | { |
2014 | graphene_simd4f_t invlen = graphene_simd4f_rsqrt (graphene_simd4f_dot2 (v, v)); |
2015 | return graphene_simd4f_mul (v, invlen); |
2016 | } |
2017 | |
2018 | /** |
2019 | * graphene_simd4f_is_zero4: |
2020 | * @v: a #graphene_simd4f_t |
2021 | * |
2022 | * Checks whether the given #graphene_simd4f_t has all its components |
2023 | * set to 0. |
2024 | * |
2025 | * Returns: `true` if all the vector components are zero |
2026 | * |
2027 | * Since: 1.0 |
2028 | */ |
2029 | static inline bool |
2030 | graphene_simd4f_is_zero4 (const graphene_simd4f_t v) |
2031 | { |
2032 | graphene_simd4f_t zero = graphene_simd4f_init_zero (); |
2033 | return graphene_simd4f_cmp_eq (v, zero); |
2034 | } |
2035 | |
2036 | /** |
2037 | * graphene_simd4f_is_zero3: |
2038 | * @v: a #graphene_simd4f_t |
2039 | * |
2040 | * Checks whether the given #graphene_simd4f_t has the first three of |
2041 | * its components set to 0. |
2042 | * |
2043 | * Returns: `true` if the vector's components are zero |
2044 | * |
2045 | * Since: 1.0 |
2046 | */ |
2047 | static inline bool |
2048 | graphene_simd4f_is_zero3 (const graphene_simd4f_t v) |
2049 | { |
2050 | return fabsf (graphene_simd4f_get_x (v)) <= FLT_EPSILON && |
2051 | fabsf (graphene_simd4f_get_y (v)) <= FLT_EPSILON && |
2052 | fabsf (graphene_simd4f_get_z (v)) <= FLT_EPSILON; |
2053 | } |
2054 | |
2055 | /** |
2056 | * graphene_simd4f_is_zero2: |
2057 | * @v: a #graphene_simd4f_t |
2058 | * |
2059 | * Checks whether the given #graphene_simd4f_t has the first two of |
2060 | * its components set to 0. |
2061 | * |
2062 | * Returns: `true` if the vector's components are zero |
2063 | * |
2064 | * Since: 1.0 |
2065 | */ |
2066 | static inline bool |
2067 | graphene_simd4f_is_zero2 (const graphene_simd4f_t v) |
2068 | { |
2069 | return fabsf (graphene_simd4f_get_x (v)) <= FLT_EPSILON && |
2070 | fabsf (graphene_simd4f_get_y (v)) <= FLT_EPSILON; |
2071 | } |
2072 | |
2073 | /** |
2074 | * graphene_simd4f_interpolate: |
2075 | * @a: a #graphene_simd4f_t |
2076 | * @b: a #graphene_simd4f_t |
2077 | * @f: the interpolation factor |
2078 | * |
2079 | * Linearly interpolates all components of the two given |
2080 | * #graphene_simd4f_t vectors using the given factor @f. |
2081 | * |
2082 | * Returns: the intrerpolated vector |
2083 | * |
2084 | * Since: 1.0 |
2085 | */ |
2086 | static inline graphene_simd4f_t |
2087 | graphene_simd4f_interpolate (const graphene_simd4f_t a, |
2088 | const graphene_simd4f_t b, |
2089 | float f) |
2090 | { |
2091 | const graphene_simd4f_t one_minus_f = graphene_simd4f_sub (graphene_simd4f_splat (1.f), |
2092 | graphene_simd4f_splat (f)); |
2093 | |
2094 | return graphene_simd4f_add (graphene_simd4f_mul (one_minus_f, a), |
2095 | graphene_simd4f_mul (graphene_simd4f_splat (f), b)); |
2096 | } |
2097 | |
2098 | /** |
2099 | * graphene_simd4f_clamp: |
2100 | * @v: a #graphene_simd4f_t |
2101 | * @min: the lower boundary |
2102 | * @max: the upper boundary |
2103 | * |
2104 | * Ensures that all components of the vector @v are within |
2105 | * the components of the @lower and @upper boundaries. |
2106 | * |
2107 | * Returns: the clamped vector |
2108 | * |
2109 | * Since: 1.2 |
2110 | */ |
2111 | static inline graphene_simd4f_t |
2112 | graphene_simd4f_clamp (const graphene_simd4f_t v, |
2113 | const graphene_simd4f_t min, |
2114 | const graphene_simd4f_t max) |
2115 | { |
2116 | const graphene_simd4f_t tmp = graphene_simd4f_max (min, v); |
2117 | |
2118 | return graphene_simd4f_min (tmp, max); |
2119 | } |
2120 | |
2121 | /** |
2122 | * graphene_simd4f_clamp_scalar: |
2123 | * @v: a #graphene_simd4f_t |
2124 | * @min: the lower boundary |
2125 | * @max: the upper boundary |
2126 | * |
2127 | * Ensures that all components of the vector @v are within |
2128 | * the @lower and @upper boundary scalar values. |
2129 | * |
2130 | * Returns: the clamped vector |
2131 | * |
2132 | * Since: 1.2 |
2133 | */ |
2134 | static inline graphene_simd4f_t |
2135 | graphene_simd4f_clamp_scalar (const graphene_simd4f_t v, |
2136 | float min, |
2137 | float max) |
2138 | { |
2139 | return graphene_simd4f_clamp (v, |
2140 | graphene_simd4f_splat (min), |
2141 | graphene_simd4f_splat (max)); |
2142 | } |
2143 | |
2144 | /** |
2145 | * graphene_simd4f_min_val: |
2146 | * @v: a #graphene_simd4f_t |
2147 | * |
2148 | * Computes the minimum value of all the channels in the given vector. |
2149 | * |
2150 | * Returns: a vector whose components are all set to the |
2151 | * minimum value in the original vector |
2152 | * |
2153 | * Since: 1.4 |
2154 | */ |
2155 | static inline graphene_simd4f_t |
2156 | graphene_simd4f_min_val (const graphene_simd4f_t v) |
2157 | { |
2158 | graphene_simd4f_t s = v; |
2159 | |
2160 | s = graphene_simd4f_min (s, graphene_simd4f_shuffle_wxyz (s)); |
2161 | s = graphene_simd4f_min (s, graphene_simd4f_shuffle_zwxy (s)); |
2162 | |
2163 | return s; |
2164 | } |
2165 | |
2166 | /** |
2167 | * graphene_simd4f_max_val: |
2168 | * @v: a #graphene_simd4f_t |
2169 | * |
2170 | * Computes the maximum value of all the channels in the given vector. |
2171 | * |
2172 | * Returns: a vector whose components are all set to the |
2173 | * maximum value in the original vector |
2174 | * |
2175 | * Since: 1.4 |
2176 | */ |
2177 | static inline graphene_simd4f_t |
2178 | graphene_simd4f_max_val (const graphene_simd4f_t v) |
2179 | { |
2180 | graphene_simd4f_t s = v; |
2181 | |
2182 | s = graphene_simd4f_max (s, graphene_simd4f_shuffle_wxyz (s)); |
2183 | s = graphene_simd4f_max (s, graphene_simd4f_shuffle_zwxy (s)); |
2184 | |
2185 | return s; |
2186 | } |
2187 | |
2188 | GRAPHENE_END_DECLS |
2189 | |