Warning: This file is not a C or C++ file. It does not have highlighting.
1 | /*===---- arm_acle.h - ARM Non-Neon intrinsics -----------------------------=== |
---|---|
2 | * |
3 | * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | * See https://llvm.org/LICENSE.txt for license information. |
5 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | * |
7 | * The Arm C Language Extensions specifications can be found in the following |
8 | * link: https://github.com/ARM-software/acle/releases |
9 | * |
10 | * The ACLE section numbers are subject to change. When consulting the |
11 | * specifications, it is recommended to search using section titles if |
12 | * the section numbers look outdated. |
13 | * |
14 | *===-----------------------------------------------------------------------=== |
15 | */ |
16 | |
17 | #ifndef __ARM_ACLE_H |
18 | #define __ARM_ACLE_H |
19 | |
20 | #ifndef __ARM_ACLE |
21 | #error "ACLE intrinsics support not enabled." |
22 | #endif |
23 | |
24 | #include <stdint.h> |
25 | |
26 | #if defined(__cplusplus) |
27 | extern "C" { |
28 | #endif |
29 | |
/* 7 SYNCHRONIZATION, BARRIER AND HINT INTRINSICS */
/* 7.3 Memory barriers */
/* Data Memory Barrier, Data Synchronization Barrier and Instruction
 * Synchronization Barrier. The scope argument i is forwarded unchanged to
 * the corresponding barrier builtin. Each macro is only defined when the
 * compiler does not already provide the intrinsic as a builtin. */
#if !__has_builtin(__dmb)
#define __dmb(i) __builtin_arm_dmb(i)
#endif
#if !__has_builtin(__dsb)
#define __dsb(i) __builtin_arm_dsb(i)
#endif
#if !__has_builtin(__isb)
#define __isb(i) __builtin_arm_isb(i)
#endif
41 | |
/* 7.4 Hints */
/* Each hint intrinsic is a thin wrapper over the matching hint-instruction
 * builtin; it is only defined here when the compiler does not provide the
 * intrinsic directly as a builtin. */

#if !__has_builtin(__wfi)
/* Wait For Interrupt. */
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfi(void) {
  __builtin_arm_wfi();
}
#endif

#if !__has_builtin(__wfe)
/* Wait For Event. */
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __wfe(void) {
  __builtin_arm_wfe();
}
#endif

#if !__has_builtin(__sev)
/* Send Event. */
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sev(void) {
  __builtin_arm_sev();
}
#endif

#if !__has_builtin(__sevl)
/* Send Event Local. */
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __sevl(void) {
  __builtin_arm_sevl();
}
#endif

#if !__has_builtin(__yield)
/* Yield hint, typically used in spin-wait loops. */
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __yield(void) {
  __builtin_arm_yield();
}
#endif

/* Debug hint; AArch32 only. The argument t is forwarded to the builtin. */
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __dbg(t) __builtin_arm_dbg(t)
#endif
77 | |
/* 7.5 Swap */
/* Atomically stores __x to *__p and returns the value previously held
 * there, using a load-exclusive / store-exclusive retry loop (the legacy
 * SWP instruction is deprecated). __builtin_arm_strex returns nonzero when
 * the exclusive monitor was lost, in which case the loop retries. */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__swp(uint32_t __x, volatile uint32_t *__p) {
  uint32_t v;
  do
    v = __builtin_arm_ldrex(__p);
  while (__builtin_arm_strex(__x, __p));
  return v;
}
87 | |
/* 7.6 Memory prefetch intrinsics */
/* 7.6.1 Data prefetch */
/* __pld(addr) is a plain data prefetch; __pldx additionally takes the
 * access kind, target cache level and retention policy. */
#define __pld(addr) __pldx(0, 0, 0, addr)

#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
/* AArch32: the prefetch builtin has no cache-level / retention-policy
 * parameters, so those arguments are accepted and ignored here. */
#define __pldx(access_kind, cache_level, retention_policy, addr) \
  __builtin_arm_prefetch(addr, access_kind, 1)
#else
#define __pldx(access_kind, cache_level, retention_policy, addr) \
  __builtin_arm_prefetch(addr, access_kind, cache_level, retention_policy, 1)
#endif

/* 7.6.2 Instruction prefetch (last builtin argument: 1 = data above,
 * 0 = instruction here). */
#define __pli(addr) __plix(0, 0, addr)

#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
#define __plix(cache_level, retention_policy, addr) \
  __builtin_arm_prefetch(addr, 0, 0)
#else
#define __plix(cache_level, retention_policy, addr) \
  __builtin_arm_prefetch(addr, 0, cache_level, retention_policy, 0)
#endif
110 | |
/* 7.7 NOP */
/* Not defined for MSVC targeting AArch64 — presumably MSVC supplies its
 * own __nop there (see <intrin.h>); excluded to avoid a clash. */
#if !defined(_MSC_VER) || !defined(__aarch64__)
static __inline__ void __attribute__((__always_inline__, __nodebug__)) __nop(void) {
  __builtin_arm_nop();
}
#endif
117 | |
/* 8 DATA-PROCESSING INTRINSICS */
/* 8.2 Miscellaneous data-processing intrinsics */
/* ROR */
/* Rotates __x right by __y bits; the rotate count is taken modulo 32, and
 * a zero count returns __x unchanged. */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__ror(uint32_t __x, uint32_t __y) {
  uint32_t __rot = __y & 31u; /* equivalent to __y % 32 for unsigned */
  return __rot ? (__x >> __rot) | (__x << (32u - __rot)) : __x;
}
128 | |
/* Rotates the 64-bit value __x right by __y bits (count taken modulo 64). */
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rorll(uint64_t __x, uint32_t __y) {
  uint32_t __rot = __y & 63u; /* equivalent to __y % 64 for unsigned */
  return __rot ? (__x >> __rot) | (__x << (64u - __rot)) : __x;
}
136 | |
/* Rotates an unsigned long right by __y bits. Written width-generically so
 * the same expression handles both 32- and 64-bit longs; the rotate count
 * is reduced modulo the type width, matching the fixed-width variants. */
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rorl(unsigned long __x, uint32_t __y) {
  const uint32_t __bits = (uint32_t)(8 * sizeof(unsigned long));
  __y %= __bits;
  if (__y == 0)
    return __x;
  return (__x >> __y) | (__x << (__bits - __y));
}
145 | |
146 | |
/* CLZ */
/* Count leading zero bits. The *l variant dispatches on the width of
 * unsigned long (ILP32 vs. LP64 data models). */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clz(uint32_t __t) {
  return __builtin_arm_clz(__t);
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clzl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __builtin_arm_clz(__t);
#else
  return __builtin_arm_clz64(__t);
#endif
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clzll(uint64_t __t) {
  return __builtin_arm_clz64(__t);
}
166 | |
/* CLS */
/* Count leading sign bits. Same width-dispatch pattern as CLZ above. */
static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__cls(uint32_t __t) {
  return __builtin_arm_cls(__t);
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clsl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __builtin_arm_cls(__t);
#else
  return __builtin_arm_cls64(__t);
#endif
}

static __inline__ unsigned int __attribute__((__always_inline__, __nodebug__))
__clsll(uint64_t __t) {
  return __builtin_arm_cls64(__t);
}
186 | |
/* REV */
/* Reverses the byte order of a 32-bit word. */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rev(uint32_t __t) {
  return (__t << 24) | ((__t & 0xFF00u) << 8) |
         ((__t >> 8) & 0xFF00u) | (__t >> 24);
}
192 | |
/* Reverses the byte order of an unsigned long (32- or 64-bit). */
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__revl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  uint32_t __swapped = __builtin_bswap32((uint32_t)__t);
#else
  uint64_t __swapped = __builtin_bswap64((uint64_t)__t);
#endif
  return (unsigned long)__swapped;
}
201 | |
/* Reverses the byte order of a 64-bit value, composed from two 32-bit
 * byte swaps with the halves exchanged. */
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__revll(uint64_t __t) {
  uint32_t __lo = (uint32_t)__t;
  uint32_t __hi = (uint32_t)(__t >> 32);
  return ((uint64_t)__builtin_bswap32(__lo) << 32) | __builtin_bswap32(__hi);
}
206 | |
/* REV16 */
/* Swaps the two bytes within each 16-bit halfword of __t, expressed
 * directly with masks instead of via __rev + __ror. */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rev16(uint32_t __t) {
  return ((__t & 0xFF00FF00u) >> 8) | ((__t & 0x00FF00FFu) << 8);
}
212 | |
/* Swaps the bytes within each of the four 16-bit halfwords of a 64-bit
 * value, expressed directly with masks instead of via two __rev16 calls. */
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rev16ll(uint64_t __t) {
  return ((__t & 0xFF00FF00FF00FF00ULL) >> 8) |
         ((__t & 0x00FF00FF00FF00FFULL) << 8);
}
217 | |
/* Swaps the bytes within each 16-bit halfword of an unsigned long.
 * Width-agnostic: the 64-bit mask constant truncates to 0x00FF00FF when
 * unsigned long is 32 bits, so no preprocessor dispatch is needed. */
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rev16l(unsigned long __t) {
  const unsigned long __lo = (unsigned long)0x00FF00FF00FF00FFULL;
  return ((__t & ~__lo) >> 8) | ((__t & __lo) << 8);
}
226 | |
/* REVSH */
/* Byte-swaps a 16-bit value and returns the result sign-extended. */
static __inline__ int16_t __attribute__((__always_inline__, __nodebug__))
__revsh(int16_t __t) {
  uint16_t __u = (uint16_t)__t;
  return (int16_t)(uint16_t)((__u >> 8) | (__u << 8));
}
232 | |
/* RBIT */
/* Bit-order reversal. */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__rbit(uint32_t __t) {
  return __builtin_arm_rbit(__t);
}

static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__))
__rbitll(uint64_t __t) {
#if defined(__ARM_32BIT_STATE) && __ARM_32BIT_STATE
  /* No 64-bit RBIT builtin on AArch32: reverse each 32-bit half and swap
   * the halves. */
  return (((uint64_t)__builtin_arm_rbit(__t)) << 32) |
         __builtin_arm_rbit(__t >> 32);
#else
  return __builtin_arm_rbit64(__t);
#endif
}

/* Dispatches to the 32- or 64-bit reversal depending on sizeof(long). */
static __inline__ unsigned long __attribute__((__always_inline__, __nodebug__))
__rbitl(unsigned long __t) {
#if __SIZEOF_LONG__ == 4
  return __rbit(__t);
#else
  return __rbitll(__t);
#endif
}
257 | |
/* 8.3 16-bit multiplications */
/* Signed halfword multiplies; only available with the DSP extension.
 * Each intrinsic forwards directly to the like-named builtin (the b/t
 * suffixes select which halfword of each operand participates). */
#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulbb(int32_t __a, int32_t __b) {
  return __builtin_arm_smulbb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulbt(int32_t __a, int32_t __b) {
  return __builtin_arm_smulbt(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smultb(int32_t __a, int32_t __b) {
  return __builtin_arm_smultb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smultt(int32_t __a, int32_t __b) {
  return __builtin_arm_smultt(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulwb(int32_t __a, int32_t __b) {
  return __builtin_arm_smulwb(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__,__nodebug__))
__smulwt(int32_t __a, int32_t __b) {
  return __builtin_arm_smulwt(__a, __b);
}
#endif
285 | |
/*
 * 8.4 Saturating intrinsics
 *
 * FIXME: Change guard to their corresponding __ARM_FEATURE flag when Q flag
 * intrinsics are implemented and the flag is enabled.
 */
/* 8.4.1 Width-specified saturation intrinsics */
/* Saturate x to a signed (__ssat) or unsigned (__usat) y-bit range; both
 * arguments are forwarded unchanged to the builtins. */
#if defined(__ARM_FEATURE_SAT) && __ARM_FEATURE_SAT
#define __ssat(x, y) __builtin_arm_ssat(x, y)
#define __usat(x, y) __builtin_arm_usat(x, y)
#endif
297 | |
/* 8.4.2 Saturating addition and subtraction intrinsics */
#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
/* Saturating 32-bit add. */
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qadd(int32_t __t, int32_t __v) {
  return __builtin_arm_qadd(__t, __v);
}

/* Saturating 32-bit subtract. */
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qsub(int32_t __t, int32_t __v) {
  return __builtin_arm_qsub(__t, __v);
}

/* Saturating doubling, implemented as a saturating add of __t to itself. */
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__qdbl(int32_t __t) {
  return __builtin_arm_qadd(__t, __t);
}
#endif
315 | |
/* 8.4.3 Accumulating multiplications */
/* Signed halfword multiply-accumulate (__c is the accumulator); each
 * forwards to the like-named DSP builtin. */
#if defined(__ARM_FEATURE_DSP) && __ARM_FEATURE_DSP
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlabb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlabb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlabt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlabt(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlatb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlatb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlatt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlatt(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlawb(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlawb(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlawt(int32_t __a, int32_t __b, int32_t __c) {
  return __builtin_arm_smlawt(__a, __b, __c);
}
#endif
343 | |
344 | |
/* 8.5.4 Parallel 16-bit saturation */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
#define __ssat16(x, y) __builtin_arm_ssat16(x, y)
#define __usat16(x, y) __builtin_arm_usat16(x, y)
#endif

/* 8.5.5 Packing and unpacking */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
/* SIMD32 lane types: four 8-bit or two 16-bit lanes packed in one 32-bit
 * register-sized integer. */
typedef int32_t int8x4_t;
typedef int32_t int16x2_t;
typedef uint32_t uint8x4_t;
typedef uint32_t uint16x2_t;

/* Sign (sxt) / zero (uxt) extension of packed 8-bit lanes; the *ab16
 * forms also take an accumulator __a. Each forwards to its builtin. */
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sxtab16(int16x2_t __a, int8x4_t __b) {
  return __builtin_arm_sxtab16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sxtb16(int8x4_t __a) {
  return __builtin_arm_sxtb16(__a);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__uxtab16(int16x2_t __a, int8x4_t __b) {
  return __builtin_arm_uxtab16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__uxtb16(int8x4_t __a) {
  return __builtin_arm_uxtb16(__a);
}
#endif
375 | |
/* 8.5.6 Parallel selection */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
/* Byte-wise select between __a and __b (SEL instruction semantics). */
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__sel(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_sel(__a, __b);
}
#endif
383 | |
/* 8.5.7 Parallel 8-bit addition and subtraction */
/* Lane-wise operations on four packed 8-bit values. Naming: a leading q
 * marks the saturating forms, sh and uh the signed and unsigned halving
 * forms. Each intrinsic forwards directly to the like-named builtin. */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__qadd8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_qadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__qsub8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_qsub8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__sadd8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_sadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__shadd8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_shadd8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__shsub8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_shsub8(__a, __b);
}
static __inline__ int8x4_t __attribute__((__always_inline__, __nodebug__))
__ssub8(int8x4_t __a, int8x4_t __b) {
  return __builtin_arm_ssub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uadd8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uhadd8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uhadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uhsub8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uhsub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uqadd8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uqadd8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__uqsub8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_uqsub8(__a, __b);
}
static __inline__ uint8x4_t __attribute__((__always_inline__, __nodebug__))
__usub8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_usub8(__a, __b);
}
#endif
435 | |
/* 8.5.8 Sum of 8-bit absolute differences */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
/* Sum of absolute differences of the four byte lanes. */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__usad8(uint8x4_t __a, uint8x4_t __b) {
  return __builtin_arm_usad8(__a, __b);
}
/* As __usad8, plus accumulator __c. */
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__))
__usada8(uint8x4_t __a, uint8x4_t __b, uint32_t __c) {
  return __builtin_arm_usada8(__a, __b, __c);
}
#endif
447 | |
/* 8.5.9 Parallel 16-bit addition and subtraction */
/* Lane-wise operations on two packed 16-bit values. Naming follows the
 * 8-bit set above (q = saturating, sh and uh = halving); the asx and sax
 * forms combine an add in one lane with a subtract in the other. Each
 * intrinsic forwards directly to the like-named builtin. */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qadd16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qasx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qsax(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qsax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__qsub16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_qsub16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sadd16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_sadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__sasx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_sasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shadd16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shadd16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shasx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shasx(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shsax(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shsax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__shsub16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_shsub16(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__ssax(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_ssax(__a, __b);
}
static __inline__ int16x2_t __attribute__((__always_inline__, __nodebug__))
__ssub16(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_ssub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uadd16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uasx(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhadd16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhasx(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhsax(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhsax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uhsub16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uhsub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqadd16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqadd16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqasx(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqasx(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqsax(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqsax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__uqsub16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_uqsub16(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__usax(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_usax(__a, __b);
}
static __inline__ uint16x2_t __attribute__((__always_inline__, __nodebug__))
__usub16(uint16x2_t __a, uint16x2_t __b) {
  return __builtin_arm_usub16(__a, __b);
}
#endif
547 | |
/* 8.5.10 Parallel 16-bit multiplications */
/* Dual 16-bit multiplies with 32-bit (__c int32) or 64-bit (__c int64)
 * accumulation; the ld forms take the long accumulator. Each intrinsic
 * forwards directly to the like-named builtin. */
#if defined(__ARM_FEATURE_SIMD32) && __ARM_FEATURE_SIMD32
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlad(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smlad(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smladx(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smladx(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlald(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlald(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlaldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlaldx(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlsd(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smlsd(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smlsdx(int16x2_t __a, int16x2_t __b, int32_t __c) {
  return __builtin_arm_smlsdx(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlsld(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlsld(__a, __b, __c);
}
static __inline__ int64_t __attribute__((__always_inline__, __nodebug__))
__smlsldx(int16x2_t __a, int16x2_t __b, int64_t __c) {
  return __builtin_arm_smlsldx(__a, __b, __c);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smuad(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smuad(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smuadx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smuadx(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smusd(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smusd(__a, __b);
}
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__))
__smusdx(int16x2_t __a, int16x2_t __b) {
  return __builtin_arm_smusdx(__a, __b);
}
#endif
599 | |
/* 8.6 Floating-point data-processing intrinsics */
/* Round to the nearest integral value, ties to even (maps to the
 * roundeven builtins); AArch64 with directed-rounding support only. */
#if (defined(__ARM_FEATURE_DIRECTED_ROUNDING) && \
     (__ARM_FEATURE_DIRECTED_ROUNDING)) && \
    (defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE)
static __inline__ double __attribute__((__always_inline__, __nodebug__))
__rintn(double __a) {
  return __builtin_roundeven(__a);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__))
__rintnf(float __a) {
  return __builtin_roundevenf(__a);
}
#endif
614 | |
/* 8.8 CRC32 intrinsics */
/* CRC accumulation: __a is the current CRC, __b the new data; the suffix
 * (b/h/w/d) gives the data width and the c forms are the CRC32C variants.
 * Each forwards to its builtin; all are gated on the crc target feature. */
#if (defined(__ARM_FEATURE_CRC32) && __ARM_FEATURE_CRC32) || \
    (defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE)
static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32b(uint32_t __a, uint8_t __b) {
  return __builtin_arm_crc32b(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32h(uint32_t __a, uint16_t __b) {
  return __builtin_arm_crc32h(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32w(uint32_t __a, uint32_t __b) {
  return __builtin_arm_crc32w(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32d(uint32_t __a, uint64_t __b) {
  return __builtin_arm_crc32d(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cb(uint32_t __a, uint8_t __b) {
  return __builtin_arm_crc32cb(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32ch(uint32_t __a, uint16_t __b) {
  return __builtin_arm_crc32ch(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cw(uint32_t __a, uint32_t __b) {
  return __builtin_arm_crc32cw(__a, __b);
}

static __inline__ uint32_t __attribute__((__always_inline__, __nodebug__, target("crc")))
__crc32cd(uint32_t __a, uint64_t __b) {
  return __builtin_arm_crc32cd(__a, __b);
}
#endif
658 | |
/* 8.6 Floating-point data-processing intrinsics */
/* Armv8.3-A Javascript conversion intrinsic */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
/* JavaScript-style double-to-int32 conversion (forwards to the jcvt
 * builtin); requires the v8.3a target feature. */
static __inline__ int32_t __attribute__((__always_inline__, __nodebug__, target("v8.3a")))
__jcvt(double __a) {
  return __builtin_arm_jcvt(__a);
}
#endif
667 | |
/* Armv8.5-A FP rounding intrinsics */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
/* FRINT32/FRINT64 family: round to an integral value representable in
 * 32 or 64 bits. The z and x suffixes select the rounding variant as
 * defined by ACLE; each intrinsic forwards to its like-named builtin and
 * requires the v8.5a target feature. */
static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32zf(float __a) {
  return __builtin_arm_rint32zf(__a);
}

static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32z(double __a) {
  return __builtin_arm_rint32z(__a);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64zf(float __a) {
  return __builtin_arm_rint64zf(__a);
}

static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64z(double __a) {
  return __builtin_arm_rint64z(__a);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32xf(float __a) {
  return __builtin_arm_rint32xf(__a);
}

static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint32x(double __a) {
  return __builtin_arm_rint32x(__a);
}

static __inline__ float __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64xf(float __a) {
  return __builtin_arm_rint64xf(__a);
}

static __inline__ double __attribute__((__always_inline__, __nodebug__, target("v8.5a")))
__rint64x(double __a) {
  return __builtin_arm_rint64x(__a);
}
#endif
710 | |
/* 8.9 Armv8.7-A load/store 64-byte intrinsics */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
/* 64-byte (8 x 64-bit) value moved as a unit by the LS64 intrinsics. */
typedef struct {
  uint64_t val[8];
} data512_t;

/* Loads 64 bytes from __addr into a data512_t. */
static __inline__ data512_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_ld64b(const void *__addr) {
  data512_t __value;
  __builtin_arm_ld64b(__addr, __value.val);
  return __value;
}
/* Stores 64 bytes to __addr. */
static __inline__ void __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64b(void *__addr, data512_t __value) {
  __builtin_arm_st64b(__addr, __value.val);
}
/* Store variants that also return a status value from the builtin. */
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64bv(void *__addr, data512_t __value) {
  return __builtin_arm_st64bv(__addr, __value.val);
}
static __inline__ uint64_t __attribute__((__always_inline__, __nodebug__, target("ls64")))
__arm_st64bv0(void *__addr, data512_t __value) {
  return __builtin_arm_st64bv0(__addr, __value.val);
}
#endif
736 | |
/* 11.1 Special register intrinsics */
/* rsr* read and wsr* write a system register named by sysreg, at 32-bit
 * (default), 64-bit, 128-bit or pointer width. The f/f64 forms move
 * float/double values by reinterpreting their bits via __builtin_bit_cast
 * over the integer forms. */
#define __arm_rsr(sysreg) __builtin_arm_rsr(sysreg)
#define __arm_rsr64(sysreg) __builtin_arm_rsr64(sysreg)
#define __arm_rsr128(sysreg) __builtin_arm_rsr128(sysreg)
#define __arm_rsrp(sysreg) __builtin_arm_rsrp(sysreg)
#define __arm_rsrf(sysreg) __builtin_bit_cast(float, __arm_rsr(sysreg))
#define __arm_rsrf64(sysreg) __builtin_bit_cast(double, __arm_rsr64(sysreg))
#define __arm_wsr(sysreg, v) __builtin_arm_wsr(sysreg, v)
#define __arm_wsr64(sysreg, v) __builtin_arm_wsr64(sysreg, v)
#define __arm_wsr128(sysreg, v) __builtin_arm_wsr128(sysreg, v)
#define __arm_wsrp(sysreg, v) __builtin_arm_wsrp(sysreg, v)
#define __arm_wsrf(sysreg, v) __arm_wsr(sysreg, __builtin_bit_cast(uint32_t, v))
#define __arm_wsrf64(sysreg, v) __arm_wsr64(sysreg, __builtin_bit_cast(uint64_t, v))
750 | |
/* 10.3 Memory Tagging Extensions (MTE) Intrinsics */
/* Thin macro wrappers over the MTE builtins (irg/addg/gmi/ldg/stg/subp);
 * AArch64 only. */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
#define __arm_mte_create_random_tag(__ptr, __mask) __builtin_arm_irg(__ptr, __mask)
#define __arm_mte_increment_tag(__ptr, __tag_offset) __builtin_arm_addg(__ptr, __tag_offset)
#define __arm_mte_exclude_tag(__ptr, __excluded) __builtin_arm_gmi(__ptr, __excluded)
#define __arm_mte_get_tag(__ptr) __builtin_arm_ldg(__ptr)
#define __arm_mte_set_tag(__ptr) __builtin_arm_stg(__ptr)
#define __arm_mte_ptrdiff(__ptra, __ptrb) __builtin_arm_subp(__ptra, __ptrb)

/* 18 Memory Operations Intrinsics */
/* Tag-setting memset over a tagged address range. */
#define __arm_mops_memset_tag(__tagged_address, __value, __size) \
  __builtin_arm_mops_memset_tag(__tagged_address, __value, __size)
#endif
764 | |
/* 11.3 Coprocessor Intrinsics */
/* Coprocessor data-processing, load/store and register-move macros.
 * __ARM_FEATURE_COPROC is a bit mask: bit 0 = base CDP/LDC/STC/MCR/MRC,
 * bit 1 = the *2 forms, bit 2 = MCRR/MRRC, bit 3 = MCRR2/MRRC2.
 * Availability of individual forms further depends on the architecture
 * version, hence the nested __ARM_ARCH checks. */
#if defined(__ARM_FEATURE_COPROC)

#if (__ARM_FEATURE_COPROC & 0x1)

#if (__ARM_ARCH < 8)
#define __arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2) \
  __builtin_arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2)
#endif /* __ARM_ARCH < 8 */

#define __arm_ldc(coproc, CRd, p) __builtin_arm_ldc(coproc, CRd, p)
#define __arm_stc(coproc, CRd, p) __builtin_arm_stc(coproc, CRd, p)

#define __arm_mcr(coproc, opc1, value, CRn, CRm, opc2) \
  __builtin_arm_mcr(coproc, opc1, value, CRn, CRm, opc2)
#define __arm_mrc(coproc, opc1, CRn, CRm, opc2) \
  __builtin_arm_mrc(coproc, opc1, CRn, CRm, opc2)

#if (__ARM_ARCH != 4) && (__ARM_ARCH < 8)
#define __arm_ldcl(coproc, CRd, p) __builtin_arm_ldcl(coproc, CRd, p)
#define __arm_stcl(coproc, CRd, p) __builtin_arm_stcl(coproc, CRd, p)
#endif /* (__ARM_ARCH != 4) && (__ARM_ARCH < 8) */

/* M-profile v8(.1)-M mainline re-enables CDP and the long load/store
 * forms excluded by the __ARM_ARCH < 8 checks above. */
#if (__ARM_ARCH_8M_MAIN__) || (__ARM_ARCH_8_1M_MAIN__)
#define __arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2) \
  __builtin_arm_cdp(coproc, opc1, CRd, CRn, CRm, opc2)
#define __arm_ldcl(coproc, CRd, p) __builtin_arm_ldcl(coproc, CRd, p)
#define __arm_stcl(coproc, CRd, p) __builtin_arm_stcl(coproc, CRd, p)
#endif /* __ARM_ARCH_8M_MAIN__ || __ARM_ARCH_8_1M_MAIN__ */

#endif /* __ARM_FEATURE_COPROC & 0x1 */

#if (__ARM_FEATURE_COPROC & 0x2)
#define __arm_cdp2(coproc, opc1, CRd, CRn, CRm, opc2) \
  __builtin_arm_cdp2(coproc, opc1, CRd, CRn, CRm, opc2)
#define __arm_ldc2(coproc, CRd, p) __builtin_arm_ldc2(coproc, CRd, p)
#define __arm_stc2(coproc, CRd, p) __builtin_arm_stc2(coproc, CRd, p)
#define __arm_ldc2l(coproc, CRd, p) __builtin_arm_ldc2l(coproc, CRd, p)
#define __arm_stc2l(coproc, CRd, p) __builtin_arm_stc2l(coproc, CRd, p)
#define __arm_mcr2(coproc, opc1, value, CRn, CRm, opc2) \
  __builtin_arm_mcr2(coproc, opc1, value, CRn, CRm, opc2)
#define __arm_mrc2(coproc, opc1, CRn, CRm, opc2) \
  __builtin_arm_mrc2(coproc, opc1, CRn, CRm, opc2)
#endif

#if (__ARM_FEATURE_COPROC & 0x4)
#define __arm_mcrr(coproc, opc1, value, CRm) \
  __builtin_arm_mcrr(coproc, opc1, value, CRm)
#define __arm_mrrc(coproc, opc1, CRm) __builtin_arm_mrrc(coproc, opc1, CRm)
#endif

#if (__ARM_FEATURE_COPROC & 0x8)
#define __arm_mcrr2(coproc, opc1, value, CRm) \
  __builtin_arm_mcrr2(coproc, opc1, value, CRm)
#define __arm_mrrc2(coproc, opc1, CRm) __builtin_arm_mrrc2(coproc, opc1, CRm)
#endif

#endif // __ARM_FEATURE_COPROC
823 | |
824 | /* 17 Transactional Memory Extension (TME) Intrinsics */ |
825 | #if defined(__ARM_FEATURE_TME) && __ARM_FEATURE_TME |
826 | |
827 | #define _TMFAILURE_REASON 0x00007fffu |
828 | #define _TMFAILURE_RTRY 0x00008000u |
829 | #define _TMFAILURE_CNCL 0x00010000u |
830 | #define _TMFAILURE_MEM 0x00020000u |
831 | #define _TMFAILURE_IMP 0x00040000u |
832 | #define _TMFAILURE_ERR 0x00080000u |
833 | #define _TMFAILURE_SIZE 0x00100000u |
834 | #define _TMFAILURE_NEST 0x00200000u |
835 | #define _TMFAILURE_DBG 0x00400000u |
836 | #define _TMFAILURE_INT 0x00800000u |
837 | #define _TMFAILURE_TRIVIAL 0x01000000u |
838 | |
839 | #define __tstart() __builtin_arm_tstart() |
840 | #define __tcommit() __builtin_arm_tcommit() |
841 | #define __tcancel(__arg) __builtin_arm_tcancel(__arg) |
842 | #define __ttest() __builtin_arm_ttest() |
843 | |
844 | #endif /* __ARM_FEATURE_TME */ |
845 | |
/* 8.7 Armv8.5-A Random number generation intrinsics */
#if defined(__ARM_64BIT_STATE) && __ARM_64BIT_STATE
/* Writes a random value to *__p and returns an int status from the
 * builtin; __rndrrs uses the reseeding variant. Requires the rand target
 * feature. */
static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand")))
__rndr(uint64_t *__p) {
  return __builtin_arm_rndr(__p);
}
static __inline__ int __attribute__((__always_inline__, __nodebug__, target("rand")))
__rndrrs(uint64_t *__p) {
  return __builtin_arm_rndrrs(__p);
}
#endif
857 | |
858 | #if defined(__cplusplus) |
859 | } |
860 | #endif |
861 | |
862 | #endif /* __ARM_ACLE_H */ |
863 |
Warning: This file is not a C or C++ file. It does not have highlighting.