Warning: This file is not a C or C++ file. It does not have highlighting.
1 | /*===---- __clang_hip_math.h - Device-side HIP math support ----------------=== |
---|---|
2 | * |
3 | * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. |
4 | * See https://llvm.org/LICENSE.txt for license information. |
5 | * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception |
6 | * |
7 | *===-----------------------------------------------------------------------=== |
8 | */ |
9 | #ifndef __CLANG_HIP_MATH_H__ |
10 | #define __CLANG_HIP_MATH_H__ |
11 | |
12 | #if !defined(__HIP__) && !defined(__OPENMP_AMDGCN__) |
13 | #error "This file is for HIP and OpenMP AMDGCN device compilation only." |
14 | #endif |
15 | |
16 | #if !defined(__HIPCC_RTC__) |
17 | #include <limits.h> |
18 | #include <stdint.h> |
19 | #ifdef __OPENMP_AMDGCN__ |
20 | #include <omp.h> |
21 | #endif |
22 | #endif // !defined(__HIPCC_RTC__) |
23 | |
24 | #pragma push_macro("__DEVICE__") |
25 | |
26 | #ifdef __OPENMP_AMDGCN__ |
27 | #define __DEVICE__ static inline __attribute__((always_inline, nothrow)) |
28 | #else |
29 | #define __DEVICE__ static __device__ inline __attribute__((always_inline)) |
30 | #endif |
31 | |
32 | // Device library provides fast low precision and slow full-recision |
33 | // implementations for some functions. Which one gets selected depends on |
34 | // __CLANG_GPU_APPROX_TRANSCENDENTALS__ which gets defined by clang if |
35 | // -ffast-math or -fgpu-approx-transcendentals are in effect. |
36 | #pragma push_macro("__FAST_OR_SLOW") |
37 | #if defined(__CLANG_GPU_APPROX_TRANSCENDENTALS__) |
38 | #define __FAST_OR_SLOW(fast, slow) fast |
39 | #else |
40 | #define __FAST_OR_SLOW(fast, slow) slow |
41 | #endif |
42 | |
43 | // A few functions return bool type starting only in C++11. |
44 | #pragma push_macro("__RETURN_TYPE") |
45 | #ifdef __OPENMP_AMDGCN__ |
46 | #define __RETURN_TYPE int |
47 | #else |
48 | #if defined(__cplusplus) |
49 | #define __RETURN_TYPE bool |
50 | #else |
51 | #define __RETURN_TYPE int |
52 | #endif |
53 | #endif // __OPENMP_AMDGCN__ |
54 | |
55 | #if defined (__cplusplus) && __cplusplus < 201103L |
56 | // emulate static_assert on type sizes |
57 | template<bool> |
58 | struct __compare_result{}; |
59 | template<> |
60 | struct __compare_result<true> { |
61 | static const __device__ bool valid; |
62 | }; |
63 | |
64 | __DEVICE__ |
65 | void __suppress_unused_warning(bool b){}; |
66 | template <unsigned int S, unsigned int T> |
67 | __DEVICE__ void __static_assert_equal_size() { |
68 | __suppress_unused_warning(__compare_result<S == T>::valid); |
69 | } |
70 | |
71 | #define __static_assert_type_size_equal(A, B) \ |
72 | __static_assert_equal_size<A,B>() |
73 | |
74 | #else |
75 | #define __static_assert_type_size_equal(A,B) \ |
76 | static_assert((A) == (B), "") |
77 | |
78 | #endif |
79 | |
80 | __DEVICE__ |
81 | uint64_t __make_mantissa_base8(const char *__tagp __attribute__((nonnull))) { |
82 | uint64_t __r = 0; |
83 | while (*__tagp != '\0') { |
84 | char __tmp = *__tagp; |
85 | |
86 | if (__tmp >= '0' && __tmp <= '7') |
87 | __r = (__r * 8u) + __tmp - '0'; |
88 | else |
89 | return 0; |
90 | |
91 | ++__tagp; |
92 | } |
93 | |
94 | return __r; |
95 | } |
96 | |
97 | __DEVICE__ |
98 | uint64_t __make_mantissa_base10(const char *__tagp __attribute__((nonnull))) { |
99 | uint64_t __r = 0; |
100 | while (*__tagp != '\0') { |
101 | char __tmp = *__tagp; |
102 | |
103 | if (__tmp >= '0' && __tmp <= '9') |
104 | __r = (__r * 10u) + __tmp - '0'; |
105 | else |
106 | return 0; |
107 | |
108 | ++__tagp; |
109 | } |
110 | |
111 | return __r; |
112 | } |
113 | |
114 | __DEVICE__ |
115 | uint64_t __make_mantissa_base16(const char *__tagp __attribute__((nonnull))) { |
116 | uint64_t __r = 0; |
117 | while (*__tagp != '\0') { |
118 | char __tmp = *__tagp; |
119 | |
120 | if (__tmp >= '0' && __tmp <= '9') |
121 | __r = (__r * 16u) + __tmp - '0'; |
122 | else if (__tmp >= 'a' && __tmp <= 'f') |
123 | __r = (__r * 16u) + __tmp - 'a' + 10; |
124 | else if (__tmp >= 'A' && __tmp <= 'F') |
125 | __r = (__r * 16u) + __tmp - 'A' + 10; |
126 | else |
127 | return 0; |
128 | |
129 | ++__tagp; |
130 | } |
131 | |
132 | return __r; |
133 | } |
134 | |
135 | __DEVICE__ |
136 | uint64_t __make_mantissa(const char *__tagp __attribute__((nonnull))) { |
137 | if (*__tagp == '0') { |
138 | ++__tagp; |
139 | |
140 | if (*__tagp == 'x' || *__tagp == 'X') |
141 | return __make_mantissa_base16(__tagp); |
142 | else |
143 | return __make_mantissa_base8(__tagp); |
144 | } |
145 | |
146 | return __make_mantissa_base10(__tagp); |
147 | } |
148 | |
149 | // BEGIN FLOAT |
150 | |
151 | // BEGIN INTRINSICS |
152 | |
153 | __DEVICE__ |
154 | float __cosf(float __x) { return __ocml_native_cos_f32(__x); } |
155 | |
156 | __DEVICE__ |
157 | float __exp10f(float __x) { |
158 | const float __log2_10 = 0x1.a934f0p+1f; |
159 | return __builtin_amdgcn_exp2f(__log2_10 * __x); |
160 | } |
161 | |
162 | __DEVICE__ |
163 | float __expf(float __x) { |
164 | const float __log2_e = 0x1.715476p+0; |
165 | return __builtin_amdgcn_exp2f(__log2_e * __x); |
166 | } |
167 | |
168 | #if defined OCML_BASIC_ROUNDED_OPERATIONS |
169 | __DEVICE__ |
170 | float __fadd_rd(float __x, float __y) { return __ocml_add_rtn_f32(__x, __y); } |
171 | __DEVICE__ |
172 | float __fadd_rn(float __x, float __y) { return __ocml_add_rte_f32(__x, __y); } |
173 | __DEVICE__ |
174 | float __fadd_ru(float __x, float __y) { return __ocml_add_rtp_f32(__x, __y); } |
175 | __DEVICE__ |
176 | float __fadd_rz(float __x, float __y) { return __ocml_add_rtz_f32(__x, __y); } |
177 | #else |
178 | __DEVICE__ |
179 | float __fadd_rn(float __x, float __y) { return __x + __y; } |
180 | #endif |
181 | |
182 | #if defined OCML_BASIC_ROUNDED_OPERATIONS |
183 | __DEVICE__ |
184 | float __fdiv_rd(float __x, float __y) { return __ocml_div_rtn_f32(__x, __y); } |
185 | __DEVICE__ |
186 | float __fdiv_rn(float __x, float __y) { return __ocml_div_rte_f32(__x, __y); } |
187 | __DEVICE__ |
188 | float __fdiv_ru(float __x, float __y) { return __ocml_div_rtp_f32(__x, __y); } |
189 | __DEVICE__ |
190 | float __fdiv_rz(float __x, float __y) { return __ocml_div_rtz_f32(__x, __y); } |
191 | #else |
192 | __DEVICE__ |
193 | float __fdiv_rn(float __x, float __y) { return __x / __y; } |
194 | #endif |
195 | |
196 | __DEVICE__ |
197 | float __fdividef(float __x, float __y) { return __x / __y; } |
198 | |
199 | #if defined OCML_BASIC_ROUNDED_OPERATIONS |
200 | __DEVICE__ |
201 | float __fmaf_rd(float __x, float __y, float __z) { |
202 | return __ocml_fma_rtn_f32(__x, __y, __z); |
203 | } |
204 | __DEVICE__ |
205 | float __fmaf_rn(float __x, float __y, float __z) { |
206 | return __ocml_fma_rte_f32(__x, __y, __z); |
207 | } |
208 | __DEVICE__ |
209 | float __fmaf_ru(float __x, float __y, float __z) { |
210 | return __ocml_fma_rtp_f32(__x, __y, __z); |
211 | } |
212 | __DEVICE__ |
213 | float __fmaf_rz(float __x, float __y, float __z) { |
214 | return __ocml_fma_rtz_f32(__x, __y, __z); |
215 | } |
216 | #else |
217 | __DEVICE__ |
218 | float __fmaf_rn(float __x, float __y, float __z) { |
219 | return __builtin_fmaf(__x, __y, __z); |
220 | } |
221 | #endif |
222 | |
223 | #if defined OCML_BASIC_ROUNDED_OPERATIONS |
224 | __DEVICE__ |
225 | float __fmul_rd(float __x, float __y) { return __ocml_mul_rtn_f32(__x, __y); } |
226 | __DEVICE__ |
227 | float __fmul_rn(float __x, float __y) { return __ocml_mul_rte_f32(__x, __y); } |
228 | __DEVICE__ |
229 | float __fmul_ru(float __x, float __y) { return __ocml_mul_rtp_f32(__x, __y); } |
230 | __DEVICE__ |
231 | float __fmul_rz(float __x, float __y) { return __ocml_mul_rtz_f32(__x, __y); } |
232 | #else |
233 | __DEVICE__ |
234 | float __fmul_rn(float __x, float __y) { return __x * __y; } |
235 | #endif |
236 | |
237 | #if defined OCML_BASIC_ROUNDED_OPERATIONS |
238 | __DEVICE__ |
239 | float __frcp_rd(float __x) { return __ocml_div_rtn_f32(1.0f, __x); } |
240 | __DEVICE__ |
241 | float __frcp_rn(float __x) { return __ocml_div_rte_f32(1.0f, __x); } |
242 | __DEVICE__ |
243 | float __frcp_ru(float __x) { return __ocml_div_rtp_f32(1.0f, __x); } |
244 | __DEVICE__ |
245 | float __frcp_rz(float __x) { return __ocml_div_rtz_f32(1.0f, __x); } |
246 | #else |
247 | __DEVICE__ |
248 | float __frcp_rn(float __x) { return 1.0f / __x; } |
249 | #endif |
250 | |
251 | __DEVICE__ |
252 | float __frsqrt_rn(float __x) { return __builtin_amdgcn_rsqf(__x); } |
253 | |
254 | #if defined OCML_BASIC_ROUNDED_OPERATIONS |
255 | __DEVICE__ |
256 | float __fsqrt_rd(float __x) { return __ocml_sqrt_rtn_f32(__x); } |
257 | __DEVICE__ |
258 | float __fsqrt_rn(float __x) { return __ocml_sqrt_rte_f32(__x); } |
259 | __DEVICE__ |
260 | float __fsqrt_ru(float __x) { return __ocml_sqrt_rtp_f32(__x); } |
261 | __DEVICE__ |
262 | float __fsqrt_rz(float __x) { return __ocml_sqrt_rtz_f32(__x); } |
263 | #else |
264 | __DEVICE__ |
265 | float __fsqrt_rn(float __x) { return __ocml_native_sqrt_f32(__x); } |
266 | #endif |
267 | |
268 | #if defined OCML_BASIC_ROUNDED_OPERATIONS |
269 | __DEVICE__ |
270 | float __fsub_rd(float __x, float __y) { return __ocml_sub_rtn_f32(__x, __y); } |
271 | __DEVICE__ |
272 | float __fsub_rn(float __x, float __y) { return __ocml_sub_rte_f32(__x, __y); } |
273 | __DEVICE__ |
274 | float __fsub_ru(float __x, float __y) { return __ocml_sub_rtp_f32(__x, __y); } |
275 | __DEVICE__ |
276 | float __fsub_rz(float __x, float __y) { return __ocml_sub_rtz_f32(__x, __y); } |
277 | #else |
278 | __DEVICE__ |
279 | float __fsub_rn(float __x, float __y) { return __x - __y; } |
280 | #endif |
281 | |
282 | __DEVICE__ |
283 | float __log10f(float __x) { return __builtin_log10f(__x); } |
284 | |
285 | __DEVICE__ |
286 | float __log2f(float __x) { return __builtin_amdgcn_logf(__x); } |
287 | |
288 | __DEVICE__ |
289 | float __logf(float __x) { return __builtin_logf(__x); } |
290 | |
291 | __DEVICE__ |
292 | float __powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); } |
293 | |
294 | __DEVICE__ |
295 | float __saturatef(float __x) { return (__x < 0) ? 0 : ((__x > 1) ? 1 : __x); } |
296 | |
297 | __DEVICE__ |
298 | void __sincosf(float __x, float *__sinptr, float *__cosptr) { |
299 | *__sinptr = __ocml_native_sin_f32(__x); |
300 | *__cosptr = __ocml_native_cos_f32(__x); |
301 | } |
302 | |
303 | __DEVICE__ |
304 | float __sinf(float __x) { return __ocml_native_sin_f32(__x); } |
305 | |
306 | __DEVICE__ |
307 | float __tanf(float __x) { |
308 | return __sinf(__x) * __builtin_amdgcn_rcpf(__cosf(__x)); |
309 | } |
310 | // END INTRINSICS |
311 | |
312 | #if defined(__cplusplus) |
313 | __DEVICE__ |
314 | int abs(int __x) { |
315 | return __builtin_abs(__x); |
316 | } |
317 | __DEVICE__ |
318 | long labs(long __x) { |
319 | return __builtin_labs(__x); |
320 | } |
321 | __DEVICE__ |
322 | long long llabs(long long __x) { |
323 | return __builtin_llabs(__x); |
324 | } |
325 | #endif |
326 | |
327 | __DEVICE__ |
328 | float acosf(float __x) { return __ocml_acos_f32(__x); } |
329 | |
330 | __DEVICE__ |
331 | float acoshf(float __x) { return __ocml_acosh_f32(__x); } |
332 | |
333 | __DEVICE__ |
334 | float asinf(float __x) { return __ocml_asin_f32(__x); } |
335 | |
336 | __DEVICE__ |
337 | float asinhf(float __x) { return __ocml_asinh_f32(__x); } |
338 | |
339 | __DEVICE__ |
340 | float atan2f(float __x, float __y) { return __ocml_atan2_f32(__x, __y); } |
341 | |
342 | __DEVICE__ |
343 | float atanf(float __x) { return __ocml_atan_f32(__x); } |
344 | |
345 | __DEVICE__ |
346 | float atanhf(float __x) { return __ocml_atanh_f32(__x); } |
347 | |
348 | __DEVICE__ |
349 | float cbrtf(float __x) { return __ocml_cbrt_f32(__x); } |
350 | |
351 | __DEVICE__ |
352 | float ceilf(float __x) { return __builtin_ceilf(__x); } |
353 | |
354 | __DEVICE__ |
355 | float copysignf(float __x, float __y) { return __builtin_copysignf(__x, __y); } |
356 | |
357 | __DEVICE__ |
358 | float cosf(float __x) { return __FAST_OR_SLOW(__cosf, __ocml_cos_f32)(__x); } |
359 | |
360 | __DEVICE__ |
361 | float coshf(float __x) { return __ocml_cosh_f32(__x); } |
362 | |
363 | __DEVICE__ |
364 | float cospif(float __x) { return __ocml_cospi_f32(__x); } |
365 | |
366 | __DEVICE__ |
367 | float cyl_bessel_i0f(float __x) { return __ocml_i0_f32(__x); } |
368 | |
369 | __DEVICE__ |
370 | float cyl_bessel_i1f(float __x) { return __ocml_i1_f32(__x); } |
371 | |
372 | __DEVICE__ |
373 | float erfcf(float __x) { return __ocml_erfc_f32(__x); } |
374 | |
375 | __DEVICE__ |
376 | float erfcinvf(float __x) { return __ocml_erfcinv_f32(__x); } |
377 | |
378 | __DEVICE__ |
379 | float erfcxf(float __x) { return __ocml_erfcx_f32(__x); } |
380 | |
381 | __DEVICE__ |
382 | float erff(float __x) { return __ocml_erf_f32(__x); } |
383 | |
384 | __DEVICE__ |
385 | float erfinvf(float __x) { return __ocml_erfinv_f32(__x); } |
386 | |
387 | __DEVICE__ |
388 | float exp10f(float __x) { return __ocml_exp10_f32(__x); } |
389 | |
390 | __DEVICE__ |
391 | float exp2f(float __x) { return __builtin_exp2f(__x); } |
392 | |
393 | __DEVICE__ |
394 | float expf(float __x) { return __builtin_expf(__x); } |
395 | |
396 | __DEVICE__ |
397 | float expm1f(float __x) { return __ocml_expm1_f32(__x); } |
398 | |
399 | __DEVICE__ |
400 | float fabsf(float __x) { return __builtin_fabsf(__x); } |
401 | |
402 | __DEVICE__ |
403 | float fdimf(float __x, float __y) { return __ocml_fdim_f32(__x, __y); } |
404 | |
405 | __DEVICE__ |
406 | float fdividef(float __x, float __y) { return __x / __y; } |
407 | |
408 | __DEVICE__ |
409 | float floorf(float __x) { return __builtin_floorf(__x); } |
410 | |
411 | __DEVICE__ |
412 | float fmaf(float __x, float __y, float __z) { |
413 | return __builtin_fmaf(__x, __y, __z); |
414 | } |
415 | |
416 | __DEVICE__ |
417 | float fmaxf(float __x, float __y) { return __builtin_fmaxf(__x, __y); } |
418 | |
419 | __DEVICE__ |
420 | float fminf(float __x, float __y) { return __builtin_fminf(__x, __y); } |
421 | |
422 | __DEVICE__ |
423 | float fmodf(float __x, float __y) { return __ocml_fmod_f32(__x, __y); } |
424 | |
425 | __DEVICE__ |
426 | float frexpf(float __x, int *__nptr) { |
427 | return __builtin_frexpf(__x, __nptr); |
428 | } |
429 | |
430 | __DEVICE__ |
431 | float hypotf(float __x, float __y) { return __ocml_hypot_f32(__x, __y); } |
432 | |
433 | __DEVICE__ |
434 | int ilogbf(float __x) { return __ocml_ilogb_f32(__x); } |
435 | |
436 | __DEVICE__ |
437 | __RETURN_TYPE __finitef(float __x) { return __builtin_isfinite(__x); } |
438 | |
439 | __DEVICE__ |
440 | __RETURN_TYPE __isinff(float __x) { return __builtin_isinf(__x); } |
441 | |
442 | __DEVICE__ |
443 | __RETURN_TYPE __isnanf(float __x) { return __builtin_isnan(__x); } |
444 | |
445 | __DEVICE__ |
446 | float j0f(float __x) { return __ocml_j0_f32(__x); } |
447 | |
448 | __DEVICE__ |
449 | float j1f(float __x) { return __ocml_j1_f32(__x); } |
450 | |
451 | __DEVICE__ |
452 | float jnf(int __n, float __x) { // TODO: we could use Ahmes multiplication |
453 | // and the Miller & Brown algorithm |
454 | // for linear recurrences to get O(log n) steps, but it's unclear if |
455 | // it'd be beneficial in this case. |
456 | if (__n == 0) |
457 | return j0f(__x); |
458 | if (__n == 1) |
459 | return j1f(__x); |
460 | |
461 | float __x0 = j0f(__x); |
462 | float __x1 = j1f(__x); |
463 | for (int __i = 1; __i < __n; ++__i) { |
464 | float __x2 = (2 * __i) / __x * __x1 - __x0; |
465 | __x0 = __x1; |
466 | __x1 = __x2; |
467 | } |
468 | |
469 | return __x1; |
470 | } |
471 | |
472 | __DEVICE__ |
473 | float ldexpf(float __x, int __e) { return __builtin_amdgcn_ldexpf(__x, __e); } |
474 | |
475 | __DEVICE__ |
476 | float lgammaf(float __x) { return __ocml_lgamma_f32(__x); } |
477 | |
478 | __DEVICE__ |
479 | long long int llrintf(float __x) { return __builtin_rintf(__x); } |
480 | |
481 | __DEVICE__ |
482 | long long int llroundf(float __x) { return __builtin_roundf(__x); } |
483 | |
484 | __DEVICE__ |
485 | float log10f(float __x) { return __builtin_log10f(__x); } |
486 | |
487 | __DEVICE__ |
488 | float log1pf(float __x) { return __ocml_log1p_f32(__x); } |
489 | |
490 | __DEVICE__ |
491 | float log2f(float __x) { return __FAST_OR_SLOW(__log2f, __ocml_log2_f32)(__x); } |
492 | |
493 | __DEVICE__ |
494 | float logbf(float __x) { return __ocml_logb_f32(__x); } |
495 | |
496 | __DEVICE__ |
497 | float logf(float __x) { return __FAST_OR_SLOW(__logf, __ocml_log_f32)(__x); } |
498 | |
499 | __DEVICE__ |
500 | long int lrintf(float __x) { return __builtin_rintf(__x); } |
501 | |
502 | __DEVICE__ |
503 | long int lroundf(float __x) { return __builtin_roundf(__x); } |
504 | |
505 | __DEVICE__ |
506 | float modff(float __x, float *__iptr) { |
507 | float __tmp; |
508 | #ifdef __OPENMP_AMDGCN__ |
509 | #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) |
510 | #endif |
511 | float __r = |
512 | __ocml_modf_f32(__x, (__attribute__((address_space(5))) float *)&__tmp); |
513 | *__iptr = __tmp; |
514 | return __r; |
515 | } |
516 | |
517 | __DEVICE__ |
518 | float nanf(const char *__tagp __attribute__((nonnull))) { |
519 | union { |
520 | float val; |
521 | struct ieee_float { |
522 | unsigned int mantissa : 22; |
523 | unsigned int quiet : 1; |
524 | unsigned int exponent : 8; |
525 | unsigned int sign : 1; |
526 | } bits; |
527 | } __tmp; |
528 | __static_assert_type_size_equal(sizeof(__tmp.val), sizeof(__tmp.bits)); |
529 | |
530 | __tmp.bits.sign = 0u; |
531 | __tmp.bits.exponent = ~0u; |
532 | __tmp.bits.quiet = 1u; |
533 | __tmp.bits.mantissa = __make_mantissa(__tagp); |
534 | |
535 | return __tmp.val; |
536 | } |
537 | |
538 | __DEVICE__ |
539 | float nearbyintf(float __x) { return __builtin_nearbyintf(__x); } |
540 | |
541 | __DEVICE__ |
542 | float nextafterf(float __x, float __y) { |
543 | return __ocml_nextafter_f32(__x, __y); |
544 | } |
545 | |
546 | __DEVICE__ |
547 | float norm3df(float __x, float __y, float __z) { |
548 | return __ocml_len3_f32(__x, __y, __z); |
549 | } |
550 | |
551 | __DEVICE__ |
552 | float norm4df(float __x, float __y, float __z, float __w) { |
553 | return __ocml_len4_f32(__x, __y, __z, __w); |
554 | } |
555 | |
556 | __DEVICE__ |
557 | float normcdff(float __x) { return __ocml_ncdf_f32(__x); } |
558 | |
559 | __DEVICE__ |
560 | float normcdfinvf(float __x) { return __ocml_ncdfinv_f32(__x); } |
561 | |
562 | __DEVICE__ |
563 | float normf(int __dim, |
564 | const float *__a) { // TODO: placeholder until OCML adds support. |
565 | float __r = 0; |
566 | while (__dim--) { |
567 | __r += __a[0] * __a[0]; |
568 | ++__a; |
569 | } |
570 | |
571 | return __builtin_sqrtf(__r); |
572 | } |
573 | |
574 | __DEVICE__ |
575 | float powf(float __x, float __y) { return __ocml_pow_f32(__x, __y); } |
576 | |
577 | __DEVICE__ |
578 | float powif(float __x, int __y) { return __ocml_pown_f32(__x, __y); } |
579 | |
580 | __DEVICE__ |
581 | float rcbrtf(float __x) { return __ocml_rcbrt_f32(__x); } |
582 | |
583 | __DEVICE__ |
584 | float remainderf(float __x, float __y) { |
585 | return __ocml_remainder_f32(__x, __y); |
586 | } |
587 | |
588 | __DEVICE__ |
589 | float remquof(float __x, float __y, int *__quo) { |
590 | int __tmp; |
591 | #ifdef __OPENMP_AMDGCN__ |
592 | #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) |
593 | #endif |
594 | float __r = __ocml_remquo_f32( |
595 | __x, __y, (__attribute__((address_space(5))) int *)&__tmp); |
596 | *__quo = __tmp; |
597 | |
598 | return __r; |
599 | } |
600 | |
601 | __DEVICE__ |
602 | float rhypotf(float __x, float __y) { return __ocml_rhypot_f32(__x, __y); } |
603 | |
604 | __DEVICE__ |
605 | float rintf(float __x) { return __builtin_rintf(__x); } |
606 | |
607 | __DEVICE__ |
608 | float rnorm3df(float __x, float __y, float __z) { |
609 | return __ocml_rlen3_f32(__x, __y, __z); |
610 | } |
611 | |
612 | __DEVICE__ |
613 | float rnorm4df(float __x, float __y, float __z, float __w) { |
614 | return __ocml_rlen4_f32(__x, __y, __z, __w); |
615 | } |
616 | |
617 | __DEVICE__ |
618 | float rnormf(int __dim, |
619 | const float *__a) { // TODO: placeholder until OCML adds support. |
620 | float __r = 0; |
621 | while (__dim--) { |
622 | __r += __a[0] * __a[0]; |
623 | ++__a; |
624 | } |
625 | |
626 | return __ocml_rsqrt_f32(__r); |
627 | } |
628 | |
629 | __DEVICE__ |
630 | float roundf(float __x) { return __builtin_roundf(__x); } |
631 | |
632 | __DEVICE__ |
633 | float rsqrtf(float __x) { return __ocml_rsqrt_f32(__x); } |
634 | |
635 | __DEVICE__ |
636 | float scalblnf(float __x, long int __n) { |
637 | return (__n < INT_MAX) ? __builtin_amdgcn_ldexpf(__x, __n) |
638 | : __ocml_scalb_f32(__x, __n); |
639 | } |
640 | |
641 | __DEVICE__ |
642 | float scalbnf(float __x, int __n) { return __builtin_amdgcn_ldexpf(__x, __n); } |
643 | |
644 | __DEVICE__ |
645 | __RETURN_TYPE __signbitf(float __x) { return __builtin_signbitf(__x); } |
646 | |
647 | __DEVICE__ |
648 | void sincosf(float __x, float *__sinptr, float *__cosptr) { |
649 | float __tmp; |
650 | #ifdef __OPENMP_AMDGCN__ |
651 | #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) |
652 | #endif |
653 | #ifdef __CLANG_CUDA_APPROX_TRANSCENDENTALS__ |
654 | __sincosf(__x, __sinptr, __cosptr); |
655 | #else |
656 | *__sinptr = |
657 | __ocml_sincos_f32(__x, (__attribute__((address_space(5))) float *)&__tmp); |
658 | *__cosptr = __tmp; |
659 | #endif |
660 | } |
661 | |
662 | __DEVICE__ |
663 | void sincospif(float __x, float *__sinptr, float *__cosptr) { |
664 | float __tmp; |
665 | #ifdef __OPENMP_AMDGCN__ |
666 | #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) |
667 | #endif |
668 | *__sinptr = __ocml_sincospi_f32( |
669 | __x, (__attribute__((address_space(5))) float *)&__tmp); |
670 | *__cosptr = __tmp; |
671 | } |
672 | |
673 | __DEVICE__ |
674 | float sinf(float __x) { return __FAST_OR_SLOW(__sinf, __ocml_sin_f32)(__x); } |
675 | |
676 | __DEVICE__ |
677 | float sinhf(float __x) { return __ocml_sinh_f32(__x); } |
678 | |
679 | __DEVICE__ |
680 | float sinpif(float __x) { return __ocml_sinpi_f32(__x); } |
681 | |
682 | __DEVICE__ |
683 | float sqrtf(float __x) { return __builtin_sqrtf(__x); } |
684 | |
685 | __DEVICE__ |
686 | float tanf(float __x) { return __ocml_tan_f32(__x); } |
687 | |
688 | __DEVICE__ |
689 | float tanhf(float __x) { return __ocml_tanh_f32(__x); } |
690 | |
691 | __DEVICE__ |
692 | float tgammaf(float __x) { return __ocml_tgamma_f32(__x); } |
693 | |
694 | __DEVICE__ |
695 | float truncf(float __x) { return __builtin_truncf(__x); } |
696 | |
697 | __DEVICE__ |
698 | float y0f(float __x) { return __ocml_y0_f32(__x); } |
699 | |
700 | __DEVICE__ |
701 | float y1f(float __x) { return __ocml_y1_f32(__x); } |
702 | |
703 | __DEVICE__ |
704 | float ynf(int __n, float __x) { // TODO: we could use Ahmes multiplication |
705 | // and the Miller & Brown algorithm |
706 | // for linear recurrences to get O(log n) steps, but it's unclear if |
707 | // it'd be beneficial in this case. Placeholder until OCML adds |
708 | // support. |
709 | if (__n == 0) |
710 | return y0f(__x); |
711 | if (__n == 1) |
712 | return y1f(__x); |
713 | |
714 | float __x0 = y0f(__x); |
715 | float __x1 = y1f(__x); |
716 | for (int __i = 1; __i < __n; ++__i) { |
717 | float __x2 = (2 * __i) / __x * __x1 - __x0; |
718 | __x0 = __x1; |
719 | __x1 = __x2; |
720 | } |
721 | |
722 | return __x1; |
723 | } |
724 | |
725 | |
726 | // END FLOAT |
727 | |
728 | // BEGIN DOUBLE |
729 | __DEVICE__ |
730 | double acos(double __x) { return __ocml_acos_f64(__x); } |
731 | |
732 | __DEVICE__ |
733 | double acosh(double __x) { return __ocml_acosh_f64(__x); } |
734 | |
735 | __DEVICE__ |
736 | double asin(double __x) { return __ocml_asin_f64(__x); } |
737 | |
738 | __DEVICE__ |
739 | double asinh(double __x) { return __ocml_asinh_f64(__x); } |
740 | |
741 | __DEVICE__ |
742 | double atan(double __x) { return __ocml_atan_f64(__x); } |
743 | |
744 | __DEVICE__ |
745 | double atan2(double __x, double __y) { return __ocml_atan2_f64(__x, __y); } |
746 | |
747 | __DEVICE__ |
748 | double atanh(double __x) { return __ocml_atanh_f64(__x); } |
749 | |
750 | __DEVICE__ |
751 | double cbrt(double __x) { return __ocml_cbrt_f64(__x); } |
752 | |
753 | __DEVICE__ |
754 | double ceil(double __x) { return __builtin_ceil(__x); } |
755 | |
756 | __DEVICE__ |
757 | double copysign(double __x, double __y) { |
758 | return __builtin_copysign(__x, __y); |
759 | } |
760 | |
761 | __DEVICE__ |
762 | double cos(double __x) { return __ocml_cos_f64(__x); } |
763 | |
764 | __DEVICE__ |
765 | double cosh(double __x) { return __ocml_cosh_f64(__x); } |
766 | |
767 | __DEVICE__ |
768 | double cospi(double __x) { return __ocml_cospi_f64(__x); } |
769 | |
770 | __DEVICE__ |
771 | double cyl_bessel_i0(double __x) { return __ocml_i0_f64(__x); } |
772 | |
773 | __DEVICE__ |
774 | double cyl_bessel_i1(double __x) { return __ocml_i1_f64(__x); } |
775 | |
776 | __DEVICE__ |
777 | double erf(double __x) { return __ocml_erf_f64(__x); } |
778 | |
779 | __DEVICE__ |
780 | double erfc(double __x) { return __ocml_erfc_f64(__x); } |
781 | |
782 | __DEVICE__ |
783 | double erfcinv(double __x) { return __ocml_erfcinv_f64(__x); } |
784 | |
785 | __DEVICE__ |
786 | double erfcx(double __x) { return __ocml_erfcx_f64(__x); } |
787 | |
788 | __DEVICE__ |
789 | double erfinv(double __x) { return __ocml_erfinv_f64(__x); } |
790 | |
791 | __DEVICE__ |
792 | double exp(double __x) { return __ocml_exp_f64(__x); } |
793 | |
794 | __DEVICE__ |
795 | double exp10(double __x) { return __ocml_exp10_f64(__x); } |
796 | |
797 | __DEVICE__ |
798 | double exp2(double __x) { return __ocml_exp2_f64(__x); } |
799 | |
800 | __DEVICE__ |
801 | double expm1(double __x) { return __ocml_expm1_f64(__x); } |
802 | |
803 | __DEVICE__ |
804 | double fabs(double __x) { return __builtin_fabs(__x); } |
805 | |
806 | __DEVICE__ |
807 | double fdim(double __x, double __y) { return __ocml_fdim_f64(__x, __y); } |
808 | |
809 | __DEVICE__ |
810 | double floor(double __x) { return __builtin_floor(__x); } |
811 | |
812 | __DEVICE__ |
813 | double fma(double __x, double __y, double __z) { |
814 | return __builtin_fma(__x, __y, __z); |
815 | } |
816 | |
817 | __DEVICE__ |
818 | double fmax(double __x, double __y) { return __builtin_fmax(__x, __y); } |
819 | |
820 | __DEVICE__ |
821 | double fmin(double __x, double __y) { return __builtin_fmin(__x, __y); } |
822 | |
823 | __DEVICE__ |
824 | double fmod(double __x, double __y) { return __ocml_fmod_f64(__x, __y); } |
825 | |
826 | __DEVICE__ |
827 | double frexp(double __x, int *__nptr) { |
828 | return __builtin_frexp(__x, __nptr); |
829 | } |
830 | |
831 | __DEVICE__ |
832 | double hypot(double __x, double __y) { return __ocml_hypot_f64(__x, __y); } |
833 | |
834 | __DEVICE__ |
835 | int ilogb(double __x) { return __ocml_ilogb_f64(__x); } |
836 | |
837 | __DEVICE__ |
838 | __RETURN_TYPE __finite(double __x) { return __builtin_isfinite(__x); } |
839 | |
840 | __DEVICE__ |
841 | __RETURN_TYPE __isinf(double __x) { return __builtin_isinf(__x); } |
842 | |
843 | __DEVICE__ |
844 | __RETURN_TYPE __isnan(double __x) { return __builtin_isnan(__x); } |
845 | |
846 | __DEVICE__ |
847 | double j0(double __x) { return __ocml_j0_f64(__x); } |
848 | |
849 | __DEVICE__ |
850 | double j1(double __x) { return __ocml_j1_f64(__x); } |
851 | |
852 | __DEVICE__ |
853 | double jn(int __n, double __x) { // TODO: we could use Ahmes multiplication |
854 | // and the Miller & Brown algorithm |
855 | // for linear recurrences to get O(log n) steps, but it's unclear if |
856 | // it'd be beneficial in this case. Placeholder until OCML adds |
857 | // support. |
858 | if (__n == 0) |
859 | return j0(__x); |
860 | if (__n == 1) |
861 | return j1(__x); |
862 | |
863 | double __x0 = j0(__x); |
864 | double __x1 = j1(__x); |
865 | for (int __i = 1; __i < __n; ++__i) { |
866 | double __x2 = (2 * __i) / __x * __x1 - __x0; |
867 | __x0 = __x1; |
868 | __x1 = __x2; |
869 | } |
870 | return __x1; |
871 | } |
872 | |
873 | __DEVICE__ |
874 | double ldexp(double __x, int __e) { return __builtin_amdgcn_ldexp(__x, __e); } |
875 | |
876 | __DEVICE__ |
877 | double lgamma(double __x) { return __ocml_lgamma_f64(__x); } |
878 | |
879 | __DEVICE__ |
880 | long long int llrint(double __x) { return __builtin_rint(__x); } |
881 | |
882 | __DEVICE__ |
883 | long long int llround(double __x) { return __builtin_round(__x); } |
884 | |
885 | __DEVICE__ |
886 | double log(double __x) { return __ocml_log_f64(__x); } |
887 | |
888 | __DEVICE__ |
889 | double log10(double __x) { return __ocml_log10_f64(__x); } |
890 | |
891 | __DEVICE__ |
892 | double log1p(double __x) { return __ocml_log1p_f64(__x); } |
893 | |
894 | __DEVICE__ |
895 | double log2(double __x) { return __ocml_log2_f64(__x); } |
896 | |
897 | __DEVICE__ |
898 | double logb(double __x) { return __ocml_logb_f64(__x); } |
899 | |
900 | __DEVICE__ |
901 | long int lrint(double __x) { return __builtin_rint(__x); } |
902 | |
903 | __DEVICE__ |
904 | long int lround(double __x) { return __builtin_round(__x); } |
905 | |
906 | __DEVICE__ |
907 | double modf(double __x, double *__iptr) { |
908 | double __tmp; |
909 | #ifdef __OPENMP_AMDGCN__ |
910 | #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) |
911 | #endif |
912 | double __r = |
913 | __ocml_modf_f64(__x, (__attribute__((address_space(5))) double *)&__tmp); |
914 | *__iptr = __tmp; |
915 | |
916 | return __r; |
917 | } |
918 | |
919 | __DEVICE__ |
920 | double nan(const char *__tagp) { |
921 | #if !_WIN32 |
922 | union { |
923 | double val; |
924 | struct ieee_double { |
925 | uint64_t mantissa : 51; |
926 | uint32_t quiet : 1; |
927 | uint32_t exponent : 11; |
928 | uint32_t sign : 1; |
929 | } bits; |
930 | } __tmp; |
931 | __static_assert_type_size_equal(sizeof(__tmp.val), sizeof(__tmp.bits)); |
932 | |
933 | __tmp.bits.sign = 0u; |
934 | __tmp.bits.exponent = ~0u; |
935 | __tmp.bits.quiet = 1u; |
936 | __tmp.bits.mantissa = __make_mantissa(__tagp); |
937 | |
938 | return __tmp.val; |
939 | #else |
940 | __static_assert_type_size_equal(sizeof(uint64_t), sizeof(double)); |
941 | uint64_t __val = __make_mantissa(__tagp); |
942 | __val |= 0xFFF << 51; |
943 | return *reinterpret_cast<double *>(&__val); |
944 | #endif |
945 | } |
946 | |
947 | __DEVICE__ |
948 | double nearbyint(double __x) { return __builtin_nearbyint(__x); } |
949 | |
950 | __DEVICE__ |
951 | double nextafter(double __x, double __y) { |
952 | return __ocml_nextafter_f64(__x, __y); |
953 | } |
954 | |
955 | __DEVICE__ |
956 | double norm(int __dim, |
957 | const double *__a) { // TODO: placeholder until OCML adds support. |
958 | double __r = 0; |
959 | while (__dim--) { |
960 | __r += __a[0] * __a[0]; |
961 | ++__a; |
962 | } |
963 | |
964 | return __builtin_sqrt(__r); |
965 | } |
966 | |
967 | __DEVICE__ |
968 | double norm3d(double __x, double __y, double __z) { |
969 | return __ocml_len3_f64(__x, __y, __z); |
970 | } |
971 | |
972 | __DEVICE__ |
973 | double norm4d(double __x, double __y, double __z, double __w) { |
974 | return __ocml_len4_f64(__x, __y, __z, __w); |
975 | } |
976 | |
977 | __DEVICE__ |
978 | double normcdf(double __x) { return __ocml_ncdf_f64(__x); } |
979 | |
980 | __DEVICE__ |
981 | double normcdfinv(double __x) { return __ocml_ncdfinv_f64(__x); } |
982 | |
983 | __DEVICE__ |
984 | double pow(double __x, double __y) { return __ocml_pow_f64(__x, __y); } |
985 | |
986 | __DEVICE__ |
987 | double powi(double __x, int __y) { return __ocml_pown_f64(__x, __y); } |
988 | |
989 | __DEVICE__ |
990 | double rcbrt(double __x) { return __ocml_rcbrt_f64(__x); } |
991 | |
992 | __DEVICE__ |
993 | double remainder(double __x, double __y) { |
994 | return __ocml_remainder_f64(__x, __y); |
995 | } |
996 | |
997 | __DEVICE__ |
998 | double remquo(double __x, double __y, int *__quo) { |
999 | int __tmp; |
1000 | #ifdef __OPENMP_AMDGCN__ |
1001 | #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) |
1002 | #endif |
1003 | double __r = __ocml_remquo_f64( |
1004 | __x, __y, (__attribute__((address_space(5))) int *)&__tmp); |
1005 | *__quo = __tmp; |
1006 | |
1007 | return __r; |
1008 | } |
1009 | |
1010 | __DEVICE__ |
1011 | double rhypot(double __x, double __y) { return __ocml_rhypot_f64(__x, __y); } |
1012 | |
1013 | __DEVICE__ |
1014 | double rint(double __x) { return __builtin_rint(__x); } |
1015 | |
1016 | __DEVICE__ |
1017 | double rnorm(int __dim, |
1018 | const double *__a) { // TODO: placeholder until OCML adds support. |
1019 | double __r = 0; |
1020 | while (__dim--) { |
1021 | __r += __a[0] * __a[0]; |
1022 | ++__a; |
1023 | } |
1024 | |
1025 | return __ocml_rsqrt_f64(__r); |
1026 | } |
1027 | |
1028 | __DEVICE__ |
1029 | double rnorm3d(double __x, double __y, double __z) { |
1030 | return __ocml_rlen3_f64(__x, __y, __z); |
1031 | } |
1032 | |
1033 | __DEVICE__ |
1034 | double rnorm4d(double __x, double __y, double __z, double __w) { |
1035 | return __ocml_rlen4_f64(__x, __y, __z, __w); |
1036 | } |
1037 | |
1038 | __DEVICE__ |
1039 | double round(double __x) { return __builtin_round(__x); } |
1040 | |
1041 | __DEVICE__ |
1042 | double rsqrt(double __x) { return __ocml_rsqrt_f64(__x); } |
1043 | |
1044 | __DEVICE__ |
1045 | double scalbln(double __x, long int __n) { |
1046 | return (__n < INT_MAX) ? __builtin_amdgcn_ldexp(__x, __n) |
1047 | : __ocml_scalb_f64(__x, __n); |
1048 | } |
1049 | __DEVICE__ |
1050 | double scalbn(double __x, int __n) { return __builtin_amdgcn_ldexp(__x, __n); } |
1051 | |
1052 | __DEVICE__ |
1053 | __RETURN_TYPE __signbit(double __x) { return __builtin_signbit(__x); } |
1054 | |
1055 | __DEVICE__ |
1056 | double sin(double __x) { return __ocml_sin_f64(__x); } |
1057 | |
1058 | __DEVICE__ |
1059 | void sincos(double __x, double *__sinptr, double *__cosptr) { |
1060 | double __tmp; |
1061 | #ifdef __OPENMP_AMDGCN__ |
1062 | #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) |
1063 | #endif |
1064 | *__sinptr = __ocml_sincos_f64( |
1065 | __x, (__attribute__((address_space(5))) double *)&__tmp); |
1066 | *__cosptr = __tmp; |
1067 | } |
1068 | |
1069 | __DEVICE__ |
1070 | void sincospi(double __x, double *__sinptr, double *__cosptr) { |
1071 | double __tmp; |
1072 | #ifdef __OPENMP_AMDGCN__ |
1073 | #pragma omp allocate(__tmp) allocator(omp_thread_mem_alloc) |
1074 | #endif |
1075 | *__sinptr = __ocml_sincospi_f64( |
1076 | __x, (__attribute__((address_space(5))) double *)&__tmp); |
1077 | *__cosptr = __tmp; |
1078 | } |
1079 | |
1080 | __DEVICE__ |
1081 | double sinh(double __x) { return __ocml_sinh_f64(__x); } |
1082 | |
1083 | __DEVICE__ |
1084 | double sinpi(double __x) { return __ocml_sinpi_f64(__x); } |
1085 | |
1086 | __DEVICE__ |
1087 | double sqrt(double __x) { return __builtin_sqrt(__x); } |
1088 | |
1089 | __DEVICE__ |
1090 | double tan(double __x) { return __ocml_tan_f64(__x); } |
1091 | |
1092 | __DEVICE__ |
1093 | double tanh(double __x) { return __ocml_tanh_f64(__x); } |
1094 | |
1095 | __DEVICE__ |
1096 | double tgamma(double __x) { return __ocml_tgamma_f64(__x); } |
1097 | |
1098 | __DEVICE__ |
1099 | double trunc(double __x) { return __builtin_trunc(__x); } |
1100 | |
1101 | __DEVICE__ |
1102 | double y0(double __x) { return __ocml_y0_f64(__x); } |
1103 | |
1104 | __DEVICE__ |
1105 | double y1(double __x) { return __ocml_y1_f64(__x); } |
1106 | |
1107 | __DEVICE__ |
1108 | double yn(int __n, double __x) { // TODO: we could use Ahmes multiplication |
1109 | // and the Miller & Brown algorithm |
1110 | // for linear recurrences to get O(log n) steps, but it's unclear if |
1111 | // it'd be beneficial in this case. Placeholder until OCML adds |
1112 | // support. |
1113 | if (__n == 0) |
1114 | return y0(__x); |
1115 | if (__n == 1) |
1116 | return y1(__x); |
1117 | |
1118 | double __x0 = y0(__x); |
1119 | double __x1 = y1(__x); |
1120 | for (int __i = 1; __i < __n; ++__i) { |
1121 | double __x2 = (2 * __i) / __x * __x1 - __x0; |
1122 | __x0 = __x1; |
1123 | __x1 = __x2; |
1124 | } |
1125 | |
1126 | return __x1; |
1127 | } |
1128 | |
1129 | // BEGIN INTRINSICS |
1130 | #if defined OCML_BASIC_ROUNDED_OPERATIONS |
1131 | __DEVICE__ |
1132 | double __dadd_rd(double __x, double __y) { |
1133 | return __ocml_add_rtn_f64(__x, __y); |
1134 | } |
1135 | __DEVICE__ |
1136 | double __dadd_rn(double __x, double __y) { |
1137 | return __ocml_add_rte_f64(__x, __y); |
1138 | } |
1139 | __DEVICE__ |
1140 | double __dadd_ru(double __x, double __y) { |
1141 | return __ocml_add_rtp_f64(__x, __y); |
1142 | } |
1143 | __DEVICE__ |
1144 | double __dadd_rz(double __x, double __y) { |
1145 | return __ocml_add_rtz_f64(__x, __y); |
1146 | } |
1147 | #else |
1148 | __DEVICE__ |
1149 | double __dadd_rn(double __x, double __y) { return __x + __y; } |
1150 | #endif |
1151 | |
1152 | #if defined OCML_BASIC_ROUNDED_OPERATIONS |
1153 | __DEVICE__ |
1154 | double __ddiv_rd(double __x, double __y) { |
1155 | return __ocml_div_rtn_f64(__x, __y); |
1156 | } |
1157 | __DEVICE__ |
1158 | double __ddiv_rn(double __x, double __y) { |
1159 | return __ocml_div_rte_f64(__x, __y); |
1160 | } |
1161 | __DEVICE__ |
1162 | double __ddiv_ru(double __x, double __y) { |
1163 | return __ocml_div_rtp_f64(__x, __y); |
1164 | } |
1165 | __DEVICE__ |
1166 | double __ddiv_rz(double __x, double __y) { |
1167 | return __ocml_div_rtz_f64(__x, __y); |
1168 | } |
1169 | #else |
1170 | __DEVICE__ |
1171 | double __ddiv_rn(double __x, double __y) { return __x / __y; } |
1172 | #endif |
1173 | |
1174 | #if defined OCML_BASIC_ROUNDED_OPERATIONS |
1175 | __DEVICE__ |
1176 | double __dmul_rd(double __x, double __y) { |
1177 | return __ocml_mul_rtn_f64(__x, __y); |
1178 | } |
1179 | __DEVICE__ |
1180 | double __dmul_rn(double __x, double __y) { |
1181 | return __ocml_mul_rte_f64(__x, __y); |
1182 | } |
1183 | __DEVICE__ |
1184 | double __dmul_ru(double __x, double __y) { |
1185 | return __ocml_mul_rtp_f64(__x, __y); |
1186 | } |
1187 | __DEVICE__ |
1188 | double __dmul_rz(double __x, double __y) { |
1189 | return __ocml_mul_rtz_f64(__x, __y); |
1190 | } |
1191 | #else |
1192 | __DEVICE__ |
1193 | double __dmul_rn(double __x, double __y) { return __x * __y; } |
1194 | #endif |
1195 | |
1196 | #if defined OCML_BASIC_ROUNDED_OPERATIONS |
1197 | __DEVICE__ |
1198 | double __drcp_rd(double __x) { return __ocml_div_rtn_f64(1.0, __x); } |
1199 | __DEVICE__ |
1200 | double __drcp_rn(double __x) { return __ocml_div_rte_f64(1.0, __x); } |
1201 | __DEVICE__ |
1202 | double __drcp_ru(double __x) { return __ocml_div_rtp_f64(1.0, __x); } |
1203 | __DEVICE__ |
1204 | double __drcp_rz(double __x) { return __ocml_div_rtz_f64(1.0, __x); } |
1205 | #else |
1206 | __DEVICE__ |
1207 | double __drcp_rn(double __x) { return 1.0 / __x; } |
1208 | #endif |
1209 | |
1210 | #if defined OCML_BASIC_ROUNDED_OPERATIONS |
1211 | __DEVICE__ |
1212 | double __dsqrt_rd(double __x) { return __ocml_sqrt_rtn_f64(__x); } |
1213 | __DEVICE__ |
1214 | double __dsqrt_rn(double __x) { return __ocml_sqrt_rte_f64(__x); } |
1215 | __DEVICE__ |
1216 | double __dsqrt_ru(double __x) { return __ocml_sqrt_rtp_f64(__x); } |
1217 | __DEVICE__ |
1218 | double __dsqrt_rz(double __x) { return __ocml_sqrt_rtz_f64(__x); } |
1219 | #else |
1220 | __DEVICE__ |
1221 | double __dsqrt_rn(double __x) { return __builtin_sqrt(__x); } |
1222 | #endif |
1223 | |
1224 | #if defined OCML_BASIC_ROUNDED_OPERATIONS |
1225 | __DEVICE__ |
1226 | double __dsub_rd(double __x, double __y) { |
1227 | return __ocml_sub_rtn_f64(__x, __y); |
1228 | } |
1229 | __DEVICE__ |
1230 | double __dsub_rn(double __x, double __y) { |
1231 | return __ocml_sub_rte_f64(__x, __y); |
1232 | } |
1233 | __DEVICE__ |
1234 | double __dsub_ru(double __x, double __y) { |
1235 | return __ocml_sub_rtp_f64(__x, __y); |
1236 | } |
1237 | __DEVICE__ |
1238 | double __dsub_rz(double __x, double __y) { |
1239 | return __ocml_sub_rtz_f64(__x, __y); |
1240 | } |
1241 | #else |
1242 | __DEVICE__ |
1243 | double __dsub_rn(double __x, double __y) { return __x - __y; } |
1244 | #endif |
1245 | |
1246 | #if defined OCML_BASIC_ROUNDED_OPERATIONS |
1247 | __DEVICE__ |
1248 | double __fma_rd(double __x, double __y, double __z) { |
1249 | return __ocml_fma_rtn_f64(__x, __y, __z); |
1250 | } |
1251 | __DEVICE__ |
1252 | double __fma_rn(double __x, double __y, double __z) { |
1253 | return __ocml_fma_rte_f64(__x, __y, __z); |
1254 | } |
1255 | __DEVICE__ |
1256 | double __fma_ru(double __x, double __y, double __z) { |
1257 | return __ocml_fma_rtp_f64(__x, __y, __z); |
1258 | } |
1259 | __DEVICE__ |
1260 | double __fma_rz(double __x, double __y, double __z) { |
1261 | return __ocml_fma_rtz_f64(__x, __y, __z); |
1262 | } |
1263 | #else |
1264 | __DEVICE__ |
1265 | double __fma_rn(double __x, double __y, double __z) { |
1266 | return __builtin_fma(__x, __y, __z); |
1267 | } |
1268 | #endif |
1269 | // END INTRINSICS |
1270 | // END DOUBLE |
1271 | |
1272 | // C only macros |
1273 | #if !defined(__cplusplus) && __STDC_VERSION__ >= 201112L |
1274 | #define isfinite(__x) _Generic((__x), float : __finitef, double : __finite)(__x) |
1275 | #define isinf(__x) _Generic((__x), float : __isinff, double : __isinf)(__x) |
1276 | #define isnan(__x) _Generic((__x), float : __isnanf, double : __isnan)(__x) |
1277 | #define signbit(__x) \ |
1278 | _Generic((__x), float : __signbitf, double : __signbit)(__x) |
1279 | #endif // !defined(__cplusplus) && __STDC_VERSION__ >= 201112L |
1280 | |
1281 | #if defined(__cplusplus) |
1282 | template <class T> __DEVICE__ T min(T __arg1, T __arg2) { |
1283 | return (__arg1 < __arg2) ? __arg1 : __arg2; |
1284 | } |
1285 | |
1286 | template <class T> __DEVICE__ T max(T __arg1, T __arg2) { |
1287 | return (__arg1 > __arg2) ? __arg1 : __arg2; |
1288 | } |
1289 | |
1290 | __DEVICE__ int min(int __arg1, int __arg2) { |
1291 | return (__arg1 < __arg2) ? __arg1 : __arg2; |
1292 | } |
1293 | __DEVICE__ int max(int __arg1, int __arg2) { |
1294 | return (__arg1 > __arg2) ? __arg1 : __arg2; |
1295 | } |
1296 | |
1297 | __DEVICE__ |
1298 | float max(float __x, float __y) { return __builtin_fmaxf(__x, __y); } |
1299 | |
1300 | __DEVICE__ |
1301 | double max(double __x, double __y) { return __builtin_fmax(__x, __y); } |
1302 | |
1303 | __DEVICE__ |
1304 | float min(float __x, float __y) { return __builtin_fminf(__x, __y); } |
1305 | |
1306 | __DEVICE__ |
1307 | double min(double __x, double __y) { return __builtin_fmin(__x, __y); } |
1308 | |
1309 | #if !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__) |
1310 | __host__ inline static int min(int __arg1, int __arg2) { |
1311 | return __arg1 < __arg2 ? __arg1 : __arg2; |
1312 | } |
1313 | |
1314 | __host__ inline static int max(int __arg1, int __arg2) { |
1315 | return __arg1 > __arg2 ? __arg1 : __arg2; |
1316 | } |
1317 | #endif // !defined(__HIPCC_RTC__) && !defined(__OPENMP_AMDGCN__) |
1318 | #endif |
1319 | |
1320 | #pragma pop_macro("__DEVICE__") |
1321 | #pragma pop_macro("__RETURN_TYPE") |
1322 | #pragma pop_macro("__FAST_OR_SLOW") |
1323 | |
1324 | #endif // __CLANG_HIP_MATH_H__ |
1325 |
Warning: This file is not a C or C++ file. It does not have highlighting.