/*===--- __clang_cuda_intrinsics.h - Device-side CUDA intrinsic wrappers ---===
 *
 * Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
 * See https://llvm.org/LICENSE.txt for license information.
 * SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
 *
 *===-----------------------------------------------------------------------===
 */
#ifndef __CLANG_CUDA_INTRINSICS_H__
#define __CLANG_CUDA_INTRINSICS_H__
#ifndef __CUDA__
#error "This file is for CUDA compilation only."
#endif

// sm_30 intrinsics: __shfl_{up,down,xor}.

#define __SM_30_INTRINSICS_H__
#define __SM_30_INTRINSICS_HPP__

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

#pragma push_macro("__MAKE_SHUFFLES")
#define __MAKE_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, __Mask, \
                        __Type) \
  inline __device__ int __FnName(int __val, __Type __offset, \
                                 int __width = warpSize) { \
    return __IntIntrinsic(__val, __offset, \
                          ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ float __FnName(float __val, __Type __offset, \
                                   int __width = warpSize) { \
    return __FloatIntrinsic(__val, __offset, \
                            ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ unsigned int __FnName(unsigned int __val, __Type __offset, \
                                          int __width = warpSize) { \
    return static_cast<unsigned int>( \
        ::__FnName(static_cast<int>(__val), __offset, __width)); \
  } \
  inline __device__ long long __FnName(long long __val, __Type __offset, \
                                       int __width = warpSize) { \
    struct __Bits { \
      int __a, __b; \
    }; \
    _Static_assert(sizeof(__val) == sizeof(__Bits)); \
    _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
    __Bits __tmp; \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp.__a = ::__FnName(__tmp.__a, __offset, __width); \
    __tmp.__b = ::__FnName(__tmp.__b, __offset, __width); \
    long long __ret; \
    memcpy(&__ret, &__tmp, sizeof(__tmp)); \
    return __ret; \
  } \
  inline __device__ long __FnName(long __val, __Type __offset, \
                                  int __width = warpSize) { \
    _Static_assert(sizeof(long) == sizeof(long long) || \
                   sizeof(long) == sizeof(int)); \
    if (sizeof(long) == sizeof(long long)) { \
      return static_cast<long>( \
          ::__FnName(static_cast<long long>(__val), __offset, __width)); \
    } else if (sizeof(long) == sizeof(int)) { \
      return static_cast<long>( \
          ::__FnName(static_cast<int>(__val), __offset, __width)); \
    } \
  } \
  inline __device__ unsigned long __FnName( \
      unsigned long __val, __Type __offset, int __width = warpSize) { \
    return static_cast<unsigned long>( \
        ::__FnName(static_cast<long>(__val), __offset, __width)); \
  } \
  inline __device__ unsigned long long __FnName( \
      unsigned long long __val, __Type __offset, int __width = warpSize) { \
    return static_cast<unsigned long long>( \
        ::__FnName(static_cast<long long>(__val), __offset, __width)); \
  } \
  inline __device__ double __FnName(double __val, __Type __offset, \
                                    int __width = warpSize) { \
    long long __tmp; \
    _Static_assert(sizeof(__tmp) == sizeof(__val)); \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp = ::__FnName(__tmp, __offset, __width); \
    double __ret; \
    memcpy(&__ret, &__tmp, sizeof(__ret)); \
    return __ret; \
  }

__MAKE_SHUFFLES(__shfl, __nvvm_shfl_idx_i32, __nvvm_shfl_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SHUFFLES(__shfl_up, __nvvm_shfl_up_i32, __nvvm_shfl_up_f32, 0,
                unsigned int);
__MAKE_SHUFFLES(__shfl_down, __nvvm_shfl_down_i32, __nvvm_shfl_down_f32, 0x1f,
                unsigned int);
__MAKE_SHUFFLES(__shfl_xor, __nvvm_shfl_bfly_i32, __nvvm_shfl_bfly_f32, 0x1f,
                int);
#pragma pop_macro("__MAKE_SHUFFLES")
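
// Usage sketch (illustrative only, not part of this header): with these
// wrappers, a full-warp sum reduction is a loop of __shfl_down calls.
// Assumes all 32 lanes of the warp are active.
//
//   __device__ int warp_sum(int v) {
//     for (int d = warpSize / 2; d > 0; d /= 2)
//       v += __shfl_down(v, d);
//     return v; // lane 0 ends up holding the sum across the warp
//   }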

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

#if CUDA_VERSION >= 9000
#if (!defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300)
// __shfl_*_sync variants, available since CUDA-9.
#pragma push_macro("__MAKE_SYNC_SHUFFLES")
#define __MAKE_SYNC_SHUFFLES(__FnName, __IntIntrinsic, __FloatIntrinsic, \
                             __Mask, __Type) \
  inline __device__ int __FnName(unsigned int __mask, int __val, \
                                 __Type __offset, int __width = warpSize) { \
    return __IntIntrinsic(__mask, __val, __offset, \
                          ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ float __FnName(unsigned int __mask, float __val, \
                                   __Type __offset, int __width = warpSize) { \
    return __FloatIntrinsic(__mask, __val, __offset, \
                            ((warpSize - __width) << 8) | (__Mask)); \
  } \
  inline __device__ unsigned int __FnName(unsigned int __mask, \
                                          unsigned int __val, __Type __offset, \
                                          int __width = warpSize) { \
    return static_cast<unsigned int>( \
        ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
  } \
  inline __device__ long long __FnName(unsigned int __mask, long long __val, \
                                       __Type __offset, \
                                       int __width = warpSize) { \
    struct __Bits { \
      int __a, __b; \
    }; \
    _Static_assert(sizeof(__val) == sizeof(__Bits)); \
    _Static_assert(sizeof(__Bits) == 2 * sizeof(int)); \
    __Bits __tmp; \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp.__a = ::__FnName(__mask, __tmp.__a, __offset, __width); \
    __tmp.__b = ::__FnName(__mask, __tmp.__b, __offset, __width); \
    long long __ret; \
    memcpy(&__ret, &__tmp, sizeof(__tmp)); \
    return __ret; \
  } \
  inline __device__ unsigned long long __FnName( \
      unsigned int __mask, unsigned long long __val, __Type __offset, \
      int __width = warpSize) { \
    return static_cast<unsigned long long>( \
        ::__FnName(__mask, static_cast<long long>(__val), __offset, __width)); \
  } \
  inline __device__ long __FnName(unsigned int __mask, long __val, \
                                  __Type __offset, int __width = warpSize) { \
    _Static_assert(sizeof(long) == sizeof(long long) || \
                   sizeof(long) == sizeof(int)); \
    if (sizeof(long) == sizeof(long long)) { \
      return static_cast<long>(::__FnName( \
          __mask, static_cast<long long>(__val), __offset, __width)); \
    } else if (sizeof(long) == sizeof(int)) { \
      return static_cast<long>( \
          ::__FnName(__mask, static_cast<int>(__val), __offset, __width)); \
    } \
  } \
  inline __device__ unsigned long __FnName( \
      unsigned int __mask, unsigned long __val, __Type __offset, \
      int __width = warpSize) { \
    return static_cast<unsigned long>( \
        ::__FnName(__mask, static_cast<long>(__val), __offset, __width)); \
  } \
  inline __device__ double __FnName(unsigned int __mask, double __val, \
                                    __Type __offset, int __width = warpSize) { \
    long long __tmp; \
    _Static_assert(sizeof(__tmp) == sizeof(__val)); \
    memcpy(&__tmp, &__val, sizeof(__val)); \
    __tmp = ::__FnName(__mask, __tmp, __offset, __width); \
    double __ret; \
    memcpy(&__ret, &__tmp, sizeof(__ret)); \
    return __ret; \
  }
__MAKE_SYNC_SHUFFLES(__shfl_sync, __nvvm_shfl_sync_idx_i32,
                     __nvvm_shfl_sync_idx_f32, 0x1f, int);
// We use 0 rather than 31 as our mask, because shfl.up applies to lanes >=
// maxLane.
__MAKE_SYNC_SHUFFLES(__shfl_up_sync, __nvvm_shfl_sync_up_i32,
                     __nvvm_shfl_sync_up_f32, 0, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_down_sync, __nvvm_shfl_sync_down_i32,
                     __nvvm_shfl_sync_down_f32, 0x1f, unsigned int);
__MAKE_SYNC_SHUFFLES(__shfl_xor_sync, __nvvm_shfl_sync_bfly_i32,
                     __nvvm_shfl_sync_bfly_f32, 0x1f, int);
#pragma pop_macro("__MAKE_SYNC_SHUFFLES")
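
// Usage sketch (illustrative only, not part of this header): a butterfly
// (XOR) warp reduction using the CUDA-9 sync variants. Assumes the full warp
// participates, hence the 0xffffffff member mask.
//
//   __device__ float warp_sum_sync(float v) {
//     for (int d = warpSize / 2; d > 0; d /= 2)
//       v += __shfl_xor_sync(0xffffffffu, v, d);
//     return v; // every lane ends up with the full-warp sum
//   }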

inline __device__ void __syncwarp(unsigned int mask = 0xffffffff) {
  return __nvvm_bar_warp_sync(mask);
}

inline __device__ void __barrier_sync(unsigned int id) {
  __nvvm_barrier_sync(id);
}

inline __device__ void __barrier_sync_count(unsigned int id,
                                            unsigned int count) {
  __nvvm_barrier_sync_cnt(id, count);
}

inline __device__ int __all_sync(unsigned int mask, int pred) {
  return __nvvm_vote_all_sync(mask, pred);
}

inline __device__ int __any_sync(unsigned int mask, int pred) {
  return __nvvm_vote_any_sync(mask, pred);
}

inline __device__ int __uni_sync(unsigned int mask, int pred) {
  return __nvvm_vote_uni_sync(mask, pred);
}

inline __device__ unsigned int __ballot_sync(unsigned int mask, int pred) {
  return __nvvm_vote_ballot_sync(mask, pred);
}

inline __device__ unsigned int __activemask() {
#if CUDA_VERSION < 9020
  return __nvvm_vote_ballot(1);
#else
  return __nvvm_activemask();
#endif
}
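
// Usage sketch (illustrative only, not part of this header): counting how
// many currently-active lanes satisfy a predicate by combining __activemask,
// __ballot_sync, and the __popc bit-count builtin (defined in the device
// function headers, assumed available here).
//
//   __device__ int count_pred(int pred) {
//     unsigned int active = __activemask();
//     return __popc(__ballot_sync(active, pred));
//   }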

inline __device__ unsigned int __fns(unsigned mask, unsigned base, int offset) {
  return __nvvm_fns(mask, base, offset);
}

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 300

// Define __match* builtins CUDA-9 headers expect to see.
#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
inline __device__ unsigned int __match32_any_sync(unsigned int mask,
                                                  unsigned int value) {
  return __nvvm_match_any_sync_i32(mask, value);
}

inline __device__ unsigned int
__match64_any_sync(unsigned int mask, unsigned long long value) {
  return __nvvm_match_any_sync_i64(mask, value);
}

inline __device__ unsigned int
__match32_all_sync(unsigned int mask, unsigned int value, int *pred) {
  return __nvvm_match_all_sync_i32p(mask, value, pred);
}

inline __device__ unsigned int
__match64_all_sync(unsigned int mask, unsigned long long value, int *pred) {
  return __nvvm_match_all_sync_i64p(mask, value, pred);
}
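
// Usage sketch (illustrative only, not part of this header): __match32_any_sync
// returns, for each calling lane, the set of lanes in the mask whose value
// compares equal, so lanes can group themselves by key.
//
//   __device__ unsigned int group_by_key(unsigned int key) {
//     return __match32_any_sync(__activemask(), key);
//   }
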
#include "crt/sm_70_rt.hpp"

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 700
#endif // CUDA_VERSION >= 9000

// sm_32 intrinsics: __ldg and __funnelshift_{l,lc,r,rc}.

// Prevent the vanilla sm_32 intrinsics header from being included.
#define __SM_32_INTRINSICS_H__
#define __SM_32_INTRINSICS_HPP__

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

inline __device__ char __ldg(const char *ptr) { return __nvvm_ldg_c(ptr); }
inline __device__ short __ldg(const short *ptr) { return __nvvm_ldg_s(ptr); }
inline __device__ int __ldg(const int *ptr) { return __nvvm_ldg_i(ptr); }
inline __device__ long __ldg(const long *ptr) { return __nvvm_ldg_l(ptr); }
inline __device__ long long __ldg(const long long *ptr) {
  return __nvvm_ldg_ll(ptr);
}
inline __device__ unsigned char __ldg(const unsigned char *ptr) {
  return __nvvm_ldg_uc(ptr);
}
inline __device__ signed char __ldg(const signed char *ptr) {
  return __nvvm_ldg_uc((const unsigned char *)ptr);
}
inline __device__ unsigned short __ldg(const unsigned short *ptr) {
  return __nvvm_ldg_us(ptr);
}
inline __device__ unsigned int __ldg(const unsigned int *ptr) {
  return __nvvm_ldg_ui(ptr);
}
inline __device__ unsigned long __ldg(const unsigned long *ptr) {
  return __nvvm_ldg_ul(ptr);
}
inline __device__ unsigned long long __ldg(const unsigned long long *ptr) {
  return __nvvm_ldg_ull(ptr);
}
inline __device__ float __ldg(const float *ptr) { return __nvvm_ldg_f(ptr); }
inline __device__ double __ldg(const double *ptr) { return __nvvm_ldg_d(ptr); }

inline __device__ char2 __ldg(const char2 *ptr) {
  typedef char c2 __attribute__((ext_vector_type(2)));
  // We can assume that ptr is aligned at least to char2's alignment, but the
  // load will assume that ptr is aligned to c2's alignment. This is only
  // safe if alignof(c2) <= alignof(char2).
  c2 rv = __nvvm_ldg_c2(reinterpret_cast<const c2 *>(ptr));
  char2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ char4 __ldg(const char4 *ptr) {
  typedef char c4 __attribute__((ext_vector_type(4)));
  c4 rv = __nvvm_ldg_c4(reinterpret_cast<const c4 *>(ptr));
  char4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ short2 __ldg(const short2 *ptr) {
  typedef short s2 __attribute__((ext_vector_type(2)));
  s2 rv = __nvvm_ldg_s2(reinterpret_cast<const s2 *>(ptr));
  short2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ short4 __ldg(const short4 *ptr) {
  typedef short s4 __attribute__((ext_vector_type(4)));
  s4 rv = __nvvm_ldg_s4(reinterpret_cast<const s4 *>(ptr));
  short4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ int2 __ldg(const int2 *ptr) {
  typedef int i2 __attribute__((ext_vector_type(2)));
  i2 rv = __nvvm_ldg_i2(reinterpret_cast<const i2 *>(ptr));
  int2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ int4 __ldg(const int4 *ptr) {
  typedef int i4 __attribute__((ext_vector_type(4)));
  i4 rv = __nvvm_ldg_i4(reinterpret_cast<const i4 *>(ptr));
  int4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ longlong2 __ldg(const longlong2 *ptr) {
  typedef long long ll2 __attribute__((ext_vector_type(2)));
  ll2 rv = __nvvm_ldg_ll2(reinterpret_cast<const ll2 *>(ptr));
  longlong2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}

inline __device__ uchar2 __ldg(const uchar2 *ptr) {
  typedef unsigned char uc2 __attribute__((ext_vector_type(2)));
  uc2 rv = __nvvm_ldg_uc2(reinterpret_cast<const uc2 *>(ptr));
  uchar2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ uchar4 __ldg(const uchar4 *ptr) {
  typedef unsigned char uc4 __attribute__((ext_vector_type(4)));
  uc4 rv = __nvvm_ldg_uc4(reinterpret_cast<const uc4 *>(ptr));
  uchar4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ ushort2 __ldg(const ushort2 *ptr) {
  typedef unsigned short us2 __attribute__((ext_vector_type(2)));
  us2 rv = __nvvm_ldg_us2(reinterpret_cast<const us2 *>(ptr));
  ushort2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ ushort4 __ldg(const ushort4 *ptr) {
  typedef unsigned short us4 __attribute__((ext_vector_type(4)));
  us4 rv = __nvvm_ldg_us4(reinterpret_cast<const us4 *>(ptr));
  ushort4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ uint2 __ldg(const uint2 *ptr) {
  typedef unsigned int ui2 __attribute__((ext_vector_type(2)));
  ui2 rv = __nvvm_ldg_ui2(reinterpret_cast<const ui2 *>(ptr));
  uint2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ uint4 __ldg(const uint4 *ptr) {
  typedef unsigned int ui4 __attribute__((ext_vector_type(4)));
  ui4 rv = __nvvm_ldg_ui4(reinterpret_cast<const ui4 *>(ptr));
  uint4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ ulonglong2 __ldg(const ulonglong2 *ptr) {
  typedef unsigned long long ull2 __attribute__((ext_vector_type(2)));
  ull2 rv = __nvvm_ldg_ull2(reinterpret_cast<const ull2 *>(ptr));
  ulonglong2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}

inline __device__ float2 __ldg(const float2 *ptr) {
  typedef float f2 __attribute__((ext_vector_type(2)));
  f2 rv = __nvvm_ldg_f2(reinterpret_cast<const f2 *>(ptr));
  float2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
inline __device__ float4 __ldg(const float4 *ptr) {
  typedef float f4 __attribute__((ext_vector_type(4)));
  f4 rv = __nvvm_ldg_f4(reinterpret_cast<const f4 *>(ptr));
  float4 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  ret.z = rv[2];
  ret.w = rv[3];
  return ret;
}
inline __device__ double2 __ldg(const double2 *ptr) {
  typedef double d2 __attribute__((ext_vector_type(2)));
  d2 rv = __nvvm_ldg_d2(reinterpret_cast<const d2 *>(ptr));
  double2 ret;
  ret.x = rv[0];
  ret.y = rv[1];
  return ret;
}
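
// Usage sketch (illustrative only, not part of this header): __ldg performs a
// read-only, cached (ld.global.nc) load, so the pointed-to data must not be
// written by any thread for the duration of the kernel.
//
//   __global__ void scale(const float4 *in, float4 *out, float s) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     float4 v = __ldg(&in[i]);
//     v.x *= s; v.y *= s; v.z *= s; v.w *= s;
//     out[i] = v;
//   }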

// TODO: Implement these as intrinsics, so the backend can work its magic on
// these. Alternatively, we could implement these as plain C and try to get
// llvm to recognize the relevant patterns.
inline __device__ unsigned __funnelshift_l(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned result;
  asm("shf.l.wrap.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_lc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned result;
  asm("shf.l.clamp.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_r(unsigned low32, unsigned high32,
                                           unsigned shiftWidth) {
  unsigned result;
  asm("shf.r.wrap.b32 %0, %1, %2, %3;"
      : "=r"(result)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return result;
}
inline __device__ unsigned __funnelshift_rc(unsigned low32, unsigned high32,
                                            unsigned shiftWidth) {
  unsigned ret;
  asm("shf.r.clamp.b32 %0, %1, %2, %3;"
      : "=r"(ret)
      : "r"(low32), "r"(high32), "r"(shiftWidth));
  return ret;
}
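
// Usage sketch (illustrative only, not part of this header): a funnel shift
// concatenates high32:low32 into a 64-bit value and shifts it, returning the
// most significant 32 bits, so a 32-bit left rotation is a wrapping funnel
// shift of a value with itself.
//
//   __device__ unsigned rotl32(unsigned x, unsigned n) {
//     return __funnelshift_l(x, x, n); // rotate left by n & 31
//   }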

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 320

#if CUDA_VERSION >= 11000
extern "C" {
__device__ inline size_t __nv_cvta_generic_to_global_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(1))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_shared_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(3))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_constant_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(4))) *)__ptr;
}
__device__ inline size_t __nv_cvta_generic_to_local_impl(const void *__ptr) {
  return (size_t)(void __attribute__((address_space(5))) *)__ptr;
}
__device__ inline void *__nv_cvta_global_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(1))) *)__ptr;
}
__device__ inline void *__nv_cvta_shared_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(3))) *)__ptr;
}
__device__ inline void *__nv_cvta_constant_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(4))) *)__ptr;
}
__device__ inline void *__nv_cvta_local_to_generic_impl(size_t __ptr) {
  return (void *)(void __attribute__((address_space(5))) *)__ptr;
}
__device__ inline cuuint32_t __nvvm_get_smem_pointer(void *__ptr) {
  return __nv_cvta_generic_to_shared_impl(__ptr);
}
} // extern "C"
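
// Note (illustrative only, not part of this header): these _impl helpers back
// the __cvta_generic_to_*/__cvta_*_to_generic functions declared in the CUDA
// headers; the address-space casts typically lower to PTX cvta instructions.
// A common use is obtaining a 32-bit shared-memory address for inline PTX:
//
//   __device__ unsigned smem_addr(void *smem_ptr) {
//     // assumes smem_ptr is a generic pointer into shared memory
//     return __nvvm_get_smem_pointer(smem_ptr);
//   }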

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800
__device__ inline unsigned __reduce_add_sync(unsigned __mask,
                                             unsigned __value) {
  return __nvvm_redux_sync_add(__mask, __value);
}
__device__ inline unsigned __reduce_min_sync(unsigned __mask,
                                             unsigned __value) {
  return __nvvm_redux_sync_umin(__mask, __value);
}
__device__ inline unsigned __reduce_max_sync(unsigned __mask,
                                             unsigned __value) {
  return __nvvm_redux_sync_umax(__mask, __value);
}
__device__ inline int __reduce_min_sync(unsigned __mask, int __value) {
  return __nvvm_redux_sync_min(__mask, __value);
}
__device__ inline int __reduce_max_sync(unsigned __mask, int __value) {
  return __nvvm_redux_sync_max(__mask, __value);
}
__device__ inline unsigned __reduce_or_sync(unsigned __mask, unsigned __value) {
  return __nvvm_redux_sync_or(__mask, __value);
}
__device__ inline unsigned __reduce_and_sync(unsigned __mask,
                                             unsigned __value) {
  return __nvvm_redux_sync_and(__mask, __value);
}
__device__ inline unsigned __reduce_xor_sync(unsigned __mask,
                                             unsigned __value) {
  return __nvvm_redux_sync_xor(__mask, __value);
}
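
// Usage sketch (illustrative only, not part of this header): on sm_80+ the
// redux.sync instruction replaces the shfl-based reduction loop shown earlier
// with a single hardware warp reduction.
//
//   __device__ unsigned warp_sum_sm80(unsigned v) {
//     return __reduce_add_sync(0xffffffffu, v); // full-warp sum, all lanes
//   }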

__device__ inline void __nv_memcpy_async_shared_global_4(void *__dst,
                                                         const void *__src,
                                                         unsigned __src_size) {
  __nvvm_cp_async_ca_shared_global_4(
      (void __attribute__((address_space(3))) *)__dst,
      (const void __attribute__((address_space(1))) *)__src, __src_size);
}
__device__ inline void __nv_memcpy_async_shared_global_8(void *__dst,
                                                         const void *__src,
                                                         unsigned __src_size) {
  __nvvm_cp_async_ca_shared_global_8(
      (void __attribute__((address_space(3))) *)__dst,
      (const void __attribute__((address_space(1))) *)__src, __src_size);
}
__device__ inline void __nv_memcpy_async_shared_global_16(void *__dst,
                                                          const void *__src,
                                                          unsigned __src_size) {
  __nvvm_cp_async_ca_shared_global_16(
      (void __attribute__((address_space(3))) *)__dst,
      (const void __attribute__((address_space(1))) *)__src, __src_size);
}

__device__ inline void *
__nv_associate_access_property(const void *__ptr, unsigned long long __prop) {
  // TODO: this appears to provide the compiler with some sort of a hint. We
  // do not know what exactly it is supposed to do. However, CUDA headers
  // suggest that just passing through __ptr should not affect correctness.
  // They do so on pre-sm80 GPUs where this builtin is not available.
  return (void *)__ptr;
}
#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 800

#if !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 900
__device__ inline unsigned __isCtaShared(const void *ptr) {
  return __isShared(ptr);
}

__device__ inline unsigned __isClusterShared(const void *__ptr) {
  return __nvvm_isspacep_shared_cluster(__ptr);
}

__device__ inline void *__cluster_map_shared_rank(const void *__ptr,
                                                  unsigned __rank) {
  return __nvvm_mapa((void *)__ptr, __rank);
}

__device__ inline unsigned __cluster_query_shared_rank(const void *__ptr) {
  return __nvvm_getctarank((void *)__ptr);
}

__device__ inline uint2
__cluster_map_shared_multicast(const void *__ptr,
                               unsigned int __cluster_cta_mask) {
  return make_uint2((unsigned)__cvta_generic_to_shared(__ptr),
                    __cluster_cta_mask);
}

__device__ inline unsigned __clusterDimIsSpecified() {
  return __nvvm_is_explicit_cluster();
}

__device__ inline dim3 __clusterDim() {
  return dim3(__nvvm_read_ptx_sreg_cluster_nctaid_x(),
              __nvvm_read_ptx_sreg_cluster_nctaid_y(),
              __nvvm_read_ptx_sreg_cluster_nctaid_z());
}

__device__ inline dim3 __clusterRelativeBlockIdx() {
  return dim3(__nvvm_read_ptx_sreg_cluster_ctaid_x(),
              __nvvm_read_ptx_sreg_cluster_ctaid_y(),
              __nvvm_read_ptx_sreg_cluster_ctaid_z());
}

__device__ inline dim3 __clusterGridDimInClusters() {
  return dim3(__nvvm_read_ptx_sreg_nclusterid_x(),
              __nvvm_read_ptx_sreg_nclusterid_y(),
              __nvvm_read_ptx_sreg_nclusterid_z());
}

__device__ inline dim3 __clusterIdx() {
  return dim3(__nvvm_read_ptx_sreg_clusterid_x(),
              __nvvm_read_ptx_sreg_clusterid_y(),
              __nvvm_read_ptx_sreg_clusterid_z());
}

__device__ inline unsigned __clusterRelativeBlockRank() {
  return __nvvm_read_ptx_sreg_cluster_ctarank();
}

__device__ inline unsigned __clusterSizeInBlocks() {
  return __nvvm_read_ptx_sreg_cluster_nctarank();
}

__device__ inline void __cluster_barrier_arrive() {
  __nvvm_barrier_cluster_arrive();
}

__device__ inline void __cluster_barrier_arrive_relaxed() {
  __nvvm_barrier_cluster_arrive_relaxed();
}

__device__ inline void __cluster_barrier_wait() {
  __nvvm_barrier_cluster_wait();
}
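
// Usage sketch (illustrative only, not part of this header): the split
// arrive/wait barrier lets a block overlap independent work between
// signalling the barrier and waiting for the rest of its cluster.
//
//   __device__ void cluster_sync_with_overlap() {
//     __cluster_barrier_arrive(); // signal: this block reached the barrier
//     // ... independent work not touching other blocks' shared memory ...
//     __cluster_barrier_wait();   // wait for every block in the cluster
//   }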

__device__ inline void __threadfence_cluster() { __nvvm_fence_sc_cluster(); }

__device__ inline float2 atomicAdd(float2 *__ptr, float2 __val) {
  float2 __ret;
  __asm__("atom.add.v2.f32 {%0, %1}, [%2], {%3, %4};"
          : "=f"(__ret.x), "=f"(__ret.y)
          : "l"(__ptr), "f"(__val.x), "f"(__val.y));
  return __ret;
}

__device__ inline float2 atomicAdd_block(float2 *__ptr, float2 __val) {
  float2 __ret;
  __asm__("atom.cta.add.v2.f32 {%0, %1}, [%2], {%3, %4};"
          : "=f"(__ret.x), "=f"(__ret.y)
          : "l"(__ptr), "f"(__val.x), "f"(__val.y));
  return __ret;
}

__device__ inline float2 atomicAdd_system(float2 *__ptr, float2 __val) {
  float2 __ret;
  __asm__("atom.sys.add.v2.f32 {%0, %1}, [%2], {%3, %4};"
          : "=f"(__ret.x), "=f"(__ret.y)
          : "l"(__ptr), "f"(__val.x), "f"(__val.y));
  return __ret;
}

__device__ inline float4 atomicAdd(float4 *__ptr, float4 __val) {
  float4 __ret;
  __asm__("atom.add.v4.f32 {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};"
          : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
          : "l"(__ptr), "f"(__val.x), "f"(__val.y), "f"(__val.z), "f"(__val.w));
  return __ret;
}

__device__ inline float4 atomicAdd_block(float4 *__ptr, float4 __val) {
  float4 __ret;
  __asm__(
      "atom.cta.add.v4.f32 {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};"
      : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
      : "l"(__ptr), "f"(__val.x), "f"(__val.y), "f"(__val.z), "f"(__val.w));
  return __ret;
}

__device__ inline float4 atomicAdd_system(float4 *__ptr, float4 __val) {
  float4 __ret;
  __asm__(
      "atom.sys.add.v4.f32 {%0, %1, %2, %3}, [%4], {%5, %6, %7, %8};"
      : "=f"(__ret.x), "=f"(__ret.y), "=f"(__ret.z), "=f"(__ret.w)
      : "l"(__ptr), "f"(__val.x), "f"(__val.y), "f"(__val.z), "f"(__val.w));
  return __ret;
}
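
// Usage sketch (illustrative only, not part of this header): the vector
// atomicAdd overloads accumulate a whole float4 into a single accumulator
// with one atom.add.v4.f32 per thread.
//
//   __global__ void accumulate(float4 *acc, const float4 *in) {
//     int i = blockIdx.x * blockDim.x + threadIdx.x;
//     atomicAdd(acc, in[i]); // requires sm_90+
//   }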

#endif // !defined(__CUDA_ARCH__) || __CUDA_ARCH__ >= 900
#endif // CUDA_VERSION >= 11000

#endif // defined(__CLANG_CUDA_INTRINSICS_H__)