__clang_cuda_texture_intrinsics.h source code [clang/lib/Headers/__clang_cuda_texture_intrinsics.h]

Warning: This file is not a C or C++ file. It does not have highlighting.

1	/*===--- __clang_cuda_texture_intrinsics.h - Device-side texture support ---===
2	*
3	* Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
4	* See https://llvm.org/LICENSE.txt for license information.
5	* SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
6	*
7	*===-----------------------------------------------------------------------===
8	*
9	* This header provides in-header implementations for NVCC's built-in
10	* __nv_tex_surf_handler() which is used by CUDA's texture-related headers. The
11	* built-in is unusual as it's actually a set of function overloads that use the
12	* first string literal argument as one of the overload parameters.
13	*/
14	#ifndef __CLANG_CUDA_TEXTURE_INTRINSICS_H__
15	#define __CLANG_CUDA_TEXTURE_INTRINSICS_H__
16	#ifndef __CUDA__
17	#error "This file is for CUDA compilation only."
18	#endif
19
// NVCC's __nv_tex_surf_handler() built-in is provided here as a variadic
// macro. The operation name __op (a string literal) is hashed at compile time
// into a __Tag type, which is passed as the explicit template argument of
// ::__cuda_tex::__tex_fetch; __ptr and the remaining arguments are forwarded
// unchanged.
#define __nv_tex_surf_handler(__op, __ptr, ...)                                \
  ::__cuda_tex::__tex_fetch<                                                   \
      ::__cuda_tex::__Tag<::__cuda_tex::__tex_op_hash(__op)>>(__ptr,           \
                                                              __VA_ARGS__)
25
26	#pragma push_macro("__ASM_OUT")
27	#pragma push_macro("__ASM_OUTP")
28	#pragma push_macro("__Args")
29	#pragma push_macro("__ID")
30	#pragma push_macro("__IDV")
31	#pragma push_macro("__IMPL_2DGATHER")
32	#pragma push_macro("__IMPL_ALIAS")
33	#pragma push_macro("__IMPL_ALIASI")
34	#pragma push_macro("__IMPL_F1")
35	#pragma push_macro("__IMPL_F3")
36	#pragma push_macro("__IMPL_F3N")
37	#pragma push_macro("__IMPL_F3S")
38	#pragma push_macro("__IMPL_S")
39	#pragma push_macro("__IMPL_S3")
40	#pragma push_macro("__IMPL_S3I")
41	#pragma push_macro("__IMPL_S3N")
42	#pragma push_macro("__IMPL_S3NI")
43	#pragma push_macro("__IMPL_S3S")
44	#pragma push_macro("__IMPL_S3SI")
45	#pragma push_macro("__IMPL_SI")
46	#pragma push_macro("__L")
47	#pragma push_macro("__STRIP_PARENS")
48
49	// Put all functions into anonymous namespace so they have internal linkage.
50	// The device-only function here must be internal in order to avoid ODR
51	// violations in case they are used from the files compiled with
52	// -fgpu-rdc. E.g. a library and an app using it may be built with a different
53	// version of this header file.
54	namespace {
55
56	// Put the implementation into its own namespace so we don't pollute the TU.
57	namespace __cuda_tex {
58
59	// First, we need a perfect hash function and a few constexpr helper functions
60	// for converting a string literal into a numeric value which can be used to
61	// parametrize a template. We can not use string literals for that as that would
62	// require C++20.
63	//
64	// The hash function was generated with 'gperf' and then manually converted into
65	// its constexpr equivalent.
66	//
67	// NOTE: the perfect hashing scheme comes with inherent self-test. If the hash
68	// function has a collision for any of the texture operations, the compilation
69	// will fail due to an attempt to redefine a tag with the same value. If the
70	// header compiles, then the hash function is good enough for the job.
71
// Helper for __tex_len(): scans forward from index __pos and returns the index
// of the first NUL character, or 32 if that index is reached first. Kept as a
// single return expression so it stays usable where constexpr functions are
// limited to one return statement (C++11).
constexpr int __tex_len_from(const char *s, int __pos) {
  return (__pos == 32 || s[__pos] == 0) ? __pos
                                        : __tex_len_from(s, __pos + 1);
}

// Length of the NUL-terminated string `s`, saturating at 32: if none of the
// first 32 characters is NUL, the result is 32. Characters past the first NUL
// are never read.
constexpr int __tex_len(const char *s) { return __tex_len_from(s, 0); }
107
// Per-character weight table of the perfect hash. Characters that are not
// listed map to 225. The keys are spelled as character literals; they equal
// the numeric ASCII codes 49..120 this table was generated with. '`' is the
// code of '_' plus one, which is what __tex_op_hash() produces for str[7] + 1.
constexpr int __tex_hash_map(int c) {
  return (c == '1')   ? 10
         : (c == '2') ? 0
         : (c == '3') ? 100
         : (c == '4') ? 30
         : (c == 'C') ? 10
         : (c == 'D') ? 0
         : (c == 'E') ? 25
         : (c == 'H') ? 70
         : (c == 'M') ? 0
         : (c == '`') ? 44
         : (c == 'c') ? 10
         : (c == 'd') ? 5
         : (c == 'e') ? 60
         : (c == 'f') ? 40
         : (c == 'g') ? 70
         : (c == 'h') ? 25
         : (c == 'p') ? 0
         : (c == 'r') ? 45
         : (c == 'u') ? 5
         : (c == 'v') ? 85
         : (c == 'x') ? 20
                      : 225;
}
132
// Hash of a texture operation name: the sum of the (saturated) name length and
// the __tex_hash_map() weights of str[5], str[6], str[7] + 1 and the last
// character of the name.
// NOTE: indices 5..7 are read unconditionally, so callers are expected to pass
// names long enough for those reads to be valid.
constexpr int __tex_op_hash(const char *str) {
  return __tex_hash_map(str[5]) + __tex_hash_map(str[6]) +
         __tex_hash_map(str[7] + 1) + __tex_hash_map(str[__tex_len(str) - 1]) +
         __tex_len(str);
}
137
// Tag type to identify a particular texture operation. Only the declaration is
// provided here; the tag is used purely as a template argument. The template
// parameter is left unnamed (like the __TypeInfoT forward declaration) so the
// header does not rely on a non-reserved identifier that user code may have
// #define'd before including it.
template <int> struct __Tag;
// Tag for the operation named by the string literal __op.
#define __ID(__op) __Tag<__tex_op_hash(__op)>
// Tags for variants of a particular operation. E.g. tex2Dgather can translate
// into 4 different instructions. Variant tags are offset by 10000 and leave
// two decimal digits for the variant number.
#define __IDV(__op, __variant)                                                 \
  __Tag<10000 + __tex_op_hash(__op) * 100 + __variant>
145
// Helper classes that map a texel type to its key data types:
//   __base_t  -- the scalar element type;
//   __fetch_t -- the 4-element vector type the data is fetched as.
// E.g. char2 has __base_t = char and __fetch_t = int4.
template <class> struct __TypeInfoT;

// Type info for the fundamental types. float is fetched as float4, the signed
// integer types (and plain char) as int4, and the unsigned ones as uint4.
template <> struct __TypeInfoT<float> {
  typedef float __base_t;
  typedef float4 __fetch_t;
};
template <> struct __TypeInfoT<char> {
  typedef char __base_t;
  typedef int4 __fetch_t;
};
template <> struct __TypeInfoT<signed char> {
  typedef signed char __base_t;
  typedef int4 __fetch_t;
};
template <> struct __TypeInfoT<unsigned char> {
  typedef unsigned char __base_t;
  typedef uint4 __fetch_t;
};
template <> struct __TypeInfoT<short> {
  typedef short __base_t;
  typedef int4 __fetch_t;
};
template <> struct __TypeInfoT<unsigned short> {
  typedef unsigned short __base_t;
  typedef uint4 __fetch_t;
};
template <> struct __TypeInfoT<int> {
  typedef int __base_t;
  typedef int4 __fetch_t;
};
template <> struct __TypeInfoT<unsigned int> {
  typedef unsigned int __base_t;
  typedef uint4 __fetch_t;
};

// Derived base/fetch types for N-element vectors: the base type is the type of
// the vector's .x member, and the fetch type is whatever that base type uses.
template <class __T> struct __TypeInfoT {
  typedef decltype(__T::x) __base_t;
  typedef typename __TypeInfoT<__base_t>::__fetch_t __fetch_t;
};
188
// Primary template of the classes that implement specific texture ops. It is
// only declared here; the __IMPL_* macros and the hand-written gather
// dispatchers provide one specialization per operation tag.
template <class __op> struct __tex_fetch_v4;

// Helper macros to strip parens from a macro argument:
// __L((a, b)) expands to `a, b`.
#define __Args(...) __VA_ARGS__
#define __STRIP_PARENS(__X) __X
#define __L(__X) __STRIP_PARENS(__Args __X)

// Construct inline assembly output args.
// The fetched data is stored in a local 4-element vector named __r; __t is the
// asm register constraint letter used for each of its members.
#define __ASM_OUT(__t)                                                         \
  ("=" __t(__r.x), "=" __t(__r.y), "=" __t(__r.z), "=" __t(__r.w))
// ... possibly followed by a fifth output: the value written through the __ir
// pointer (the isResident result of sparse fetches).
#define __ASM_OUTP(__t) (__L(__ASM_OUT(__t)), "=h"(*__ir))
205
// Implements a single variant of a texture fetch instruction as an explicit
// specialization __run<__dt> returning __rt.
//   __args      parenthesized parameter list following the texture object;
//   __asm_op    complete asm template string;
//   __asm_outs  parenthesized asm output operands (see __ASM_OUT/__ASM_OUTP);
//   __asm_args  parenthesized asm input operands. The texture object is always
//               passed as the first input, ahead of these.
#define __IMPL_F1(__rt, __dt, __args, __asm_op, __asm_outs, __asm_args)        \
  template <>                                                                  \
  __device__ __rt __run<__dt>(cudaTextureObject_t __obj, __L(__args)) {        \
    __rt __r;                                                                  \
    asm(__asm_op : __L(__asm_outs) : "l"(__obj), __L(__asm_args));             \
    return __r;                                                                \
  }

// Implements texture fetch instructions for int4/uint4/float4 data types. The
// asm template is assembled as
//   __asm_op "." <s32|u32|f32> "." __ctype "\t" __asm_op_args.
#define __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args)        \
  __IMPL_F1(int4, int4, __args,                                                \
            __asm_op ".s32." __ctype "\t" __asm_op_args,                       \
            __ASM_OUT("r"), __asm_args)                                        \
  __IMPL_F1(uint4, uint4, __args,                                              \
            __asm_op ".u32." __ctype "\t" __asm_op_args,                       \
            __ASM_OUT("r"), __asm_args)                                        \
  __IMPL_F1(float4, float4, __args,                                            \
            __asm_op ".f32." __ctype "\t" __asm_op_args,                       \
            __ASM_OUT("f"), __asm_args)

// Implements 'sparse' texture fetch instructions for int4/uint4/float4 data
// types. Same as __IMPL_F3, except that the output operands also include the
// value stored through __ir (see __ASM_OUTP).
#define __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args)       \
  __IMPL_F1(int4, int4, __args,                                                \
            __asm_op ".s32." __ctype "\t" __asm_op_args,                       \
            __ASM_OUTP("r"), __asm_args)                                       \
  __IMPL_F1(uint4, uint4, __args,                                              \
            __asm_op ".u32." __ctype "\t" __asm_op_args,                       \
            __ASM_OUTP("r"), __asm_args)                                       \
  __IMPL_F1(float4, float4, __args,                                            \
            __asm_op ".f32." __ctype "\t" __asm_op_args,                       \
            __ASM_OUTP("f"), __asm_args)

// Similar to __IMPL_F3, but for integer data which is returned as normalized
// floats: only the int4/uint4 specializations are created, and both return
// float4.
#define __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args)       \
  __IMPL_F1(float4, int4, __args,                                              \
            __asm_op ".s32." __ctype "\t" __asm_op_args,                       \
            __ASM_OUT("r"), __asm_args)                                        \
  __IMPL_F1(float4, uint4, __args,                                             \
            __asm_op ".u32." __ctype "\t" __asm_op_args,                       \
            __ASM_OUT("r"), __asm_args)
244
// Instantiates __tex_fetch_v4 with regular fetch functions: declares the
// member template __run<__T> and specializes it for int4/uint4/float4 via
// __IMPL_F3. The template parameter uses the reserved spelling __T (as the
// hand-written gather dispatchers do) so a user-defined macro named T cannot
// break the header.
#define __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
  template <> struct __tex_fetch_v4<__op> {                                    \
    template <class __T>                                                       \
    __device__ static __T __run(cudaTextureObject_t __obj, __L(__args));       \
    __IMPL_F3(__args, __asm_op, __ctype, __asm_op_args, __asm_args)            \
  }

// Same, but for sparse ops. Only available on sm_60+; for older device
// architectures the macro expands to nothing. The condition also holds when
// __CUDA_ARCH__ is not defined.
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
#define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args,            \
                    __asm_args)                                                \
  template <> struct __tex_fetch_v4<__op> {                                    \
    template <class __T>                                                       \
    __device__ static __T __run(cudaTextureObject_t __obj, __L(__args));       \
    __IMPL_F3S(__args, __asm_op, __ctype, __asm_op_args, __asm_args)           \
  }
#else
#define __IMPL_S3SI(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#endif

// Same, but for normalized float ops: __run always returns float4 and is
// specialized for the int4/uint4 data types only (see __IMPL_F3N).
#define __IMPL_S3NI(__op, __args, __asm_op, __ctype, __asm_op_args,            \
                    __asm_args)                                                \
  template <> struct __tex_fetch_v4<__op> {                                    \
    template <class __T>                                                       \
    __device__ static float4 __run(cudaTextureObject_t __obj, __L(__args));    \
    __IMPL_F3N(__args, __asm_op, __ctype, __asm_op_args, __asm_args)           \
  }
274
// Regular and normalized float ops share a lot of similarities. This macro
// instantiates both variants from one description -- the regular one for the
// tag __op and the normalized float one for the tag __opn.
#define __IMPL_SI(__op, __opn, __args, __asm_op, __ctype, __asm_op_args,       \
                  __asm_args)                                                  \
  __IMPL_S3I(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args);      \
  __IMPL_S3NI(__opn, __args, __asm_op, __ctype, __asm_op_args, __asm_args)

// Convenience macros which take the operation name as a string literal and
// convert it into a __Tag before forwarding to the tag-based macros above.
#define __IMPL_S3(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args)  \
  __IMPL_S3I(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#define __IMPL_S3S(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
  __IMPL_S3SI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#define __IMPL_S3N(__op, __args, __asm_op, __ctype, __asm_op_args, __asm_args) \
  __IMPL_S3NI(__ID(__op), __args, __asm_op, __ctype, __asm_op_args, __asm_args)
#define __IMPL_S(__op, __opn, __args, __asm_op, __ctype, __asm_op_args,        \
                 __asm_args)                                                   \
  __IMPL_SI(__ID(__op), __ID(__opn), __args, __asm_op, __ctype, __asm_op_args, \
            __asm_args)

// CUDA headers have some 'legacy' texture operations that duplicate
// functionality. Instead of defining a second copy, the specialization for
// __op simply inherits everything from the one for __opn.
#define __IMPL_ALIASI(__op, __opn)                                             \
  template <> struct __tex_fetch_v4<__op> : __tex_fetch_v4<__opn> {}
#define __IMPL_ALIAS(__op, __opn) __IMPL_ALIASI(__ID(__op), __ID(__opn))
299
// Now we can instantiate everything we need for each specific texture fetch
// variant.
//
// Operand numbering in the asm templates below: %0-%3 are the four result
// registers, %4 is the texture object and %5 onwards are the listed inputs in
// order. For the layered variants the layer index is listed first.

// 1D
__IMPL_S("__tex1D_v2", "__tex1D_rmnf_v2",
         (float __x),
         "tex.1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5}];",
         ("f"(__x)));
__IMPL_S("__tex1Dfetch_v2", "__tex1Dfetch_rmnf_v2",
         (int __x),
         "tex.1d.v4", "s32",
         "{%0, %1, %2, %3}, [%4, {%5}];",
         ("r"(__x)));
__IMPL_ALIAS("__itex1D", "__tex1D_v2");
__IMPL_ALIAS("__itex1Dfetch", "__tex1Dfetch_v2");

__IMPL_S("__tex1DGrad_v2", "__tex1DGrad_rmnf_v2",
         (float __x, float __dPdx, float __dPdy),
         "tex.grad.1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5}], {%6}, {%7};",
         ("f"(__x), "f"(__dPdx), "f"(__dPdy)));
__IMPL_ALIAS("__itex1DGrad", "__tex1DGrad_v2");

__IMPL_S("__tex1DLayered_v2", "__tex1DLayered_rmnf_v2",
         (float __x, int __layer),
         "tex.a1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}];",
         ("r"(__layer), "f"(__x)));
__IMPL_ALIAS("__itex1DLayered", "__tex1DLayered_v2");

__IMPL_S("__tex1DLayeredGrad_v2", "__tex1DLayeredGrad_rmnf_v2",
         (float __x, int __layer, float __dPdx, float __dPdy),
         "tex.grad.a1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7}, {%8};",
         ("r"(__layer), "f"(__x), "f"(__dPdx), "f"(__dPdy)));
__IMPL_ALIAS("__itex1DLayeredGrad", "__tex1DLayeredGrad_v2");

__IMPL_S("__tex1DLayeredLod_v2", "__tex1DLayeredLod_rmnf_v2",
         (float __x, int __layer, float __level),
         "tex.level.a1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
         ("r"(__layer), "f"(__x), "f"(__level)));
__IMPL_ALIAS("__itex1DLayeredLod", "__tex1DLayeredLod_v2");

__IMPL_S("__tex1DLod_v2", "__tex1DLod_rmnf_v2",
         (float __x, float __level),
         "tex.level.1d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5}], %6;",
         ("f"(__x), "f"(__level)));
__IMPL_ALIAS("__itex1DLod", "__tex1DLod_v2");
337
// 2D
//
// Operand numbering for the regular ops: %0-%3 are the result registers, %4 is
// the texture object and %5 onwards are the listed inputs in order.
// The sparse ops have one more output (%4, the value stored through __ir), so
// the texture object becomes %5 and the listed inputs start at %6. They fetch
// into the predicate %p0 and then convert it to 1/0 with selp.
// Gradients are passed by pointer and read with ->, and __ir is a pointer that
// the asm output dereferences, so all of them must be declared as pointers.
__IMPL_S("__tex2D_v2", "__tex2D_rmnf_v2", (float __x, float __y), "tex.2d.v4",
         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y)));
__IMPL_ALIAS("__itex2D", "__tex2D_v2");

__IMPL_S3S("__itex2D_sparse", (float __x, float __y, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"
           " selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y)));

__IMPL_S("__tex2DGrad_v2", "__tex2DGrad_rmnf_v2",
         (float __x, float __y, const float2 *__dPdx, const float2 *__dPdy),
         "tex.grad.2d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}], {%7, %8}, {%9, %10};",
         ("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
          "f"(__dPdy->y)));
__IMPL_ALIAS("__itex2DGrad_v2", "__tex2DGrad_v2");

__IMPL_S3S("__itex2DGrad_sparse",
           (float __x, float __y, const float2 *__dPdx, const float2 *__dPdy,
            unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.grad.2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], {%8, %9}, {%10, %11};\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y), "f"(__dPdy->x),
            "f"(__dPdy->y)));

// Layered variants list the layer index first. The coordinate vector has four
// slots, so the last coordinate operand is repeated to fill it.
__IMPL_S("__tex2DLayered_v2", "__tex2DLayered_rmnf_v2",
         (float __x, float __y, int __layer), "tex.a2d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
         ("r"(__layer), "f"(__x), "f"(__y)));
__IMPL_ALIAS("__itex2DLayered", "__tex2DLayered_v2");

__IMPL_S3S("__itex2DLayered_sparse",
           (float __x, float __y, int __layer, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.a2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("r"(__layer), "f"(__x), "f"(__y)));

__IMPL_S("__tex2DLayeredGrad_v2", "__tex2DLayeredGrad_rmnf_v2",
         (float __x, float __y, int __layer, const float2 *__dPdx,
          const float2 *__dPdy),
         "tex.grad.a2d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], {%8, %9}, {%10, %11};",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdy->x), "f"(__dPdy->y)));
__IMPL_ALIAS("__itex2DLayeredGrad_v2", "__tex2DLayeredGrad_v2");

__IMPL_S3S(
    "__itex2DLayeredGrad_sparse",
    (float __x, float __y, int __layer, const float2 *__dPdx,
     const float2 *__dPdy, unsigned char *__ir),
    "{.reg .pred %%p0;\n\t"
    "tex.grad.a2d.v4",
    "f32",
    "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], {%9, %10}, {%11, %12};\n\t"
    "selp.u16 %4, 1, 0, %%p0; }",
    ("r"(__layer), "f"(__x), "f"(__y), "f"(__dPdx->x), "f"(__dPdx->y),
     "f"(__dPdy->x), "f"(__dPdy->y)));

__IMPL_S("__tex2DLayeredLod_v2", "__tex2DLayeredLod_rmnf_v2",
         (float __x, float __y, int __layer, float __level), "tex.level.a2d.v4",
         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));
__IMPL_ALIAS("__itex2DLayeredLod", "__tex2DLayeredLod_v2");

__IMPL_S3S("__itex2DLayeredLod_sparse",
           (float __x, float __y, int __layer, float __level,
            unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.level.a2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("r"(__layer), "f"(__x), "f"(__y), "f"(__level)));

__IMPL_S("__tex2DLod_v2", "__tex2DLod_rmnf_v2",
         (float __x, float __y, float __level), "tex.level.2d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6}], %7;",
         ("f"(__x), "f"(__y), "f"(__level)));
__IMPL_ALIAS("__itex2DLod", "__tex2DLod_v2");

__IMPL_S3S("__itex2DLod_sparse",
           (float __x, float __y, float __level, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.level.2d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}], %8;\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__level)));
436
// 2D gather is special. Unlike other variants that translate into exactly one
// asm instruction, it uses one of the four different instructions selected by
// __comp. We implement each instruction variant separately, and dispatch the
// right one from the manually implemented 'umbrella' fetch.
//
// For a given variant number and instruction this creates the regular and
// normalized float variant tags, aliases the "__itex2Dgather" variant tag to
// the regular one, and creates the sparse variant tag. __comp stays in the
// parameter lists but is not an asm operand; the instruction already encodes
// the component.
#define __IMPL_2DGATHER(variant, instr)                                        \
  __IMPL_SI(__IDV("__tex2Dgather_v2", variant),                                \
            __IDV("__tex2Dgather_rmnf_v2", variant),                           \
            (float __x, float __y, int __comp), instr, "f32",                  \
            "{%0, %1, %2, %3}, [%4, {%5, %6}];", ("f"(__x), "f"(__y)));        \
  __IMPL_ALIASI(__IDV("__itex2Dgather", variant),                              \
                __IDV("__tex2Dgather_v2", variant));                           \
  __IMPL_S3SI(__IDV("__itex2Dgather_sparse", variant),                         \
              (float __x, float __y, unsigned char *__ir, int __comp),         \
              "{.reg .pred %%p0;\n\t" instr, "f32",                            \
              "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7}];\n\t"                     \
              "selp.u16 %4, 1, 0, %%p0; }",                                    \
              ("f"(__x), "f"(__y)));
__IMPL_2DGATHER(0, "tld4.r.2d.v4");
__IMPL_2DGATHER(1, "tld4.g.2d.v4");
__IMPL_2DGATHER(2, "tld4.b.2d.v4");
__IMPL_2DGATHER(3, "tld4.a.2d.v4");
458
// Umbrella dispatcher -- calls into the specific 2Dgather variant selected by
// __comp (0..3, i.e. the tld4.r/.g/.b/.a instantiations above).
template <> struct __tex_fetch_v4<__ID("__tex2Dgather_v2")> {
  template <class __T>
  __device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
                              int __comp) {
    switch (__comp) {
    case 0:
      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 0)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 1:
      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 1)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 2:
      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 2)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 3:
      return __tex_fetch_v4<__IDV("__tex2Dgather_v2", 3)>::__run<__T>(
          __obj, __x, __y, __comp);
    }
    // Only reached when __comp is outside 0..3. Return a value-initialized
    // result instead of flowing off the end of a non-void function, which
    // would be undefined behavior.
    return __T();
  }
};
__IMPL_ALIAS("__itex2Dgather", "__tex2Dgather_v2");
481
// Umbrella dispatcher for the normalized float gather: always returns float4
// and calls into the variant selected by __comp (0..3).
template <> struct __tex_fetch_v4<__ID("__tex2Dgather_rmnf_v2")> {
  template <class __T>
  __device__ static float4 __run(cudaTextureObject_t __obj, float __x,
                                 float __y, int __comp) {
    switch (__comp) {
    case 0:
      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 0)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 1:
      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 1)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 2:
      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 2)>::__run<__T>(
          __obj, __x, __y, __comp);
    case 3:
      return __tex_fetch_v4<__IDV("__tex2Dgather_rmnf_v2", 3)>::__run<__T>(
          __obj, __x, __y, __comp);
    }
    // Only reached when __comp is outside 0..3. Return a value-initialized
    // result instead of flowing off the end of a non-void function, which
    // would be undefined behavior.
    return float4();
  }
};
502
// Umbrella dispatcher for the sparse gather. Guarded by the same condition as
// __IMPL_S3SI, because the variants it calls exist only under that condition.
#if !defined(__CUDA_ARCH__) || (__CUDA_ARCH__ >= 600)
template <> struct __tex_fetch_v4<__ID("__itex2Dgather_sparse")> {
  template <class __T>
  __device__ static __T __run(cudaTextureObject_t __obj, float __x, float __y,
                              unsigned char *__ir, int __comp) {
    switch (__comp) {
    case 0:
      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 0)>::__run<__T>(
          __obj, __x, __y, __ir, __comp);
    case 1:
      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 1)>::__run<__T>(
          __obj, __x, __y, __ir, __comp);
    case 2:
      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 2)>::__run<__T>(
          __obj, __x, __y, __ir, __comp);
    case 3:
      return __tex_fetch_v4<__IDV("__itex2Dgather_sparse", 3)>::__run<__T>(
          __obj, __x, __y, __ir, __comp);
    }
    // Only reached when __comp is outside 0..3. Return a value-initialized
    // result instead of flowing off the end of a non-void function, which
    // would be undefined behavior. *__ir is left untouched in that case.
    return __T();
  }
};
#endif
525
// 3D
//
// Operand numbering for the regular ops: %0-%3 are the result registers, %4 is
// the texture object and %5 onwards are the listed inputs in order.
// The sparse ops have one more output (%4, the value stored through __ir), so
// the texture object becomes %5 and the listed inputs start at %6.
// The coordinate and gradient vectors have four slots; the third operand is
// repeated to fill the last one.
// Gradients are passed by pointer and read with ->, and __ir is a pointer that
// the asm output dereferences, so all of them must be declared as pointers.
__IMPL_S("__tex3D_v2", "__tex3D_rmnf_v2", (float __x, float __y, float __z),
         "tex.3d.v4", "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
         ("f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itex3D", "__tex3D_v2");

__IMPL_S3S("__itex3D_sparse",
           (float __x, float __y, float __z, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.3d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__z)));

__IMPL_S("__tex3DGrad_v2", "__tex3DGrad_rmnf_v2",
         (float __x, float __y, float __z, const float4 *__dPdx,
          const float4 *__dPdy),
         "tex.grad.3d.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
         "{%8, %9, %10, %10}, {%11, %12, %13, %13};",
         ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
__IMPL_ALIAS("__itex3DGrad_v2", "__tex3DGrad_v2");

__IMPL_S3S("__itex3DGrad_sparse",
           (float __x, float __y, float __z, const float4 *__dPdx,
            const float4 *__dPdy, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.grad.3d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], "
           "{%9, %10, %11, %11}, {%12, %13, %14, %14};\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
            "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));

__IMPL_S("__tex3DLod_v2", "__tex3DLod_rmnf_v2",
         (float __x, float __y, float __z, float __level), "tex.level.3d.v4",
         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
         ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itex3DLod", "__tex3DLod_v2");

__IMPL_S3S("__itex3DLod_sparse",
           (float __x, float __y, float __z, float __level,
            unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.level.3d.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}], %9;\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
578
// Cubemap
//
// Operand numbering for the regular ops: %0-%3 are the result registers, %4 is
// the texture object and %5 onwards are the listed inputs in order. The
// non-layered variants repeat the third coordinate to fill the fourth slot of
// the coordinate vector; the layered variants list the layer index first and
// use all four slots.
// The sparse op has one more output (%4, the value stored through __ir), so
// its texture object is %5 and its inputs start at %6.
__IMPL_S("__texCubemap_v2", "__texCubemap_rmnf_v2",
         (float __x, float __y, float __z), "tex.cube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}];",
         ("f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itexCubemap", "__texCubemap_v2");

__IMPL_S3S("__itexCubemap_sparse",
           (float __x, float __y, float __z, unsigned char *__ir),
           "{.reg .pred %%p0;\n\t"
           "tex.cube.v4",
           "f32",
           "{%0, %1, %2, %3}|%%p0, [%5, {%6, %7, %8, %8}];\n\t"
           "selp.u16 %4, 1, 0, %%p0; }",
           ("f"(__x), "f"(__y), "f"(__z)));

__IMPL_S("__texCubemapGrad_v2", "__texCubemapGrad_rmnf_v2",
         (float __x, float __y, float __z, const float4 *__dPdx,
          const float4 *__dPdy),
         "tex.grad.cube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], "
         "{%8, %9, %10, %10}, {%11, %12, %13, %13};",
         ("f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x), "f"(__dPdx->y),
          "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y), "f"(__dPdy->z)));
__IMPL_ALIAS("__itexCubemapGrad_v2", "__texCubemapGrad_v2");

__IMPL_S("__texCubemapLayered_v2", "__texCubemapLayered_rmnf_v2",
         (float __x, float __y, float __z, int __layer), "tex.acube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}];",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__z)));
__IMPL_ALIAS("__itexCubemapLayered", "__texCubemapLayered_v2");

__IMPL_S("__texCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_rmnf_v2",
         (float __x, float __y, float __z, int __layer, const float4 *__dPdx,
          const float4 *__dPdy),
         "tex.grad.acube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], "
         "{%9, %10, %11, %11}, {%12, %13, %14, %14};",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__dPdx->x),
          "f"(__dPdx->y), "f"(__dPdx->z), "f"(__dPdy->x), "f"(__dPdy->y),
          "f"(__dPdy->z)));
__IMPL_ALIAS("__itexCubemapLayeredGrad_v2", "__texCubemapLayeredGrad_v2");

__IMPL_S("__texCubemapLayeredLod_v2", "__texCubemapLayeredLod_rmnf_v2",
         (float __x, float __y, float __z, int __layer, float __level),
         "tex.level.acube.v4", "f32",
         "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %8}], %9;",
         ("r"(__layer), "f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itexCubemapLayeredLod", "__texCubemapLayeredLod_v2");

__IMPL_S("__texCubemapLod_v2", "__texCubemapLod_rmnf_v2",
         (float __x, float __y, float __z, float __level), "tex.level.cube.v4",
         "f32", "{%0, %1, %2, %3}, [%4, {%5, %6, %7, %7}], %8;",
         ("f"(__x), "f"(__y), "f"(__z), "f"(__level)));
__IMPL_ALIAS("__itexCubemapLod", "__texCubemapLod_v2");
634
// Helper class for extracting a slice of data from V4 fetch results.
// __run() builds a __DestT from the first __NElements members (x, y, z, w) of
// the fetched vector. By default __NElements is the size of __DestT divided by
// the size of its base type, i.e. the number of elements __DestT holds.
template <class __DestT, class __SrcT> struct __convert {
  template <int __NElements = sizeof(__DestT) /
                              sizeof(typename __TypeInfoT<__DestT>::__base_t)>
  __device__ static __DestT __run(__SrcT __src);

  // One explicit specialization per supported element count.
  template <> __device__ static __DestT __run<1>(__SrcT __src) {
    return {__src.x};
  }
  template <> __device__ static __DestT __run<2>(__SrcT __src) {
    return {__src.x, __src.y};
  }
  template <> __device__ static __DestT __run<3>(__SrcT __src) {
    return {__src.x, __src.y, __src.z};
  }
  template <> __device__ static __DestT __run<4>(__SrcT __src) {
    return {__src.x, __src.y, __src.z, __src.w};
  }
};
651
// These are the top-level function overloads the __nv_tex_surf_handler macro
// expands to. Each overload deals with one of the several ways
// __nv_tex_surf_handler is called by CUDA headers. In the end, each of them
// does the same job -- it figures out which `__tex_fetch_v4::__run` variant
// should be used to fetch the texture data and which `__convert::__run` is
// needed to convert it into the appropriate return type.

// __nv_tex_surf_handler("__tex...", &ret, cudaTextureObject_t handle, args...);
// Both the data type and the return type are derived from the type of ret.
template <class __op, class __T, class... __Args>
__device__ static void __tex_fetch(__T *__ptr, cudaTextureObject_t __handle,
                                   __Args... __args) {
  typedef typename __TypeInfoT<__T>::__fetch_t __FetchT;
  // Fetch the full 4-element vector first, then narrow it to __T.
  const __FetchT __texel =
      __tex_fetch_v4<__op>::template __run<__FetchT>(__handle, __args...);
  *__ptr = __convert<__T, __FetchT>::__run(__texel);
}
668
669	#if CUDA_VERSION < 12000
// texture<> objects get magically converted into a texture reference. However,
// there is no way to convert them to cudaTextureObject_t at the C++ level. So
// we cheat a bit and use inline assembly: a 64-bit move copies the handle into
// a cudaTextureObject_t. It costs us an extra register and a move, but that is
// easy for ptxas to optimize away.
template <class __T>
__device__ cudaTextureObject_t __tex_handle_to_obj(__T __handle) {
  cudaTextureObject_t __result;
  asm("mov.b64 %0, %1; " : "=l"(__result) : "l"(__handle));
  return __result;
}
680
// __nv_tex_surf_handler ("__tex...", &ret, textureReference, args...);
// Both the data type and the return type are derived from the type of ret. The
// handle is first turned into a texture object with __tex_handle_to_obj().
template <class __op, class __T, class __HandleT, class... __Args>
__device__ static void __tex_fetch(__T *__ptr, __HandleT __handle,
                                   __Args... __args) {
  typedef typename __TypeInfoT<__T>::__fetch_t __FetchT;
  const cudaTextureObject_t __obj = __tex_handle_to_obj(__handle);
  *__ptr = __convert<__T, __FetchT>::__run(
      __tex_fetch_v4<__op>::template __run<__FetchT>(__obj, __args...));
}
691
// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...);
// cudaReadModeNormalizedFloat fetches always return float4.
// The first parameter is an unnamed pointer used only to deduce __DataT; the
// result is stored through the second pointer, __ptr. The fetch variant is
// chosen by the fetch type of __DataT, while the conversion to __RetT starts
// from float4.
template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
__device__ static void
__tex_fetch(__DataT *, __RetT *__ptr,
            texture<__DataT, __TexT, cudaReadModeNormalizedFloat> __handle,
            __Args... __args) {
  using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t;
  *__ptr = __convert<__RetT, float4>::__run(
      __tex_fetch_v4<__op>::template __run<__FetchT>(
          __tex_handle_to_obj(__handle), __args...));
}

// __nv_tex_surf_handler ("__tex...", &type_dummy, &ret, texture<...>, args...);
// For cudaReadModeElementType fetch return type is based on type_dummy.
// As above, the first parameter only carries __DataT and the result is stored
// through __ptr; here the fetched vector is converted to __RetT from the fetch
// type of __DataT.
template <class __op, class __DataT, class __RetT, int __TexT, class... __Args>
__device__ static void
__tex_fetch(__DataT *, __RetT *__ptr,
            texture<__DataT, __TexT, cudaReadModeElementType> __handle,
            __Args... __args) {
  using __FetchT = typename __TypeInfoT<__DataT>::__fetch_t;
  *__ptr = __convert<__RetT, __FetchT>::__run(
      __tex_fetch_v4<__op>::template __run<__FetchT>(
          __tex_handle_to_obj(__handle), __args...));
}
717	#endif // CUDA_VERSION
718	} // namespace __cuda_tex
719	} // namespace
720	#pragma pop_macro("__ASM_OUT")
721	#pragma pop_macro("__ASM_OUTP")
722	#pragma pop_macro("__Args")
723	#pragma pop_macro("__ID")
724	#pragma pop_macro("__IDV")
725	#pragma pop_macro("__IMPL_2DGATHER")
726	#pragma pop_macro("__IMPL_ALIAS")
727	#pragma pop_macro("__IMPL_ALIASI")
728	#pragma pop_macro("__IMPL_F1")
729	#pragma pop_macro("__IMPL_F3")
730	#pragma pop_macro("__IMPL_F3N")
731	#pragma pop_macro("__IMPL_F3S")
732	#pragma pop_macro("__IMPL_S")
733	#pragma pop_macro("__IMPL_S3")
734	#pragma pop_macro("__IMPL_S3I")
735	#pragma pop_macro("__IMPL_S3N")
736	#pragma pop_macro("__IMPL_S3NI")
737	#pragma pop_macro("__IMPL_S3S")
738	#pragma pop_macro("__IMPL_S3SI")
739	#pragma pop_macro("__IMPL_SI")
740	#pragma pop_macro("__L")
741	#pragma pop_macro("__STRIP_PARENS")
742	#endif // __CLANG_CUDA_TEXTURE_INTRINSICS_H__
743

Warning: This file is not a C or C++ file. It does not have highlighting.

source code of clang/lib/Headers/__clang_cuda_texture_intrinsics.h