1// Copyright (C) 2018 The Qt Company Ltd.
2// Copyright (C) 2018 Intel Corporation.
3// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
4
5#include "qdrawhelper_p.h"
6#include "qdrawhelper_x86_p.h"
7#include "qdrawingprimitive_sse2_p.h"
8#include "qpixellayout_p.h"
9#include "qrgba64_p.h"
10
11#if defined(QT_COMPILER_SUPPORTS_AVX2)
12
13QT_BEGIN_NAMESPACE
14
15enum {
16 FixedScale = 1 << 16,
17 HalfPoint = 1 << 15
18};
19
20// Vectorized blend functions:
21
22// See BYTE_MUL_SSE2 for details.
23inline static void Q_DECL_VECTORCALL
24BYTE_MUL_AVX2(__m256i &pixelVector, __m256i alphaChannel, __m256i colorMask, __m256i half)
25{
26 __m256i pixelVectorAG = _mm256_srli_epi16(a: pixelVector, count: 8);
27 __m256i pixelVectorRB = _mm256_and_si256(a: pixelVector, b: colorMask);
28
29 pixelVectorAG = _mm256_mullo_epi16(a: pixelVectorAG, b: alphaChannel);
30 pixelVectorRB = _mm256_mullo_epi16(a: pixelVectorRB, b: alphaChannel);
31
32 pixelVectorRB = _mm256_add_epi16(a: pixelVectorRB, b: _mm256_srli_epi16(a: pixelVectorRB, count: 8));
33 pixelVectorAG = _mm256_add_epi16(a: pixelVectorAG, b: _mm256_srli_epi16(a: pixelVectorAG, count: 8));
34 pixelVectorRB = _mm256_add_epi16(a: pixelVectorRB, b: half);
35 pixelVectorAG = _mm256_add_epi16(a: pixelVectorAG, b: half);
36
37 pixelVectorRB = _mm256_srli_epi16(a: pixelVectorRB, count: 8);
38 pixelVectorAG = _mm256_andnot_si256(a: colorMask, b: pixelVectorAG);
39
40 pixelVector = _mm256_or_si256(a: pixelVectorAG, b: pixelVectorRB);
41}
42
43inline static void Q_DECL_VECTORCALL
44BYTE_MUL_RGB64_AVX2(__m256i &pixelVector, __m256i alphaChannel, __m256i colorMask, __m256i half)
45{
46 __m256i pixelVectorAG = _mm256_srli_epi32(a: pixelVector, count: 16);
47 __m256i pixelVectorRB = _mm256_and_si256(a: pixelVector, b: colorMask);
48
49 pixelVectorAG = _mm256_mullo_epi32(a: pixelVectorAG, b: alphaChannel);
50 pixelVectorRB = _mm256_mullo_epi32(a: pixelVectorRB, b: alphaChannel);
51
52 pixelVectorRB = _mm256_add_epi32(a: pixelVectorRB, b: _mm256_srli_epi32(a: pixelVectorRB, count: 16));
53 pixelVectorAG = _mm256_add_epi32(a: pixelVectorAG, b: _mm256_srli_epi32(a: pixelVectorAG, count: 16));
54 pixelVectorRB = _mm256_add_epi32(a: pixelVectorRB, b: half);
55 pixelVectorAG = _mm256_add_epi32(a: pixelVectorAG, b: half);
56
57 pixelVectorRB = _mm256_srli_epi32(a: pixelVectorRB, count: 16);
58 pixelVectorAG = _mm256_andnot_si256(a: colorMask, b: pixelVectorAG);
59
60 pixelVector = _mm256_or_si256(a: pixelVectorAG, b: pixelVectorRB);
61}
62
63// See INTERPOLATE_PIXEL_255_SSE2 for details.
64inline static void Q_DECL_VECTORCALL
65INTERPOLATE_PIXEL_255_AVX2(__m256i srcVector, __m256i &dstVector, __m256i alphaChannel, __m256i oneMinusAlphaChannel, __m256i colorMask, __m256i half)
66{
67 const __m256i srcVectorAG = _mm256_srli_epi16(a: srcVector, count: 8);
68 const __m256i dstVectorAG = _mm256_srli_epi16(a: dstVector, count: 8);
69 const __m256i srcVectorRB = _mm256_and_si256(a: srcVector, b: colorMask);
70 const __m256i dstVectorRB = _mm256_and_si256(a: dstVector, b: colorMask);
71 const __m256i srcVectorAGalpha = _mm256_mullo_epi16(a: srcVectorAG, b: alphaChannel);
72 const __m256i srcVectorRBalpha = _mm256_mullo_epi16(a: srcVectorRB, b: alphaChannel);
73 const __m256i dstVectorAGoneMinusAlpha = _mm256_mullo_epi16(a: dstVectorAG, b: oneMinusAlphaChannel);
74 const __m256i dstVectorRBoneMinusAlpha = _mm256_mullo_epi16(a: dstVectorRB, b: oneMinusAlphaChannel);
75 __m256i finalAG = _mm256_add_epi16(a: srcVectorAGalpha, b: dstVectorAGoneMinusAlpha);
76 __m256i finalRB = _mm256_add_epi16(a: srcVectorRBalpha, b: dstVectorRBoneMinusAlpha);
77 finalAG = _mm256_add_epi16(a: finalAG, b: _mm256_srli_epi16(a: finalAG, count: 8));
78 finalRB = _mm256_add_epi16(a: finalRB, b: _mm256_srli_epi16(a: finalRB, count: 8));
79 finalAG = _mm256_add_epi16(a: finalAG, b: half);
80 finalRB = _mm256_add_epi16(a: finalRB, b: half);
81 finalAG = _mm256_andnot_si256(a: colorMask, b: finalAG);
82 finalRB = _mm256_srli_epi16(a: finalRB, count: 8);
83
84 dstVector = _mm256_or_si256(a: finalAG, b: finalRB);
85}
86
87inline static void Q_DECL_VECTORCALL
88INTERPOLATE_PIXEL_RGB64_AVX2(__m256i srcVector, __m256i &dstVector, __m256i alphaChannel, __m256i oneMinusAlphaChannel, __m256i colorMask, __m256i half)
89{
90 const __m256i srcVectorAG = _mm256_srli_epi32(a: srcVector, count: 16);
91 const __m256i dstVectorAG = _mm256_srli_epi32(a: dstVector, count: 16);
92 const __m256i srcVectorRB = _mm256_and_si256(a: srcVector, b: colorMask);
93 const __m256i dstVectorRB = _mm256_and_si256(a: dstVector, b: colorMask);
94 const __m256i srcVectorAGalpha = _mm256_mullo_epi32(a: srcVectorAG, b: alphaChannel);
95 const __m256i srcVectorRBalpha = _mm256_mullo_epi32(a: srcVectorRB, b: alphaChannel);
96 const __m256i dstVectorAGoneMinusAlpha = _mm256_mullo_epi32(a: dstVectorAG, b: oneMinusAlphaChannel);
97 const __m256i dstVectorRBoneMinusAlpha = _mm256_mullo_epi32(a: dstVectorRB, b: oneMinusAlphaChannel);
98 __m256i finalAG = _mm256_add_epi32(a: srcVectorAGalpha, b: dstVectorAGoneMinusAlpha);
99 __m256i finalRB = _mm256_add_epi32(a: srcVectorRBalpha, b: dstVectorRBoneMinusAlpha);
100 finalAG = _mm256_add_epi32(a: finalAG, b: _mm256_srli_epi32(a: finalAG, count: 16));
101 finalRB = _mm256_add_epi32(a: finalRB, b: _mm256_srli_epi32(a: finalRB, count: 16));
102 finalAG = _mm256_add_epi32(a: finalAG, b: half);
103 finalRB = _mm256_add_epi32(a: finalRB, b: half);
104 finalAG = _mm256_andnot_si256(a: colorMask, b: finalAG);
105 finalRB = _mm256_srli_epi32(a: finalRB, count: 16);
106
107 dstVector = _mm256_or_si256(a: finalAG, b: finalRB);
108}
109
110// See BLEND_SOURCE_OVER_ARGB32_SSE2 for details.
111inline static void Q_DECL_VECTORCALL BLEND_SOURCE_OVER_ARGB32_AVX2(quint32 *dst, const quint32 *src, const int length)
112{
113 const __m256i half = _mm256_set1_epi16(w: 0x80);
114 const __m256i one = _mm256_set1_epi16(w: 0xff);
115 const __m256i colorMask = _mm256_set1_epi32(i: 0x00ff00ff);
116 const __m256i alphaMask = _mm256_set1_epi32(i: 0xff000000);
117 const __m256i offsetMask = _mm256_setr_epi32(i0: 0, i1: 1, i2: 2, i3: 3, i4: 4, i5: 5, i6: 6, i7: 7);
118 const __m256i alphaShuffleMask = _mm256_set_epi8(b31: char(0xff),b30: 15,b29: char(0xff),b28: 15,b27: char(0xff),b26: 11,b25: char(0xff),b24: 11,b23: char(0xff),b22: 7,b21: char(0xff),b20: 7,b19: char(0xff),b18: 3,b17: char(0xff),b16: 3,
119 b15: char(0xff),b14: 15,b13: char(0xff),b12: 15,b11: char(0xff),b10: 11,b09: char(0xff),b08: 11,b07: char(0xff),b06: 7,b05: char(0xff),b04: 7,b03: char(0xff),b02: 3,b01: char(0xff),b00: 3);
120
121 const int minusOffsetToAlignDstOn32Bytes = (reinterpret_cast<quintptr>(dst) >> 2) & 0x7;
122
123 int x = 0;
124 // Prologue to handle all pixels until dst is 32-byte aligned in one step.
125 if (minusOffsetToAlignDstOn32Bytes != 0 && x < (length - 7)) {
126 const __m256i prologueMask = _mm256_sub_epi32(a: _mm256_set1_epi32(i: minusOffsetToAlignDstOn32Bytes - 1), b: offsetMask);
127 const __m256i srcVector = _mm256_maskload_epi32(X: (const int *)&src[x - minusOffsetToAlignDstOn32Bytes], M: prologueMask);
128 const __m256i prologueAlphaMask = _mm256_blendv_epi8(V1: _mm256_setzero_si256(), V2: alphaMask, M: prologueMask);
129 if (!_mm256_testz_si256(a: srcVector, b: prologueAlphaMask)) {
130 if (_mm256_testc_si256(a: srcVector, b: prologueAlphaMask)) {
131 _mm256_maskstore_epi32(X: (int *)&dst[x - minusOffsetToAlignDstOn32Bytes], M: prologueMask, Y: srcVector);
132 } else {
133 __m256i alphaChannel = _mm256_shuffle_epi8(a: srcVector, b: alphaShuffleMask);
134 alphaChannel = _mm256_sub_epi16(a: one, b: alphaChannel);
135 __m256i dstVector = _mm256_maskload_epi32(X: (int *)&dst[x - minusOffsetToAlignDstOn32Bytes], M: prologueMask);
136 BYTE_MUL_AVX2(pixelVector&: dstVector, alphaChannel, colorMask, half);
137 dstVector = _mm256_add_epi8(a: dstVector, b: srcVector);
138 _mm256_maskstore_epi32(X: (int *)&dst[x - minusOffsetToAlignDstOn32Bytes], M: prologueMask, Y: dstVector);
139 }
140 }
141 x += (8 - minusOffsetToAlignDstOn32Bytes);
142 }
143
144 for (; x < (length - 7); x += 8) {
145 const __m256i srcVector = _mm256_lddqu_si256(p: (const __m256i *)&src[x]);
146 if (!_mm256_testz_si256(a: srcVector, b: alphaMask)) {
147 if (_mm256_testc_si256(a: srcVector, b: alphaMask)) {
148 _mm256_store_si256(p: (__m256i *)&dst[x], a: srcVector);
149 } else {
150 __m256i alphaChannel = _mm256_shuffle_epi8(a: srcVector, b: alphaShuffleMask);
151 alphaChannel = _mm256_sub_epi16(a: one, b: alphaChannel);
152 __m256i dstVector = _mm256_load_si256(p: (__m256i *)&dst[x]);
153 BYTE_MUL_AVX2(pixelVector&: dstVector, alphaChannel, colorMask, half);
154 dstVector = _mm256_add_epi8(a: dstVector, b: srcVector);
155 _mm256_store_si256(p: (__m256i *)&dst[x], a: dstVector);
156 }
157 }
158 }
159
160 // Epilogue to handle all remaining pixels in one step.
161 if (x < length) {
162 const __m256i epilogueMask = _mm256_add_epi32(a: offsetMask, b: _mm256_set1_epi32(i: x - length));
163 const __m256i srcVector = _mm256_maskload_epi32(X: (const int *)&src[x], M: epilogueMask);
164 const __m256i epilogueAlphaMask = _mm256_blendv_epi8(V1: _mm256_setzero_si256(), V2: alphaMask, M: epilogueMask);
165 if (!_mm256_testz_si256(a: srcVector, b: epilogueAlphaMask)) {
166 if (_mm256_testc_si256(a: srcVector, b: epilogueAlphaMask)) {
167 _mm256_maskstore_epi32(X: (int *)&dst[x], M: epilogueMask, Y: srcVector);
168 } else {
169 __m256i alphaChannel = _mm256_shuffle_epi8(a: srcVector, b: alphaShuffleMask);
170 alphaChannel = _mm256_sub_epi16(a: one, b: alphaChannel);
171 __m256i dstVector = _mm256_maskload_epi32(X: (int *)&dst[x], M: epilogueMask);
172 BYTE_MUL_AVX2(pixelVector&: dstVector, alphaChannel, colorMask, half);
173 dstVector = _mm256_add_epi8(a: dstVector, b: srcVector);
174 _mm256_maskstore_epi32(X: (int *)&dst[x], M: epilogueMask, Y: dstVector);
175 }
176 }
177 }
178}
179
180
181// See BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2 for details.
182inline static void Q_DECL_VECTORCALL
183BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2(quint32 *dst, const quint32 *src, const int length, const int const_alpha)
184{
185 int x = 0;
186
187 ALIGNMENT_PROLOGUE_32BYTES(dst, x, length)
188 blend_pixel(dst&: dst[x], src: src[x], const_alpha);
189
190 const __m256i half = _mm256_set1_epi16(w: 0x80);
191 const __m256i one = _mm256_set1_epi16(w: 0xff);
192 const __m256i colorMask = _mm256_set1_epi32(i: 0x00ff00ff);
193 const __m256i alphaMask = _mm256_set1_epi32(i: 0xff000000);
194 const __m256i alphaShuffleMask = _mm256_set_epi8(b31: char(0xff),b30: 15,b29: char(0xff),b28: 15,b27: char(0xff),b26: 11,b25: char(0xff),b24: 11,b23: char(0xff),b22: 7,b21: char(0xff),b20: 7,b19: char(0xff),b18: 3,b17: char(0xff),b16: 3,
195 b15: char(0xff),b14: 15,b13: char(0xff),b12: 15,b11: char(0xff),b10: 11,b09: char(0xff),b08: 11,b07: char(0xff),b06: 7,b05: char(0xff),b04: 7,b03: char(0xff),b02: 3,b01: char(0xff),b00: 3);
196 const __m256i constAlphaVector = _mm256_set1_epi16(w: const_alpha);
197 for (; x < (length - 7); x += 8) {
198 __m256i srcVector = _mm256_lddqu_si256(p: (const __m256i *)&src[x]);
199 if (!_mm256_testz_si256(a: srcVector, b: alphaMask)) {
200 BYTE_MUL_AVX2(pixelVector&: srcVector, alphaChannel: constAlphaVector, colorMask, half);
201
202 __m256i alphaChannel = _mm256_shuffle_epi8(a: srcVector, b: alphaShuffleMask);
203 alphaChannel = _mm256_sub_epi16(a: one, b: alphaChannel);
204 __m256i dstVector = _mm256_load_si256(p: (__m256i *)&dst[x]);
205 BYTE_MUL_AVX2(pixelVector&: dstVector, alphaChannel, colorMask, half);
206 dstVector = _mm256_add_epi8(a: dstVector, b: srcVector);
207 _mm256_store_si256(p: (__m256i *)&dst[x], a: dstVector);
208 }
209 }
210 SIMD_EPILOGUE(x, length, 7)
211 blend_pixel(dst&: dst[x], src: src[x], const_alpha);
212}
213
214void qt_blend_argb32_on_argb32_avx2(uchar *destPixels, int dbpl,
215 const uchar *srcPixels, int sbpl,
216 int w, int h,
217 int const_alpha)
218{
219 if (const_alpha == 256) {
220 for (int y = 0; y < h; ++y) {
221 const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels);
222 quint32 *dst = reinterpret_cast<quint32 *>(destPixels);
223 BLEND_SOURCE_OVER_ARGB32_AVX2(dst, src, length: w);
224 destPixels += dbpl;
225 srcPixels += sbpl;
226 }
227 } else if (const_alpha != 0) {
228 const_alpha = (const_alpha * 255) >> 8;
229 for (int y = 0; y < h; ++y) {
230 const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels);
231 quint32 *dst = reinterpret_cast<quint32 *>(destPixels);
232 BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2(dst, src, length: w, const_alpha);
233 destPixels += dbpl;
234 srcPixels += sbpl;
235 }
236 }
237}
238
239void qt_blend_rgb32_on_rgb32_avx2(uchar *destPixels, int dbpl,
240 const uchar *srcPixels, int sbpl,
241 int w, int h,
242 int const_alpha)
243{
244 if (const_alpha == 256) {
245 for (int y = 0; y < h; ++y) {
246 const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels);
247 quint32 *dst = reinterpret_cast<quint32 *>(destPixels);
248 ::memcpy(dest: dst, src: src, n: w * sizeof(uint));
249 srcPixels += sbpl;
250 destPixels += dbpl;
251 }
252 return;
253 }
254 if (const_alpha == 0)
255 return;
256
257 const __m256i half = _mm256_set1_epi16(w: 0x80);
258 const __m256i colorMask = _mm256_set1_epi32(i: 0x00ff00ff);
259
260 const_alpha = (const_alpha * 255) >> 8;
261 int one_minus_const_alpha = 255 - const_alpha;
262 const __m256i constAlphaVector = _mm256_set1_epi16(w: const_alpha);
263 const __m256i oneMinusConstAlpha = _mm256_set1_epi16(w: one_minus_const_alpha);
264 for (int y = 0; y < h; ++y) {
265 const quint32 *src = reinterpret_cast<const quint32 *>(srcPixels);
266 quint32 *dst = reinterpret_cast<quint32 *>(destPixels);
267 int x = 0;
268
269 // First, align dest to 32 bytes:
270 ALIGNMENT_PROLOGUE_32BYTES(dst, x, w)
271 dst[x] = INTERPOLATE_PIXEL_255(x: src[x], a: const_alpha, y: dst[x], b: one_minus_const_alpha);
272
273 // 2) interpolate pixels with AVX2
274 for (; x < (w - 7); x += 8) {
275 const __m256i srcVector = _mm256_lddqu_si256(p: (const __m256i *)&src[x]);
276 __m256i dstVector = _mm256_load_si256(p: (__m256i *)&dst[x]);
277 INTERPOLATE_PIXEL_255_AVX2(srcVector, dstVector, alphaChannel: constAlphaVector, oneMinusAlphaChannel: oneMinusConstAlpha, colorMask, half);
278 _mm256_store_si256(p: (__m256i *)&dst[x], a: dstVector);
279 }
280
281 // 3) Epilogue
282 SIMD_EPILOGUE(x, w, 7)
283 dst[x] = INTERPOLATE_PIXEL_255(x: src[x], a: const_alpha, y: dst[x], b: one_minus_const_alpha);
284
285 srcPixels += sbpl;
286 destPixels += dbpl;
287 }
288}
289
290static Q_NEVER_INLINE
291void Q_DECL_VECTORCALL qt_memfillXX_avx2(uchar *dest, __m256i value256, qsizetype bytes)
292{
293 __m128i value128 = _mm256_castsi256_si128(a: value256);
294
295 // main body
296 __m256i *dst256 = reinterpret_cast<__m256i *>(dest);
297 uchar *end = dest + bytes;
298 while (reinterpret_cast<uchar *>(dst256 + 4) <= end) {
299 _mm256_storeu_si256(p: dst256 + 0, a: value256);
300 _mm256_storeu_si256(p: dst256 + 1, a: value256);
301 _mm256_storeu_si256(p: dst256 + 2, a: value256);
302 _mm256_storeu_si256(p: dst256 + 3, a: value256);
303 dst256 += 4;
304 }
305
306 // first epilogue: fewer than 128 bytes / 32 entries
307 bytes = end - reinterpret_cast<uchar *>(dst256);
308 switch (bytes / sizeof(value256)) {
309 case 3: _mm256_storeu_si256(p: dst256++, a: value256); Q_FALLTHROUGH();
310 case 2: _mm256_storeu_si256(p: dst256++, a: value256); Q_FALLTHROUGH();
311 case 1: _mm256_storeu_si256(p: dst256++, a: value256);
312 }
313
314 // second epilogue: fewer than 32 bytes
315 __m128i *dst128 = reinterpret_cast<__m128i *>(dst256);
316 if (bytes & sizeof(value128))
317 _mm_storeu_si128(p: dst128++, b: value128);
318
319 // third epilogue: fewer than 16 bytes
320 if (bytes & 8)
321 _mm_storel_epi64(p: reinterpret_cast<__m128i *>(end - 8), a: value128);
322}
323
324void qt_memfill64_avx2(quint64 *dest, quint64 value, qsizetype count)
325{
326#if defined(Q_CC_GNU) && !defined(Q_CC_CLANG)
327 // work around https://gcc.gnu.org/bugzilla/show_bug.cgi?id=80820
328 __m128i value64 = _mm_set_epi64x(0, value); // _mm_cvtsi64_si128(value);
329# ifdef Q_PROCESSOR_X86_64
330 asm ("" : "+x" (value64));
331# endif
332 __m256i value256 = _mm256_broadcastq_epi64(value64);
333#else
334 __m256i value256 = _mm256_set1_epi64x(q: value);
335#endif
336
337 qt_memfillXX_avx2(dest: reinterpret_cast<uchar *>(dest), value256, bytes: count * sizeof(quint64));
338}
339
340void qt_memfill32_avx2(quint32 *dest, quint32 value, qsizetype count)
341{
342 if (count % 2) {
343 // odd number of pixels, round to even
344 *dest++ = value;
345 --count;
346 }
347 qt_memfillXX_avx2(dest: reinterpret_cast<uchar *>(dest), value256: _mm256_set1_epi32(i: value), bytes: count * sizeof(quint32));
348}
349
350void QT_FASTCALL comp_func_SourceOver_avx2(uint *destPixels, const uint *srcPixels, int length, uint const_alpha)
351{
352 Q_ASSERT(const_alpha < 256);
353
354 const quint32 *src = (const quint32 *) srcPixels;
355 quint32 *dst = (quint32 *) destPixels;
356
357 if (const_alpha == 255)
358 BLEND_SOURCE_OVER_ARGB32_AVX2(dst, src, length);
359 else
360 BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_AVX2(dst, src, length, const_alpha);
361}
362
363#if QT_CONFIG(raster_64bit)
364void QT_FASTCALL comp_func_SourceOver_rgb64_avx2(QRgba64 *dst, const QRgba64 *src, int length, uint const_alpha)
365{
366 Q_ASSERT(const_alpha < 256); // const_alpha is in [0-255]
367 const __m256i half = _mm256_set1_epi32(i: 0x8000);
368 const __m256i one = _mm256_set1_epi32(i: 0xffff);
369 const __m256i colorMask = _mm256_set1_epi32(i: 0x0000ffff);
370 __m256i alphaMask = _mm256_set1_epi32(i: 0xff000000);
371 alphaMask = _mm256_unpacklo_epi8(a: alphaMask, b: alphaMask);
372 const __m256i alphaShuffleMask = _mm256_set_epi8(b31: char(0xff),b30: char(0xff),b29: 15,b28: 14,b27: char(0xff),b26: char(0xff),b25: 15,b24: 14,b23: char(0xff),b22: char(0xff),b21: 7,b20: 6,b19: char(0xff),b18: char(0xff),b17: 7,b16: 6,
373 b15: char(0xff),b14: char(0xff),b13: 15,b12: 14,b11: char(0xff),b10: char(0xff),b09: 15,b08: 14,b07: char(0xff),b06: char(0xff),b05: 7,b04: 6,b03: char(0xff),b02: char(0xff),b01: 7,b00: 6);
374
375 if (const_alpha == 255) {
376 int x = 0;
377 for (; x < length && (quintptr(dst + x) & 31); ++x)
378 blend_pixel(dst&: dst[x], src: src[x]);
379 for (; x < length - 3; x += 4) {
380 const __m256i srcVector = _mm256_lddqu_si256(p: (const __m256i *)&src[x]);
381 if (!_mm256_testz_si256(a: srcVector, b: alphaMask)) {
382 // Not all transparent
383 if (_mm256_testc_si256(a: srcVector, b: alphaMask)) {
384 // All opaque
385 _mm256_store_si256(p: (__m256i *)&dst[x], a: srcVector);
386 } else {
387 __m256i alphaChannel = _mm256_shuffle_epi8(a: srcVector, b: alphaShuffleMask);
388 alphaChannel = _mm256_sub_epi32(a: one, b: alphaChannel);
389 __m256i dstVector = _mm256_load_si256(p: (__m256i *)&dst[x]);
390 BYTE_MUL_RGB64_AVX2(pixelVector&: dstVector, alphaChannel, colorMask, half);
391 dstVector = _mm256_add_epi16(a: dstVector, b: srcVector);
392 _mm256_store_si256(p: (__m256i *)&dst[x], a: dstVector);
393 }
394 }
395 }
396 SIMD_EPILOGUE(x, length, 3)
397 blend_pixel(dst&: dst[x], src: src[x]);
398 } else {
399 const __m256i constAlphaVector = _mm256_set1_epi32(i: const_alpha | (const_alpha << 8));
400 int x = 0;
401 for (; x < length && (quintptr(dst + x) & 31); ++x)
402 blend_pixel(dst&: dst[x], src: src[x], const_alpha);
403 for (; x < length - 3; x += 4) {
404 __m256i srcVector = _mm256_lddqu_si256(p: (const __m256i *)&src[x]);
405 if (!_mm256_testz_si256(a: srcVector, b: alphaMask)) {
406 // Not all transparent
407 BYTE_MUL_RGB64_AVX2(pixelVector&: srcVector, alphaChannel: constAlphaVector, colorMask, half);
408
409 __m256i alphaChannel = _mm256_shuffle_epi8(a: srcVector, b: alphaShuffleMask);
410 alphaChannel = _mm256_sub_epi32(a: one, b: alphaChannel);
411 __m256i dstVector = _mm256_load_si256(p: (__m256i *)&dst[x]);
412 BYTE_MUL_RGB64_AVX2(pixelVector&: dstVector, alphaChannel, colorMask, half);
413 dstVector = _mm256_add_epi16(a: dstVector, b: srcVector);
414 _mm256_store_si256(p: (__m256i *)&dst[x], a: dstVector);
415 }
416 }
417 SIMD_EPILOGUE(x, length, 3)
418 blend_pixel(dst&: dst[x], src: src[x], const_alpha);
419 }
420}
421#endif
422
423#if QT_CONFIG(raster_fp)
424void QT_FASTCALL comp_func_SourceOver_rgbafp_avx2(QRgbaFloat32 *dst, const QRgbaFloat32 *src, int length, uint const_alpha)
425{
426 Q_ASSERT(const_alpha < 256); // const_alpha is in [0-255]
427
428 const float a = const_alpha / 255.0f;
429 const __m128 one = _mm_set1_ps(w: 1.0f);
430 const __m128 constAlphaVector = _mm_set1_ps(w: a);
431 const __m256 one256 = _mm256_set1_ps(w: 1.0f);
432 const __m256 constAlphaVector256 = _mm256_set1_ps(w: a);
433 int x = 0;
434 for (; x < length - 1; x += 2) {
435 __m256 srcVector = _mm256_loadu_ps(p: (const float *)&src[x]);
436 __m256 dstVector = _mm256_loadu_ps(p: (const float *)&dst[x]);
437 srcVector = _mm256_mul_ps(a: srcVector, b: constAlphaVector256);
438 __m256 alphaChannel = _mm256_permute_ps(srcVector, _MM_SHUFFLE(3, 3, 3, 3));
439 alphaChannel = _mm256_sub_ps(a: one256, b: alphaChannel);
440 dstVector = _mm256_mul_ps(a: dstVector, b: alphaChannel);
441 dstVector = _mm256_add_ps(a: dstVector, b: srcVector);
442 _mm256_storeu_ps(p: (float *)(dst + x), a: dstVector);
443 }
444 if (x < length) {
445 __m128 srcVector = _mm_load_ps(p: (float *)(src + x));
446 __m128 dstVector = _mm_load_ps(p: (const float *)(dst + x));
447 srcVector = _mm_mul_ps(a: srcVector, b: constAlphaVector);
448 __m128 alphaChannel = _mm_permute_ps(srcVector, _MM_SHUFFLE(3, 3, 3, 3));
449 alphaChannel = _mm_sub_ps(a: one, b: alphaChannel);
450 dstVector = _mm_mul_ps(a: dstVector, b: alphaChannel);
451 dstVector = _mm_add_ps(a: dstVector, b: srcVector);
452 _mm_store_ps(p: (float *)(dst + x), a: dstVector);
453 }
454}
455#endif
456
457void QT_FASTCALL comp_func_Source_avx2(uint *dst, const uint *src, int length, uint const_alpha)
458{
459 if (const_alpha == 255) {
460 ::memcpy(dest: dst, src: src, n: length * sizeof(uint));
461 } else {
462 const int ialpha = 255 - const_alpha;
463
464 int x = 0;
465
466 // 1) prologue, align on 32 bytes
467 ALIGNMENT_PROLOGUE_32BYTES(dst, x, length)
468 dst[x] = INTERPOLATE_PIXEL_255(x: src[x], a: const_alpha, y: dst[x], b: ialpha);
469
470 // 2) interpolate pixels with AVX2
471 const __m256i half = _mm256_set1_epi16(w: 0x80);
472 const __m256i colorMask = _mm256_set1_epi32(i: 0x00ff00ff);
473 const __m256i constAlphaVector = _mm256_set1_epi16(w: const_alpha);
474 const __m256i oneMinusConstAlpha = _mm256_set1_epi16(w: ialpha);
475 for (; x < length - 7; x += 8) {
476 const __m256i srcVector = _mm256_lddqu_si256(p: (const __m256i *)&src[x]);
477 __m256i dstVector = _mm256_load_si256(p: (__m256i *)&dst[x]);
478 INTERPOLATE_PIXEL_255_AVX2(srcVector, dstVector, alphaChannel: constAlphaVector, oneMinusAlphaChannel: oneMinusConstAlpha, colorMask, half);
479 _mm256_store_si256(p: (__m256i *)&dst[x], a: dstVector);
480 }
481
482 // 3) Epilogue
483 SIMD_EPILOGUE(x, length, 7)
484 dst[x] = INTERPOLATE_PIXEL_255(x: src[x], a: const_alpha, y: dst[x], b: ialpha);
485 }
486}
487
488#if QT_CONFIG(raster_64bit)
489void QT_FASTCALL comp_func_Source_rgb64_avx2(QRgba64 *dst, const QRgba64 *src, int length, uint const_alpha)
490{
491 Q_ASSERT(const_alpha < 256); // const_alpha is in [0-255]
492 if (const_alpha == 255) {
493 ::memcpy(dest: dst, src: src, n: length * sizeof(QRgba64));
494 } else {
495 const uint ca = const_alpha | (const_alpha << 8); // adjust to [0-65535]
496 const uint cia = 65535 - ca;
497
498 int x = 0;
499
500 // 1) prologue, align on 32 bytes
501 for (; x < length && (quintptr(dst + x) & 31); ++x)
502 dst[x] = interpolate65535(x: src[x], alpha1: ca, y: dst[x], alpha2: cia);
503
504 // 2) interpolate pixels with AVX2
505 const __m256i half = _mm256_set1_epi32(i: 0x8000);
506 const __m256i colorMask = _mm256_set1_epi32(i: 0x0000ffff);
507 const __m256i constAlphaVector = _mm256_set1_epi32(i: ca);
508 const __m256i oneMinusConstAlpha = _mm256_set1_epi32(i: cia);
509 for (; x < length - 3; x += 4) {
510 const __m256i srcVector = _mm256_lddqu_si256(p: (const __m256i *)&src[x]);
511 __m256i dstVector = _mm256_load_si256(p: (__m256i *)&dst[x]);
512 INTERPOLATE_PIXEL_RGB64_AVX2(srcVector, dstVector, alphaChannel: constAlphaVector, oneMinusAlphaChannel: oneMinusConstAlpha, colorMask, half);
513 _mm256_store_si256(p: (__m256i *)&dst[x], a: dstVector);
514 }
515
516 // 3) Epilogue
517 SIMD_EPILOGUE(x, length, 3)
518 dst[x] = interpolate65535(x: src[x], alpha1: ca, y: dst[x], alpha2: cia);
519 }
520}
521#endif
522
523#if QT_CONFIG(raster_fp)
524void QT_FASTCALL comp_func_Source_rgbafp_avx2(QRgbaFloat32 *dst, const QRgbaFloat32 *src, int length, uint const_alpha)
525{
526 Q_ASSERT(const_alpha < 256); // const_alpha is in [0-255]
527 if (const_alpha == 255) {
528 ::memcpy(dest: dst, src: src, n: length * sizeof(QRgbaFloat32));
529 } else {
530 const float ca = const_alpha / 255.f;
531 const float cia = 1.0f - ca;
532
533 const __m128 constAlphaVector = _mm_set1_ps(w: ca);
534 const __m128 oneMinusConstAlpha = _mm_set1_ps(w: cia);
535 const __m256 constAlphaVector256 = _mm256_set1_ps(w: ca);
536 const __m256 oneMinusConstAlpha256 = _mm256_set1_ps(w: cia);
537 int x = 0;
538 for (; x < length - 1; x += 2) {
539 __m256 srcVector = _mm256_loadu_ps(p: (const float *)&src[x]);
540 __m256 dstVector = _mm256_loadu_ps(p: (const float *)&dst[x]);
541 srcVector = _mm256_mul_ps(a: srcVector, b: constAlphaVector256);
542 dstVector = _mm256_mul_ps(a: dstVector, b: oneMinusConstAlpha256);
543 dstVector = _mm256_add_ps(a: dstVector, b: srcVector);
544 _mm256_storeu_ps(p: (float *)&dst[x], a: dstVector);
545 }
546 if (x < length) {
547 __m128 srcVector = _mm_load_ps(p: (const float *)&src[x]);
548 __m128 dstVector = _mm_load_ps(p: (const float *)&dst[x]);
549 srcVector = _mm_mul_ps(a: srcVector, b: constAlphaVector);
550 dstVector = _mm_mul_ps(a: dstVector, b: oneMinusConstAlpha);
551 dstVector = _mm_add_ps(a: dstVector, b: srcVector);
552 _mm_store_ps(p: (float *)&dst[x], a: dstVector);
553 }
554 }
555}
556#endif
557
558void QT_FASTCALL comp_func_solid_SourceOver_avx2(uint *destPixels, int length, uint color, uint const_alpha)
559{
560 if ((const_alpha & qAlpha(rgb: color)) == 255) {
561 qt_memfill32(destPixels, color, length);
562 } else {
563 if (const_alpha != 255)
564 color = BYTE_MUL(x: color, a: const_alpha);
565
566 const quint32 minusAlphaOfColor = qAlpha(rgb: ~color);
567 int x = 0;
568
569 quint32 *dst = (quint32 *) destPixels;
570 const __m256i colorVector = _mm256_set1_epi32(i: color);
571 const __m256i colorMask = _mm256_set1_epi32(i: 0x00ff00ff);
572 const __m256i half = _mm256_set1_epi16(w: 0x80);
573 const __m256i minusAlphaOfColorVector = _mm256_set1_epi16(w: minusAlphaOfColor);
574
575 ALIGNMENT_PROLOGUE_32BYTES(dst, x, length)
576 destPixels[x] = color + BYTE_MUL(x: destPixels[x], a: minusAlphaOfColor);
577
578 for (; x < length - 7; x += 8) {
579 __m256i dstVector = _mm256_load_si256(p: (__m256i *)&dst[x]);
580 BYTE_MUL_AVX2(pixelVector&: dstVector, alphaChannel: minusAlphaOfColorVector, colorMask, half);
581 dstVector = _mm256_add_epi8(a: colorVector, b: dstVector);
582 _mm256_store_si256(p: (__m256i *)&dst[x], a: dstVector);
583 }
584 SIMD_EPILOGUE(x, length, 7)
585 destPixels[x] = color + BYTE_MUL(x: destPixels[x], a: minusAlphaOfColor);
586 }
587}
588
589#if QT_CONFIG(raster_64bit)
590void QT_FASTCALL comp_func_solid_SourceOver_rgb64_avx2(QRgba64 *destPixels, int length, QRgba64 color, uint const_alpha)
591{
592 Q_ASSERT(const_alpha < 256); // const_alpha is in [0-255]
593 if (const_alpha == 255 && color.isOpaque()) {
594 qt_memfill64((quint64*)destPixels, color, length);
595 } else {
596 if (const_alpha != 255)
597 color = multiplyAlpha255(rgba64: color, alpha255: const_alpha);
598
599 const uint minusAlphaOfColor = 65535 - color.alpha();
600 int x = 0;
601 quint64 *dst = (quint64 *) destPixels;
602 const __m256i colorVector = _mm256_set1_epi64x(q: color);
603 const __m256i colorMask = _mm256_set1_epi32(i: 0x0000ffff);
604 const __m256i half = _mm256_set1_epi32(i: 0x8000);
605 const __m256i minusAlphaOfColorVector = _mm256_set1_epi32(i: minusAlphaOfColor);
606
607 for (; x < length && (quintptr(dst + x) & 31); ++x)
608 destPixels[x] = color + multiplyAlpha65535(rgba64: destPixels[x], alpha65535: minusAlphaOfColor);
609
610 for (; x < length - 3; x += 4) {
611 __m256i dstVector = _mm256_load_si256(p: (__m256i *)&dst[x]);
612 BYTE_MUL_RGB64_AVX2(pixelVector&: dstVector, alphaChannel: minusAlphaOfColorVector, colorMask, half);
613 dstVector = _mm256_add_epi16(a: colorVector, b: dstVector);
614 _mm256_store_si256(p: (__m256i *)&dst[x], a: dstVector);
615 }
616 SIMD_EPILOGUE(x, length, 3)
617 destPixels[x] = color + multiplyAlpha65535(rgba64: destPixels[x], alpha65535: minusAlphaOfColor);
618 }
619}
620#endif
621
622#if QT_CONFIG(raster_fp)
623void QT_FASTCALL comp_func_solid_Source_rgbafp_avx2(QRgbaFloat32 *dst, int length, QRgbaFloat32 color, uint const_alpha)
624{
625 Q_ASSERT(const_alpha < 256); // const_alpha is in [0-255]
626 if (const_alpha == 255) {
627 for (int i = 0; i < length; ++i)
628 dst[i] = color;
629 } else {
630 const float a = const_alpha / 255.0f;
631 const __m128 alphaVector = _mm_set1_ps(w: a);
632 const __m128 minusAlphaVector = _mm_set1_ps(w: 1.0f - a);
633 __m128 colorVector = _mm_load_ps(p: (const float *)&color);
634 colorVector = _mm_mul_ps(a: colorVector, b: alphaVector);
635 const __m256 colorVector256 = _mm256_insertf128_ps(_mm256_castps128_ps256(colorVector), colorVector, 1);
636 const __m256 minusAlphaVector256 = _mm256_set1_ps(w: 1.0f - a);
637 int x = 0;
638 for (; x < length - 1; x += 2) {
639 __m256 dstVector = _mm256_loadu_ps(p: (const float *)&dst[x]);
640 dstVector = _mm256_mul_ps(a: dstVector, b: minusAlphaVector256);
641 dstVector = _mm256_add_ps(a: dstVector, b: colorVector256);
642 _mm256_storeu_ps(p: (float *)&dst[x], a: dstVector);
643 }
644 if (x < length) {
645 __m128 dstVector = _mm_load_ps(p: (const float *)&dst[x]);
646 dstVector = _mm_mul_ps(a: dstVector, b: minusAlphaVector);
647 dstVector = _mm_add_ps(a: dstVector, b: colorVector);
648 _mm_store_ps(p: (float *)&dst[x], a: dstVector);
649 }
650 }
651}
652
653void QT_FASTCALL comp_func_solid_SourceOver_rgbafp_avx2(QRgbaFloat32 *dst, int length, QRgbaFloat32 color, uint const_alpha)
654{
655 Q_ASSERT(const_alpha < 256); // const_alpha is in [0-255]
656 if (const_alpha == 255 && color.a >= 1.0f) {
657 for (int i = 0; i < length; ++i)
658 dst[i] = color;
659 } else {
660 __m128 colorVector = _mm_load_ps(p: (const float *)&color);
661 if (const_alpha != 255)
662 colorVector = _mm_mul_ps(a: colorVector, b: _mm_set1_ps(w: const_alpha / 255.f));
663 __m128 minusAlphaOfColorVector =
664 _mm_sub_ps(a: _mm_set1_ps(w: 1.0f), _mm_permute_ps(colorVector, _MM_SHUFFLE(3, 3, 3, 3)));
665 const __m256 colorVector256 = _mm256_insertf128_ps(_mm256_castps128_ps256(colorVector), colorVector, 1);
666 const __m256 minusAlphaVector256 = _mm256_insertf128_ps(_mm256_castps128_ps256(minusAlphaOfColorVector),
667 minusAlphaOfColorVector, 1);
668 int x = 0;
669 for (; x < length - 1; x += 2) {
670 __m256 dstVector = _mm256_loadu_ps(p: (const float *)&dst[x]);
671 dstVector = _mm256_mul_ps(a: dstVector, b: minusAlphaVector256);
672 dstVector = _mm256_add_ps(a: dstVector, b: colorVector256);
673 _mm256_storeu_ps(p: (float *)&dst[x], a: dstVector);
674 }
675 if (x < length) {
676 __m128 dstVector = _mm_load_ps(p: (const float *)&dst[x]);
677 dstVector = _mm_mul_ps(a: dstVector, b: minusAlphaOfColorVector);
678 dstVector = _mm_add_ps(a: dstVector, b: colorVector);
679 _mm_store_ps(p: (float *)&dst[x], a: dstVector);
680 }
681 }
682}
683#endif
684
685#define interpolate_4_pixels_16_avx2(tlr1, tlr2, blr1, blr2, distx, disty, colorMask, v_256, b) \
686{ \
687 /* Correct for later unpack */ \
688 const __m256i vdistx = _mm256_permute4x64_epi64(distx, _MM_SHUFFLE(3, 1, 2, 0)); \
689 const __m256i vdisty = _mm256_permute4x64_epi64(disty, _MM_SHUFFLE(3, 1, 2, 0)); \
690 \
691 __m256i dxdy = _mm256_mullo_epi16 (vdistx, vdisty); \
692 const __m256i distx_ = _mm256_slli_epi16(vdistx, 4); \
693 const __m256i disty_ = _mm256_slli_epi16(vdisty, 4); \
694 __m256i idxidy = _mm256_add_epi16(dxdy, _mm256_sub_epi16(v_256, _mm256_add_epi16(distx_, disty_))); \
695 __m256i dxidy = _mm256_sub_epi16(distx_, dxdy); \
696 __m256i idxdy = _mm256_sub_epi16(disty_, dxdy); \
697 \
698 __m256i tlr1AG = _mm256_srli_epi16(tlr1, 8); \
699 __m256i tlr1RB = _mm256_and_si256(tlr1, colorMask); \
700 __m256i tlr2AG = _mm256_srli_epi16(tlr2, 8); \
701 __m256i tlr2RB = _mm256_and_si256(tlr2, colorMask); \
702 __m256i blr1AG = _mm256_srli_epi16(blr1, 8); \
703 __m256i blr1RB = _mm256_and_si256(blr1, colorMask); \
704 __m256i blr2AG = _mm256_srli_epi16(blr2, 8); \
705 __m256i blr2RB = _mm256_and_si256(blr2, colorMask); \
706 \
707 __m256i odxidy1 = _mm256_unpacklo_epi32(idxidy, dxidy); \
708 __m256i odxidy2 = _mm256_unpackhi_epi32(idxidy, dxidy); \
709 tlr1AG = _mm256_mullo_epi16(tlr1AG, odxidy1); \
710 tlr1RB = _mm256_mullo_epi16(tlr1RB, odxidy1); \
711 tlr2AG = _mm256_mullo_epi16(tlr2AG, odxidy2); \
712 tlr2RB = _mm256_mullo_epi16(tlr2RB, odxidy2); \
713 __m256i odxdy1 = _mm256_unpacklo_epi32(idxdy, dxdy); \
714 __m256i odxdy2 = _mm256_unpackhi_epi32(idxdy, dxdy); \
715 blr1AG = _mm256_mullo_epi16(blr1AG, odxdy1); \
716 blr1RB = _mm256_mullo_epi16(blr1RB, odxdy1); \
717 blr2AG = _mm256_mullo_epi16(blr2AG, odxdy2); \
718 blr2RB = _mm256_mullo_epi16(blr2RB, odxdy2); \
719 \
720 /* Add the values, and shift to only keep 8 significant bits per colors */ \
721 __m256i topAG = _mm256_hadd_epi32(tlr1AG, tlr2AG); \
722 __m256i topRB = _mm256_hadd_epi32(tlr1RB, tlr2RB); \
723 __m256i botAG = _mm256_hadd_epi32(blr1AG, blr2AG); \
724 __m256i botRB = _mm256_hadd_epi32(blr1RB, blr2RB); \
725 __m256i rAG = _mm256_add_epi16(topAG, botAG); \
726 __m256i rRB = _mm256_add_epi16(topRB, botRB); \
727 rRB = _mm256_srli_epi16(rRB, 8); \
728 /* Correct for hadd */ \
729 rAG = _mm256_permute4x64_epi64(rAG, _MM_SHUFFLE(3, 1, 2, 0)); \
730 rRB = _mm256_permute4x64_epi64(rRB, _MM_SHUFFLE(3, 1, 2, 0)); \
731 _mm256_storeu_si256((__m256i*)(b), _mm256_blendv_epi8(rAG, rRB, colorMask)); \
732}
733
734inline void fetchTransformedBilinear_pixelBounds(int, int l1, int l2, int &v1, int &v2)
735{
736 if (v1 < l1)
737 v2 = v1 = l1;
738 else if (v1 >= l2)
739 v2 = v1 = l2;
740 else
741 v2 = v1 + 1;
742 Q_ASSERT(v1 >= l1 && v1 <= l2);
743 Q_ASSERT(v2 >= l1 && v2 <= l2);
744}
745
746void QT_FASTCALL intermediate_adder_avx2(uint *b, uint *end, const IntermediateBuffer &intermediate, int offset, int &fx, int fdx);
747
748void QT_FASTCALL fetchTransformedBilinearARGB32PM_simple_scale_helper_avx2(uint *b, uint *end, const QTextureData &image,
749 int &fx, int &fy, int fdx, int /*fdy*/)
750{
751 int y1 = (fy >> 16);
752 int y2;
753 fetchTransformedBilinear_pixelBounds(image.height, l1: image.y1, l2: image.y2 - 1, v1&: y1, v2&: y2);
754 const uint *s1 = (const uint *)image.scanLine(y: y1);
755 const uint *s2 = (const uint *)image.scanLine(y: y2);
756
757 const int disty = (fy & 0x0000ffff) >> 8;
758 const int idisty = 256 - disty;
759 const int length = end - b;
760
761 // The intermediate buffer is generated in the positive direction
762 const int adjust = (fdx < 0) ? fdx * length : 0;
763 const int offset = (fx + adjust) >> 16;
764 int x = offset;
765
766 IntermediateBuffer intermediate;
767 // count is the size used in the intermediate_buffer.
768 int count = (qint64(length) * qAbs(t: fdx) + FixedScale - 1) / FixedScale + 2;
769 // length is supposed to be <= BufferSize either because data->m11 < 1 or
770 // data->m11 < 2, and any larger buffers split
771 Q_ASSERT(count <= BufferSize + 2);
772 int f = 0;
773 int lim = qMin(a: count, b: image.x2 - x);
774 if (x < image.x1) {
775 Q_ASSERT(x < image.x2);
776 uint t = s1[image.x1];
777 uint b = s2[image.x1];
778 quint32 rb = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
779 quint32 ag = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
780 do {
781 intermediate.buffer_rb[f] = rb;
782 intermediate.buffer_ag[f] = ag;
783 f++;
784 x++;
785 } while (x < image.x1 && f < lim);
786 }
787
788 const __m256i disty_ = _mm256_set1_epi16(w: disty);
789 const __m256i idisty_ = _mm256_set1_epi16(w: idisty);
790 const __m256i colorMask = _mm256_set1_epi32(i: 0x00ff00ff);
791
792 lim -= 7;
793 for (; f < lim; x += 8, f += 8) {
794 // Load 8 pixels from s1, and split the alpha-green and red-blue component
795 __m256i top = _mm256_loadu_si256(p: (const __m256i*)((const uint *)(s1)+x));
796 __m256i topAG = _mm256_srli_epi16(a: top, count: 8);
797 __m256i topRB = _mm256_and_si256(a: top, b: colorMask);
798 // Multiplies each color component by idisty
799 topAG = _mm256_mullo_epi16 (a: topAG, b: idisty_);
800 topRB = _mm256_mullo_epi16 (a: topRB, b: idisty_);
801
802 // Same for the s2 vector
803 __m256i bottom = _mm256_loadu_si256(p: (const __m256i*)((const uint *)(s2)+x));
804 __m256i bottomAG = _mm256_srli_epi16(a: bottom, count: 8);
805 __m256i bottomRB = _mm256_and_si256(a: bottom, b: colorMask);
806 bottomAG = _mm256_mullo_epi16 (a: bottomAG, b: disty_);
807 bottomRB = _mm256_mullo_epi16 (a: bottomRB, b: disty_);
808
809 // Add the values, and shift to only keep 8 significant bits per colors
810 __m256i rAG =_mm256_add_epi16(a: topAG, b: bottomAG);
811 rAG = _mm256_srli_epi16(a: rAG, count: 8);
812 _mm256_storeu_si256(p: (__m256i*)(&intermediate.buffer_ag[f]), a: rAG);
813 __m256i rRB =_mm256_add_epi16(a: topRB, b: bottomRB);
814 rRB = _mm256_srli_epi16(a: rRB, count: 8);
815 _mm256_storeu_si256(p: (__m256i*)(&intermediate.buffer_rb[f]), a: rRB);
816 }
817
818 for (; f < count; f++) { // Same as above but without simd
819 x = qMin(a: x, b: image.x2 - 1);
820
821 uint t = s1[x];
822 uint b = s2[x];
823
824 intermediate.buffer_rb[f] = (((t & 0xff00ff) * idisty + (b & 0xff00ff) * disty) >> 8) & 0xff00ff;
825 intermediate.buffer_ag[f] = ((((t>>8) & 0xff00ff) * idisty + ((b>>8) & 0xff00ff) * disty) >> 8) & 0xff00ff;
826 x++;
827 }
828
829 // Now interpolate the values from the intermediate_buffer to get the final result.
830 intermediate_adder_avx2(b, end, intermediate, offset, fx, fdx);
831}
832
833void QT_FASTCALL intermediate_adder_avx2(uint *b, uint *end, const IntermediateBuffer &intermediate, int offset, int &fx, int fdx)
834{
835 fx -= offset * FixedScale;
836
837 const __m128i v_fdx = _mm_set1_epi32(i: fdx * 4);
838 const __m128i v_blend = _mm_set1_epi32(i: 0x00800080);
839 const __m128i vdx_shuffle = _mm_set_epi8(b15: char(0x80), b14: 13, b13: char(0x80), b12: 13, b11: char(0x80), b10: 9, b9: char(0x80), b8: 9,
840 b7: char(0x80), b6: 5, b5: char(0x80), b4: 5, b3: char(0x80), b2: 1, b1: char(0x80), b0: 1);
841 __m128i v_fx = _mm_setr_epi32(i0: fx, i1: fx + fdx, i2: fx + fdx + fdx, i3: fx + fdx + fdx + fdx);
842
843 while (b < end - 3) {
844 const __m128i offset = _mm_srli_epi32(a: v_fx, count: 16);
845 __m256i vrb = _mm256_i32gather_epi64((const long long *)intermediate.buffer_rb, offset, 4);
846 __m256i vag = _mm256_i32gather_epi64((const long long *)intermediate.buffer_ag, offset, 4);
847
848 __m128i vdx = _mm_shuffle_epi8(a: v_fx, b: vdx_shuffle);
849 __m128i vidx = _mm_sub_epi16(a: _mm_set1_epi16(w: 256), b: vdx);
850 __m256i vmulx = _mm256_castsi128_si256(a: _mm_unpacklo_epi32(a: vidx, b: vdx));
851 vmulx = _mm256_inserti128_si256(vmulx, _mm_unpackhi_epi32(vidx, vdx), 1);
852
853 vrb = _mm256_mullo_epi16(a: vrb, b: vmulx);
854 vag = _mm256_mullo_epi16(a: vag, b: vmulx);
855
856 __m256i vrbag = _mm256_hadd_epi32(a: vrb, b: vag);
857 vrbag = _mm256_permute4x64_epi64(vrbag, _MM_SHUFFLE(3, 1, 2, 0));
858
859 __m128i rb = _mm256_castsi256_si128(a: vrbag);
860 __m128i ag = _mm256_extracti128_si256(vrbag, 1);
861 rb = _mm_srli_epi16(a: rb, count: 8);
862
863 _mm_storeu_si128(p: (__m128i*)b, b: _mm_blendv_epi8(V1: ag, V2: rb, M: v_blend));
864
865 b += 4;
866 v_fx = _mm_add_epi32(a: v_fx, b: v_fdx);
867 }
868 fx = _mm_cvtsi128_si32(a: v_fx);
869 while (b < end) {
870 const int x = (fx >> 16);
871
872 const uint distx = (fx & 0x0000ffff) >> 8;
873 const uint idistx = 256 - distx;
874 const uint rb = (intermediate.buffer_rb[x] * idistx + intermediate.buffer_rb[x + 1] * distx) & 0xff00ff00;
875 const uint ag = (intermediate.buffer_ag[x] * idistx + intermediate.buffer_ag[x + 1] * distx) & 0xff00ff00;
876 *b = (rb >> 8) | ag;
877 b++;
878 fx += fdx;
879 }
880 fx += offset * FixedScale;
881}
882
883void QT_FASTCALL fetchTransformedBilinearARGB32PM_downscale_helper_avx2(uint *b, uint *end, const QTextureData &image,
884 int &fx, int &fy, int fdx, int /*fdy*/)
885{
886 int y1 = (fy >> 16);
887 int y2;
888 fetchTransformedBilinear_pixelBounds(image.height, l1: image.y1, l2: image.y2 - 1, v1&: y1, v2&: y2);
889 const uint *s1 = (const uint *)image.scanLine(y: y1);
890 const uint *s2 = (const uint *)image.scanLine(y: y2);
891 const int disty8 = (fy & 0x0000ffff) >> 8;
892 const int disty4 = (disty8 + 0x08) >> 4;
893
894 const qint64 min_fx = qint64(image.x1) * FixedScale;
895 const qint64 max_fx = qint64(image.x2 - 1) * FixedScale;
896 while (b < end) {
897 int x1 = (fx >> 16);
898 int x2;
899 fetchTransformedBilinear_pixelBounds(image.width, l1: image.x1, l2: image.x2 - 1, v1&: x1, v2&: x2);
900 if (x1 != x2)
901 break;
902 uint top = s1[x1];
903 uint bot = s2[x1];
904 *b = INTERPOLATE_PIXEL_256(x: top, a: 256 - disty8, y: bot, b: disty8);
905 fx += fdx;
906 ++b;
907 }
908 uint *boundedEnd = end;
909 if (fdx > 0)
910 boundedEnd = qMin(a: boundedEnd, b: b + (max_fx - fx) / fdx);
911 else if (fdx < 0)
912 boundedEnd = qMin(a: boundedEnd, b: b + (min_fx - fx) / fdx);
913
914 // A fast middle part without boundary checks
915 const __m256i vdistShuffle =
916 _mm256_setr_epi8(b31: 0, b30: char(0x80), b29: 0, b28: char(0x80), b27: 4, b26: char(0x80), b25: 4, b24: char(0x80), b23: 8, b22: char(0x80), b21: 8, b20: char(0x80), b19: 12, b18: char(0x80), b17: 12, b16: char(0x80),
917 b15: 0, b14: char(0x80), b13: 0, b12: char(0x80), b11: 4, b10: char(0x80), b09: 4, b08: char(0x80), b07: 8, b06: char(0x80), b05: 8, b04: char(0x80), b03: 12, b02: char(0x80), b01: 12, b00: char(0x80));
918 const __m256i colorMask = _mm256_set1_epi32(i: 0x00ff00ff);
919 const __m256i v_256 = _mm256_set1_epi16(w: 256);
920 const __m256i v_disty = _mm256_set1_epi16(w: disty4);
921 const __m256i v_fdx = _mm256_set1_epi32(i: fdx * 8);
922 const __m256i v_fx_r = _mm256_set1_epi32(i: 0x08);
923 const __m256i v_index = _mm256_setr_epi32(i0: 0, i1: 1, i2: 2, i3: 3, i4: 4, i5: 5, i6: 6, i7: 7);
924 __m256i v_fx = _mm256_set1_epi32(i: fx);
925 v_fx = _mm256_add_epi32(a: v_fx, b: _mm256_mullo_epi32(a: _mm256_set1_epi32(i: fdx), b: v_index));
926
927 while (b < boundedEnd - 7) {
928 const __m256i offset = _mm256_srli_epi32(a: v_fx, count: 16);
929 const __m128i offsetLo = _mm256_castsi256_si128(a: offset);
930 const __m128i offsetHi = _mm256_extracti128_si256(offset, 1);
931 const __m256i toplo = _mm256_i32gather_epi64((const long long *)s1, offsetLo, 4);
932 const __m256i tophi = _mm256_i32gather_epi64((const long long *)s1, offsetHi, 4);
933 const __m256i botlo = _mm256_i32gather_epi64((const long long *)s2, offsetLo, 4);
934 const __m256i bothi = _mm256_i32gather_epi64((const long long *)s2, offsetHi, 4);
935
936 __m256i v_distx = _mm256_srli_epi16(a: v_fx, count: 8);
937 v_distx = _mm256_srli_epi16(a: _mm256_add_epi32(a: v_distx, b: v_fx_r), count: 4);
938 v_distx = _mm256_shuffle_epi8(a: v_distx, b: vdistShuffle);
939
940 interpolate_4_pixels_16_avx2(toplo, tophi, botlo, bothi, v_distx, v_disty, colorMask, v_256, b);
941 b += 8;
942 v_fx = _mm256_add_epi32(a: v_fx, b: v_fdx);
943 }
944 fx = _mm_extract_epi32(_mm256_castsi256_si128(v_fx) , 0);
945
946 while (b < boundedEnd) {
947 int x = (fx >> 16);
948 int distx8 = (fx & 0x0000ffff) >> 8;
949 *b = interpolate_4_pixels(t: s1 + x, b: s2 + x, distx: distx8, disty: disty8);
950 fx += fdx;
951 ++b;
952 }
953
954 while (b < end) {
955 int x1 = (fx >> 16);
956 int x2;
957 fetchTransformedBilinear_pixelBounds(image.width, l1: image.x1, l2: image.x2 - 1, v1&: x1, v2&: x2);
958 uint tl = s1[x1];
959 uint tr = s1[x2];
960 uint bl = s2[x1];
961 uint br = s2[x2];
962 int distx8 = (fx & 0x0000ffff) >> 8;
963 *b = interpolate_4_pixels(tl, tr, bl, br, distx: distx8, disty: disty8);
964 fx += fdx;
965 ++b;
966 }
967}
968
969void QT_FASTCALL fetchTransformedBilinearARGB32PM_fast_rotate_helper_avx2(uint *b, uint *end, const QTextureData &image,
970 int &fx, int &fy, int fdx, int fdy)
971{
972 const qint64 min_fx = qint64(image.x1) * FixedScale;
973 const qint64 max_fx = qint64(image.x2 - 1) * FixedScale;
974 const qint64 min_fy = qint64(image.y1) * FixedScale;
975 const qint64 max_fy = qint64(image.y2 - 1) * FixedScale;
976 // first handle the possibly bounded part in the beginning
977 while (b < end) {
978 int x1 = (fx >> 16);
979 int x2;
980 int y1 = (fy >> 16);
981 int y2;
982 fetchTransformedBilinear_pixelBounds(image.width, l1: image.x1, l2: image.x2 - 1, v1&: x1, v2&: x2);
983 fetchTransformedBilinear_pixelBounds(image.height, l1: image.y1, l2: image.y2 - 1, v1&: y1, v2&: y2);
984 if (x1 != x2 && y1 != y2)
985 break;
986 const uint *s1 = (const uint *)image.scanLine(y: y1);
987 const uint *s2 = (const uint *)image.scanLine(y: y2);
988 uint tl = s1[x1];
989 uint tr = s1[x2];
990 uint bl = s2[x1];
991 uint br = s2[x2];
992 int distx = (fx & 0x0000ffff) >> 8;
993 int disty = (fy & 0x0000ffff) >> 8;
994 *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
995 fx += fdx;
996 fy += fdy;
997 ++b;
998 }
999 uint *boundedEnd = end;
1000 if (fdx > 0)
1001 boundedEnd = qMin(a: boundedEnd, b: b + (max_fx - fx) / fdx);
1002 else if (fdx < 0)
1003 boundedEnd = qMin(a: boundedEnd, b: b + (min_fx - fx) / fdx);
1004 if (fdy > 0)
1005 boundedEnd = qMin(a: boundedEnd, b: b + (max_fy - fy) / fdy);
1006 else if (fdy < 0)
1007 boundedEnd = qMin(a: boundedEnd, b: b + (min_fy - fy) / fdy);
1008
1009 // until boundedEnd we can now have a fast middle part without boundary checks
1010 const __m256i vdistShuffle =
1011 _mm256_setr_epi8(b31: 0, b30: char(0x80), b29: 0, b28: char(0x80), b27: 4, b26: char(0x80), b25: 4, b24: char(0x80), b23: 8, b22: char(0x80), b21: 8, b20: char(0x80), b19: 12, b18: char(0x80), b17: 12, b16: char(0x80),
1012 b15: 0, b14: char(0x80), b13: 0, b12: char(0x80), b11: 4, b10: char(0x80), b09: 4, b08: char(0x80), b07: 8, b06: char(0x80), b05: 8, b04: char(0x80), b03: 12, b02: char(0x80), b01: 12, b00: char(0x80));
1013 const __m256i colorMask = _mm256_set1_epi32(i: 0x00ff00ff);
1014 const __m256i v_256 = _mm256_set1_epi16(w: 256);
1015 const __m256i v_fdx = _mm256_set1_epi32(i: fdx * 8);
1016 const __m256i v_fdy = _mm256_set1_epi32(i: fdy * 8);
1017 const __m256i v_fxy_r = _mm256_set1_epi32(i: 0x08);
1018 const __m256i v_index = _mm256_setr_epi32(i0: 0, i1: 1, i2: 2, i3: 3, i4: 4, i5: 5, i6: 6, i7: 7);
1019 __m256i v_fx = _mm256_set1_epi32(i: fx);
1020 __m256i v_fy = _mm256_set1_epi32(i: fy);
1021 v_fx = _mm256_add_epi32(a: v_fx, b: _mm256_mullo_epi32(a: _mm256_set1_epi32(i: fdx), b: v_index));
1022 v_fy = _mm256_add_epi32(a: v_fy, b: _mm256_mullo_epi32(a: _mm256_set1_epi32(i: fdy), b: v_index));
1023
1024 const uchar *textureData = image.imageData;
1025 const qsizetype bytesPerLine = image.bytesPerLine;
1026 const __m256i vbpl = _mm256_set1_epi16(w: bytesPerLine/4);
1027
1028 while (b < boundedEnd - 7) {
1029 const __m256i vy = _mm256_packs_epi32(a: _mm256_srli_epi32(a: v_fy, count: 16), b: _mm256_setzero_si256());
1030 // 8x16bit * 8x16bit -> 8x32bit
1031 __m256i offset = _mm256_unpacklo_epi16(a: _mm256_mullo_epi16(a: vy, b: vbpl), b: _mm256_mulhi_epi16(a: vy, b: vbpl));
1032 offset = _mm256_add_epi32(a: offset, b: _mm256_srli_epi32(a: v_fx, count: 16));
1033 const __m128i offsetLo = _mm256_castsi256_si128(a: offset);
1034 const __m128i offsetHi = _mm256_extracti128_si256(offset, 1);
1035 const uint *topData = (const uint *)(textureData);
1036 const uint *botData = (const uint *)(textureData + bytesPerLine);
1037 const __m256i toplo = _mm256_i32gather_epi64((const long long *)topData, offsetLo, 4);
1038 const __m256i tophi = _mm256_i32gather_epi64((const long long *)topData, offsetHi, 4);
1039 const __m256i botlo = _mm256_i32gather_epi64((const long long *)botData, offsetLo, 4);
1040 const __m256i bothi = _mm256_i32gather_epi64((const long long *)botData, offsetHi, 4);
1041
1042 __m256i v_distx = _mm256_srli_epi16(a: v_fx, count: 8);
1043 __m256i v_disty = _mm256_srli_epi16(a: v_fy, count: 8);
1044 v_distx = _mm256_srli_epi16(a: _mm256_add_epi32(a: v_distx, b: v_fxy_r), count: 4);
1045 v_disty = _mm256_srli_epi16(a: _mm256_add_epi32(a: v_disty, b: v_fxy_r), count: 4);
1046 v_distx = _mm256_shuffle_epi8(a: v_distx, b: vdistShuffle);
1047 v_disty = _mm256_shuffle_epi8(a: v_disty, b: vdistShuffle);
1048
1049 interpolate_4_pixels_16_avx2(toplo, tophi, botlo, bothi, v_distx, v_disty, colorMask, v_256, b);
1050 b += 8;
1051 v_fx = _mm256_add_epi32(a: v_fx, b: v_fdx);
1052 v_fy = _mm256_add_epi32(a: v_fy, b: v_fdy);
1053 }
1054 fx = _mm_extract_epi32(_mm256_castsi256_si128(v_fx) , 0);
1055 fy = _mm_extract_epi32(_mm256_castsi256_si128(v_fy) , 0);
1056
1057 while (b < boundedEnd) {
1058 int x = (fx >> 16);
1059 int y = (fy >> 16);
1060
1061 const uint *s1 = (const uint *)image.scanLine(y);
1062 const uint *s2 = (const uint *)image.scanLine(y: y + 1);
1063
1064 int distx = (fx & 0x0000ffff) >> 8;
1065 int disty = (fy & 0x0000ffff) >> 8;
1066 *b = interpolate_4_pixels(t: s1 + x, b: s2 + x, distx, disty);
1067
1068 fx += fdx;
1069 fy += fdy;
1070 ++b;
1071 }
1072
1073 while (b < end) {
1074 int x1 = (fx >> 16);
1075 int x2;
1076 int y1 = (fy >> 16);
1077 int y2;
1078
1079 fetchTransformedBilinear_pixelBounds(image.width, l1: image.x1, l2: image.x2 - 1, v1&: x1, v2&: x2);
1080 fetchTransformedBilinear_pixelBounds(image.height, l1: image.y1, l2: image.y2 - 1, v1&: y1, v2&: y2);
1081
1082 const uint *s1 = (const uint *)image.scanLine(y: y1);
1083 const uint *s2 = (const uint *)image.scanLine(y: y2);
1084
1085 uint tl = s1[x1];
1086 uint tr = s1[x2];
1087 uint bl = s2[x1];
1088 uint br = s2[x2];
1089
1090 int distx = (fx & 0x0000ffff) >> 8;
1091 int disty = (fy & 0x0000ffff) >> 8;
1092 *b = interpolate_4_pixels(tl, tr, bl, br, distx, disty);
1093
1094 fx += fdx;
1095 fy += fdy;
1096 ++b;
1097 }
1098}
1099
1100static inline __m256i epilogueMaskFromCount(qsizetype count)
1101{
1102 Q_ASSERT(count > 0);
1103 static const __m256i offsetMask = _mm256_setr_epi32(i0: 0, i1: 1, i2: 2, i3: 3, i4: 4, i5: 5, i6: 6, i7: 7);
1104 return _mm256_add_epi32(a: offsetMask, b: _mm256_set1_epi32(i: -count));
1105}
1106
1107template<bool RGBA>
1108static void convertARGBToARGB32PM_avx2(uint *buffer, const uint *src, qsizetype count)
1109{
1110 qsizetype i = 0;
1111 const __m256i alphaMask = _mm256_set1_epi32(i: 0xff000000);
1112 const __m256i rgbaMask = _mm256_broadcastsi128_si256(X: _mm_setr_epi8(b0: 2, b1: 1, b2: 0, b3: 3, b4: 6, b5: 5, b6: 4, b7: 7, b8: 10, b9: 9, b10: 8, b11: 11, b12: 14, b13: 13, b14: 12, b15: 15));
1113 const __m256i shuffleMask = _mm256_broadcastsi128_si256(X: _mm_setr_epi8(b0: 6, b1: 7, b2: 6, b3: 7, b4: 6, b5: 7, b6: 6, b7: 7, b8: 14, b9: 15, b10: 14, b11: 15, b12: 14, b13: 15, b14: 14, b15: 15));
1114 const __m256i half = _mm256_set1_epi16(w: 0x0080);
1115 const __m256i zero = _mm256_setzero_si256();
1116
1117 for (; i < count - 7; i += 8) {
1118 __m256i srcVector = _mm256_loadu_si256(p: reinterpret_cast<const __m256i *>(src + i));
1119 if (!_mm256_testz_si256(a: srcVector, b: alphaMask)) {
1120 // keep the two _mm_test[zc]_siXXX next to each other
1121 bool cf = _mm256_testc_si256(a: srcVector, b: alphaMask);
1122 if (RGBA)
1123 srcVector = _mm256_shuffle_epi8(a: srcVector, b: rgbaMask);
1124 if (!cf) {
1125 __m256i src1 = _mm256_unpacklo_epi8(a: srcVector, b: zero);
1126 __m256i src2 = _mm256_unpackhi_epi8(a: srcVector, b: zero);
1127 __m256i alpha1 = _mm256_shuffle_epi8(a: src1, b: shuffleMask);
1128 __m256i alpha2 = _mm256_shuffle_epi8(a: src2, b: shuffleMask);
1129 src1 = _mm256_mullo_epi16(a: src1, b: alpha1);
1130 src2 = _mm256_mullo_epi16(a: src2, b: alpha2);
1131 src1 = _mm256_add_epi16(a: src1, b: _mm256_srli_epi16(a: src1, count: 8));
1132 src2 = _mm256_add_epi16(a: src2, b: _mm256_srli_epi16(a: src2, count: 8));
1133 src1 = _mm256_add_epi16(a: src1, b: half);
1134 src2 = _mm256_add_epi16(a: src2, b: half);
1135 src1 = _mm256_srli_epi16(a: src1, count: 8);
1136 src2 = _mm256_srli_epi16(a: src2, count: 8);
1137 src1 = _mm256_blend_epi16(src1, alpha1, 0x88);
1138 src2 = _mm256_blend_epi16(src2, alpha2, 0x88);
1139 srcVector = _mm256_packus_epi16(a: src1, b: src2);
1140 _mm256_storeu_si256(p: reinterpret_cast<__m256i *>(buffer + i), a: srcVector);
1141 } else {
1142 if (buffer != src || RGBA)
1143 _mm256_storeu_si256(p: reinterpret_cast<__m256i *>(buffer + i), a: srcVector);
1144 }
1145 } else {
1146 _mm256_storeu_si256(p: reinterpret_cast<__m256i *>(buffer + i), a: zero);
1147 }
1148 }
1149
1150 if (i < count) {
1151 const __m256i epilogueMask = epilogueMaskFromCount(count: count - i);
1152 __m256i srcVector = _mm256_maskload_epi32(X: reinterpret_cast<const int *>(src + i), M: epilogueMask);
1153 const __m256i epilogueAlphaMask = _mm256_blendv_epi8(V1: _mm256_setzero_si256(), V2: alphaMask, M: epilogueMask);
1154
1155 if (!_mm256_testz_si256(a: srcVector, b: epilogueAlphaMask)) {
1156 // keep the two _mm_test[zc]_siXXX next to each other
1157 bool cf = _mm256_testc_si256(a: srcVector, b: epilogueAlphaMask);
1158 if (RGBA)
1159 srcVector = _mm256_shuffle_epi8(a: srcVector, b: rgbaMask);
1160 if (!cf) {
1161 __m256i src1 = _mm256_unpacklo_epi8(a: srcVector, b: zero);
1162 __m256i src2 = _mm256_unpackhi_epi8(a: srcVector, b: zero);
1163 __m256i alpha1 = _mm256_shuffle_epi8(a: src1, b: shuffleMask);
1164 __m256i alpha2 = _mm256_shuffle_epi8(a: src2, b: shuffleMask);
                src1 = _mm256_mullo_epi16(src1, alpha1);
                src2 = _mm256_mullo_epi16(src2, alpha2);
                src1 = _mm256_add_epi16(src1, _mm256_srli_epi16(src1, 8));
                src2 = _mm256_add_epi16(src2, _mm256_srli_epi16(src2, 8));
                src1 = _mm256_add_epi16(src1, half);
                src2 = _mm256_add_epi16(src2, half);
                src1 = _mm256_srli_epi16(src1, 8);
                src2 = _mm256_srli_epi16(src2, 8);
                src1 = _mm256_blend_epi16(src1, alpha1, 0x88);
                src2 = _mm256_blend_epi16(src2, alpha2, 0x88);
                srcVector = _mm256_packus_epi16(src1, src2);
                _mm256_maskstore_epi32(reinterpret_cast<int *>(buffer + i), epilogueMask, srcVector);
            } else {
                if (buffer != src || RGBA)
                    _mm256_maskstore_epi32(reinterpret_cast<int *>(buffer + i), epilogueMask, srcVector);
            }
        } else {
            _mm256_maskstore_epi32(reinterpret_cast<int *>(buffer + i), epilogueMask, zero);
        }
    }
}

void QT_FASTCALL convertARGB32ToARGB32PM_avx2(uint *buffer, int count, const QList<QRgb> *)
{
    convertARGBToARGB32PM_avx2<false>(buffer, buffer, count);
}

void QT_FASTCALL convertRGBA8888ToARGB32PM_avx2(uint *buffer, int count, const QList<QRgb> *)
{
    convertARGBToARGB32PM_avx2<true>(buffer, buffer, count);
}

const uint *QT_FASTCALL fetchARGB32ToARGB32PM_avx2(uint *buffer, const uchar *src, int index, int count,
                                                   const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToARGB32PM_avx2<false>(buffer, reinterpret_cast<const uint *>(src) + index, count);
    return buffer;
}

const uint *QT_FASTCALL fetchRGBA8888ToARGB32PM_avx2(uint *buffer, const uchar *src, int index, int count,
                                                     const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToARGB32PM_avx2<true>(buffer, reinterpret_cast<const uint *>(src) + index, count);
    return buffer;
}

template<bool RGBA>
static void convertARGBToRGBA64PM_avx2(QRgba64 *buffer, const uint *src, qsizetype count)
{
    qsizetype i = 0;
    const __m256i alphaMask = _mm256_set1_epi32(0xff000000);
    const __m256i rgbaMask = _mm256_broadcastsi128_si256(_mm_setr_epi8(2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15));
    const __m256i shuffleMask = _mm256_broadcastsi128_si256(_mm_setr_epi8(6, 7, 6, 7, 6, 7, 6, 7, 14, 15, 14, 15, 14, 15, 14, 15));
    const __m256i zero = _mm256_setzero_si256();

    for (; i < count - 7; i += 8) {
        __m256i dst1, dst2;
        __m256i srcVector = _mm256_loadu_si256(reinterpret_cast<const __m256i *>(src + i));
        if (!_mm256_testz_si256(srcVector, alphaMask)) {
            // keep the two _mm_test[zc]_siXXX next to each other
            bool cf = _mm256_testc_si256(srcVector, alphaMask);
            if (!RGBA)
                srcVector = _mm256_shuffle_epi8(srcVector, rgbaMask);

            // The two unpack instructions unpack the low and upper halves of
            // each 128-bit half of the 256-bit register. Here's the tracking
            // of what's where: (p is 32-bit, P is 64-bit)
            //  as loaded:        [ p1, p2, p3, p4; p5, p6, p7, p8 ]
            //  after permute4x64 [ p1, p2, p5, p6; p3, p4, p7, p8 ]
            //  after unpacklo/hi [ P1, P2; P3, P4 ] [ P5, P6; P7, P8 ]
            srcVector = _mm256_permute4x64_epi64(srcVector, _MM_SHUFFLE(3, 1, 2, 0));

            const __m256i src1 = _mm256_unpacklo_epi8(srcVector, srcVector);
            const __m256i src2 = _mm256_unpackhi_epi8(srcVector, srcVector);
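            // Unpacking the vector with itself duplicates every byte, so an
            // 8-bit channel c becomes the 16-bit value c * 257, which maps
            // 0..255 exactly onto 0..65535.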
            if (!cf) {
                const __m256i alpha1 = _mm256_shuffle_epi8(src1, shuffleMask);
                const __m256i alpha2 = _mm256_shuffle_epi8(src2, shuffleMask);
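                // mulhi yields (src * alpha) >> 16; adding dst >> 15 back in
                // corrects the truncation, approximating src * alpha / 65535
                // so that 0xffff * 0xffff maps back to 0xffff.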
                dst1 = _mm256_mulhi_epu16(src1, alpha1);
                dst2 = _mm256_mulhi_epu16(src2, alpha2);
                dst1 = _mm256_add_epi16(dst1, _mm256_srli_epi16(dst1, 15));
                dst2 = _mm256_add_epi16(dst2, _mm256_srli_epi16(dst2, 15));
                dst1 = _mm256_blend_epi16(dst1, src1, 0x88);
                dst2 = _mm256_blend_epi16(dst2, src2, 0x88);
            } else {
                dst1 = src1;
                dst2 = src2;
            }
        } else {
            dst1 = dst2 = zero;
        }
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i), dst1);
        _mm256_storeu_si256(reinterpret_cast<__m256i *>(buffer + i) + 1, dst2);
    }

    if (i < count) {
        __m256i epilogueMask = epilogueMaskFromCount(count - i);
        const __m256i epilogueAlphaMask = _mm256_blendv_epi8(_mm256_setzero_si256(), alphaMask, epilogueMask);
        __m256i dst1, dst2;
        __m256i srcVector = _mm256_maskload_epi32(reinterpret_cast<const int *>(src + i), epilogueMask);

        if (!_mm256_testz_si256(srcVector, epilogueAlphaMask)) {
            // keep the two _mm_test[zc]_siXXX next to each other
            bool cf = _mm256_testc_si256(srcVector, epilogueAlphaMask);
            if (!RGBA)
                srcVector = _mm256_shuffle_epi8(srcVector, rgbaMask);
            srcVector = _mm256_permute4x64_epi64(srcVector, _MM_SHUFFLE(3, 1, 2, 0));
            const __m256i src1 = _mm256_unpacklo_epi8(srcVector, srcVector);
            const __m256i src2 = _mm256_unpackhi_epi8(srcVector, srcVector);
            if (!cf) {
                const __m256i alpha1 = _mm256_shuffle_epi8(src1, shuffleMask);
                const __m256i alpha2 = _mm256_shuffle_epi8(src2, shuffleMask);
                dst1 = _mm256_mulhi_epu16(src1, alpha1);
                dst2 = _mm256_mulhi_epu16(src2, alpha2);
                dst1 = _mm256_add_epi16(dst1, _mm256_srli_epi16(dst1, 15));
                dst2 = _mm256_add_epi16(dst2, _mm256_srli_epi16(dst2, 15));
                dst1 = _mm256_blend_epi16(dst1, src1, 0x88);
                dst2 = _mm256_blend_epi16(dst2, src2, 0x88);
            } else {
                dst1 = src1;
                dst2 = src2;
            }
        } else {
            dst1 = dst2 = zero;
        }
        epilogueMask = _mm256_permute4x64_epi64(epilogueMask, _MM_SHUFFLE(3, 1, 2, 0));
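        // The pixels were widened to 64 bits each and reordered by the same
        // permute, so the 32-bit epilogue mask is permuted and unpacked in the
        // same way to cover one 64-bit store lane per remaining pixel.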
        _mm256_maskstore_epi64(reinterpret_cast<qint64 *>(buffer + i),
                               _mm256_unpacklo_epi32(epilogueMask, epilogueMask),
                               dst1);
        _mm256_maskstore_epi64(reinterpret_cast<qint64 *>(buffer + i + 4),
                               _mm256_unpackhi_epi32(epilogueMask, epilogueMask),
                               dst2);
    }
}

const QRgba64 * QT_FASTCALL convertARGB32ToRGBA64PM_avx2(QRgba64 *buffer, const uint *src, int count,
                                                         const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToRGBA64PM_avx2<false>(buffer, src, count);
    return buffer;
}

const QRgba64 * QT_FASTCALL convertRGBA8888ToRGBA64PM_avx2(QRgba64 *buffer, const uint *src, int count,
                                                           const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToRGBA64PM_avx2<true>(buffer, src, count);
    return buffer;
}

const QRgba64 *QT_FASTCALL fetchARGB32ToRGBA64PM_avx2(QRgba64 *buffer, const uchar *src, int index, int count,
                                                      const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToRGBA64PM_avx2<false>(buffer, reinterpret_cast<const uint *>(src) + index, count);
    return buffer;
}

const QRgba64 *QT_FASTCALL fetchRGBA8888ToRGBA64PM_avx2(QRgba64 *buffer, const uchar *src, int index, int count,
                                                        const QList<QRgb> *, QDitherInfo *)
{
    convertARGBToRGBA64PM_avx2<true>(buffer, reinterpret_cast<const uint *>(src) + index, count);
    return buffer;
}

const QRgba64 *QT_FASTCALL fetchRGBA64ToRGBA64PM_avx2(QRgba64 *buffer, const uchar *src, int index, int count,
                                                      const QList<QRgb> *, QDitherInfo *)
{
    const QRgba64 *s = reinterpret_cast<const QRgba64 *>(src) + index;
    int i = 0;
    const __m256i vh = _mm256_set1_epi32(0x8000);
    for (; i < count - 3; i += 4) {
        __m256i vs256 = _mm256_loadu_si256((const __m256i *)(s + i));
        __m256i va256 = _mm256_shufflelo_epi16(vs256, _MM_SHUFFLE(3, 3, 3, 3));
        va256 = _mm256_shufflehi_epi16(va256, _MM_SHUFFLE(3, 3, 3, 3));
        const __m256i vmullo = _mm256_mullo_epi16(vs256, va256);
        const __m256i vmulhi = _mm256_mulhi_epu16(vs256, va256);
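        // mullo/mulhi hold the low and high 16 bits of every 32-bit product;
        // interleaving them restores the full s * a, and
        // (x + (x >> 16) + 0x8000) >> 16 then approximates x / 65535 with
        // rounding.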
        __m256i vslo = _mm256_unpacklo_epi16(vmullo, vmulhi);
        __m256i vshi = _mm256_unpackhi_epi16(vmullo, vmulhi);
        vslo = _mm256_add_epi32(vslo, _mm256_srli_epi32(vslo, 16));
        vshi = _mm256_add_epi32(vshi, _mm256_srli_epi32(vshi, 16));
        vslo = _mm256_add_epi32(vslo, vh);
        vshi = _mm256_add_epi32(vshi, vh);
        vslo = _mm256_srli_epi32(vslo, 16);
        vshi = _mm256_srli_epi32(vshi, 16);
        vs256 = _mm256_packus_epi32(vslo, vshi);
        _mm256_storeu_si256((__m256i *)(buffer + i), vs256);
    }
    for (; i < count; ++i) {
        __m128i vs = _mm_loadl_epi64((const __m128i *)(s + i));
        __m128i va = _mm_shufflelo_epi16(vs, _MM_SHUFFLE(3, 3, 3, 3));
        vs = multiplyAlpha65535(vs, va);
        _mm_storel_epi64((__m128i *)(buffer + i), vs);
    }
    return buffer;
}

const uint *QT_FASTCALL fetchRGB16FToRGB32_avx2(uint *buffer, const uchar *src, int index, int count,
                                                const QList<QRgb> *, QDitherInfo *)
{
    const quint64 *s = reinterpret_cast<const quint64 *>(src) + index;
    const __m256 vf = _mm256_set1_ps(255.0f);
    const __m256 vh = _mm256_set1_ps(0.5f);
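    // Scale to 0..255 and add 0.5 before the truncating conversion so that
    // _mm256_cvttps_epi32 effectively rounds in-range values to nearest.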
    int i = 0;
    for (; i + 1 < count; i += 2) {
        __m256 vsf = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(s + i)));
        vsf = _mm256_mul_ps(vsf, vf);
        vsf = _mm256_add_ps(vsf, vh);
        __m256i vsi = _mm256_cvttps_epi32(vsf);
        vsi = _mm256_packs_epi32(vsi, vsi);
        vsi = _mm256_shufflelo_epi16(vsi, _MM_SHUFFLE(3, 0, 1, 2));
        vsi = _mm256_permute4x64_epi64(vsi, _MM_SHUFFLE(3, 1, 2, 0));
        __m128i vsi128 = _mm256_castsi256_si128(vsi);
        vsi128 = _mm_packus_epi16(vsi128, vsi128);
        _mm_storel_epi64((__m128i *)(buffer + i), vsi128);
    }
    if (i < count) {
        __m128 vsf = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)(s + i)));
        vsf = _mm_mul_ps(vsf, _mm_set1_ps(255.0f));
        vsf = _mm_add_ps(vsf, _mm_set1_ps(0.5f));
        __m128i vsi = _mm_cvttps_epi32(vsf);
        vsi = _mm_packs_epi32(vsi, vsi);
        vsi = _mm_shufflelo_epi16(vsi, _MM_SHUFFLE(3, 0, 1, 2));
        vsi = _mm_packus_epi16(vsi, vsi);
        buffer[i] = _mm_cvtsi128_si32(vsi);
    }
    return buffer;
}

const uint *QT_FASTCALL fetchRGBA16FToARGB32PM_avx2(uint *buffer, const uchar *src, int index, int count,
                                                    const QList<QRgb> *, QDitherInfo *)
{
    const quint64 *s = reinterpret_cast<const quint64 *>(src) + index;
    const __m256 vf = _mm256_set1_ps(255.0f);
    const __m256 vh = _mm256_set1_ps(0.5f);
    int i = 0;
    for (; i + 1 < count; i += 2) {
        __m256 vsf = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(s + i)));
        __m256 vsa = _mm256_permute_ps(vsf, _MM_SHUFFLE(3, 3, 3, 3));
        vsf = _mm256_mul_ps(vsf, vsa);
        vsf = _mm256_blend_ps(vsf, vsa, 0x88);
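        // The RGB channels were just premultiplied by alpha; the blend with
        // mask 0x88 restores the unmultiplied alpha into lane 3 of each pixel.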
        vsf = _mm256_mul_ps(vsf, vf);
        vsf = _mm256_add_ps(vsf, vh);
        __m256i vsi = _mm256_cvttps_epi32(vsf);
        vsi = _mm256_packus_epi32(vsi, vsi);
        vsi = _mm256_shufflelo_epi16(vsi, _MM_SHUFFLE(3, 0, 1, 2));
        vsi = _mm256_permute4x64_epi64(vsi, _MM_SHUFFLE(3, 1, 2, 0));
        __m128i vsi128 = _mm256_castsi256_si128(vsi);
        vsi128 = _mm_packus_epi16(vsi128, vsi128);
        _mm_storel_epi64((__m128i *)(buffer + i), vsi128);
    }
    if (i < count) {
        __m128 vsf = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)(s + i)));
        __m128 vsa = _mm_permute_ps(vsf, _MM_SHUFFLE(3, 3, 3, 3));
        vsf = _mm_mul_ps(vsf, vsa);
        vsf = _mm_insert_ps(vsf, vsa, 0x30);
        vsf = _mm_mul_ps(vsf, _mm_set1_ps(255.0f));
        vsf = _mm_add_ps(vsf, _mm_set1_ps(0.5f));
        __m128i vsi = _mm_cvttps_epi32(vsf);
        vsi = _mm_packus_epi32(vsi, vsi);
        vsi = _mm_shufflelo_epi16(vsi, _MM_SHUFFLE(3, 0, 1, 2));
        vsi = _mm_packus_epi16(vsi, vsi);
        buffer[i] = _mm_cvtsi128_si32(vsi);
    }
    return buffer;
}

const QRgba64 *QT_FASTCALL fetchRGBA16FPMToRGBA64PM_avx2(QRgba64 *buffer, const uchar *src, int index, int count,
                                                         const QList<QRgb> *, QDitherInfo *)
{
    const quint64 *s = reinterpret_cast<const quint64 *>(src) + index;
    const __m256 vf = _mm256_set1_ps(65535.0f);
    const __m256 vh = _mm256_set1_ps(0.5f);
    int i = 0;
    for (; i + 1 < count; i += 2) {
        __m256 vsf = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(s + i)));
        vsf = _mm256_mul_ps(vsf, vf);
        vsf = _mm256_add_ps(vsf, vh);
        __m256i vsi = _mm256_cvttps_epi32(vsf);
        vsi = _mm256_packus_epi32(vsi, vsi);
        vsi = _mm256_permute4x64_epi64(vsi, _MM_SHUFFLE(3, 1, 2, 0));
        _mm_storeu_si128((__m128i *)(buffer + i), _mm256_castsi256_si128(vsi));
    }
    if (i < count) {
        __m128 vsf = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)(s + i)));
        vsf = _mm_mul_ps(vsf, _mm_set1_ps(65535.0f));
        vsf = _mm_add_ps(vsf, _mm_set1_ps(0.5f));
        __m128i vsi = _mm_cvttps_epi32(vsf);
        vsi = _mm_packus_epi32(vsi, vsi);
        _mm_storel_epi64((__m128i *)(buffer + i), vsi);
    }
    return buffer;
}

const QRgba64 *QT_FASTCALL fetchRGBA16FToRGBA64PM_avx2(QRgba64 *buffer, const uchar *src, int index, int count,
                                                       const QList<QRgb> *, QDitherInfo *)
{
    const quint64 *s = reinterpret_cast<const quint64 *>(src) + index;
    const __m256 vf = _mm256_set1_ps(65535.0f);
    const __m256 vh = _mm256_set1_ps(0.5f);
    int i = 0;
    for (; i + 1 < count; i += 2) {
        __m256 vsf = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(s + i)));
        __m256 vsa = _mm256_shuffle_ps(vsf, vsf, _MM_SHUFFLE(3, 3, 3, 3));
        vsf = _mm256_mul_ps(vsf, vsa);
        vsf = _mm256_blend_ps(vsf, vsa, 0x88);
        vsf = _mm256_mul_ps(vsf, vf);
        vsf = _mm256_add_ps(vsf, vh);
        __m256i vsi = _mm256_cvttps_epi32(vsf);
        vsi = _mm256_packus_epi32(vsi, vsi);
        vsi = _mm256_permute4x64_epi64(vsi, _MM_SHUFFLE(3, 1, 2, 0));
        _mm_storeu_si128((__m128i *)(buffer + i), _mm256_castsi256_si128(vsi));
    }
    if (i < count) {
        __m128 vsf = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)(s + i)));
        __m128 vsa = _mm_shuffle_ps(vsf, vsf, _MM_SHUFFLE(3, 3, 3, 3));
        vsf = _mm_mul_ps(vsf, vsa);
        vsf = _mm_insert_ps(vsf, vsa, 0x30);
        vsf = _mm_mul_ps(vsf, _mm_set1_ps(65535.0f));
        vsf = _mm_add_ps(vsf, _mm_set1_ps(0.5f));
        __m128i vsi = _mm_cvttps_epi32(vsf);
        vsi = _mm_packus_epi32(vsi, vsi);
        _mm_storel_epi64((__m128i *)(buffer + i), vsi);
    }
    return buffer;
}

void QT_FASTCALL storeRGB16FFromRGB32_avx2(uchar *dest, const uint *src, int index, int count,
                                           const QList<QRgb> *, QDitherInfo *)
{
    quint64 *d = reinterpret_cast<quint64 *>(dest) + index;
    const __m256 vf = _mm256_set1_ps(1.0f / 255.0f);
    int i = 0;
    for (; i + 1 < count; i += 2) {
        __m256i vsi = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)(src + i)));
        vsi = _mm256_shuffle_epi32(vsi, _MM_SHUFFLE(3, 0, 1, 2));
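        // ARGB32 stores its bytes in B, G, R, A order in memory; the 32-bit
        // shuffle reorders the widened lanes to the R, G, B, A layout that
        // RGBA16F expects.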
        __m256 vsf = _mm256_cvtepi32_ps(vsi);
        vsf = _mm256_mul_ps(vsf, vf);
        _mm_storeu_si128((__m128i *)(d + i), _mm256_cvtps_ph(vsf, 0));
    }
    if (i < count) {
        __m128i vsi = _mm_cvtsi32_si128(src[i]);
        vsi = _mm_cvtepu8_epi32(vsi);
        vsi = _mm_shuffle_epi32(vsi, _MM_SHUFFLE(3, 0, 1, 2));
        __m128 vsf = _mm_cvtepi32_ps(vsi);
        vsf = _mm_mul_ps(vsf, _mm_set1_ps(1.0f / 255.0f));
        _mm_storel_epi64((__m128i *)(d + i), _mm_cvtps_ph(vsf, 0));
    }
}

void QT_FASTCALL storeRGBA16FFromARGB32PM_avx2(uchar *dest, const uint *src, int index, int count,
                                               const QList<QRgb> *, QDitherInfo *)
{
    quint64 *d = reinterpret_cast<quint64 *>(dest) + index;
    const __m128 vf = _mm_set1_ps(1.0f / 255.0f);
    for (int i = 0; i < count; ++i) {
        const uint s = src[i];
        __m128i vsi = _mm_cvtsi32_si128(s);
        vsi = _mm_cvtepu8_epi32(vsi);
        vsi = _mm_shuffle_epi32(vsi, _MM_SHUFFLE(3, 0, 1, 2));
        __m128 vsf = _mm_cvtepi32_ps(vsi);
        const uint8_t a = (s >> 24);
        if (a == 255)
            vsf = _mm_mul_ps(vsf, vf);
        else if (a == 0)
            vsf = _mm_set1_ps(0.0f);
        else {
            const __m128 vsa = _mm_permute_ps(vsf, _MM_SHUFFLE(3, 3, 3, 3));
            __m128 vsr = _mm_rcp_ps(vsa);
            vsr = _mm_sub_ps(_mm_add_ps(vsr, vsr), _mm_mul_ps(vsr, _mm_mul_ps(vsr, vsa)));
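            // One Newton-Raphson step (x1 = 2*x0 - a*x0*x0) sharpens the
            // _mm_rcp_ps estimate of 1/alpha before unpremultiplying; the same
            // refinement is used in the storeRGB*FromRGBA32F functions below.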
            vsr = _mm_insert_ps(vsr, _mm_set_ss(1.0f), 0x30);
            vsf = _mm_mul_ps(vsf, vsr);
        }
        _mm_storel_epi64((__m128i *)(d + i), _mm_cvtps_ph(vsf, 0));
    }
}

#if QT_CONFIG(raster_fp)
const QRgbaFloat32 *QT_FASTCALL fetchRGBA16FToRGBA32F_avx2(QRgbaFloat32 *buffer, const uchar *src, int index, int count,
                                                           const QList<QRgb> *, QDitherInfo *)
{
    const quint64 *s = reinterpret_cast<const quint64 *>(src) + index;
    int i = 0;
    for (; i + 1 < count; i += 2) {
        __m256 vsf = _mm256_cvtph_ps(_mm_loadu_si128((const __m128i *)(s + i)));
        __m256 vsa = _mm256_permute_ps(vsf, _MM_SHUFFLE(3, 3, 3, 3));
        vsf = _mm256_mul_ps(vsf, vsa);
        vsf = _mm256_blend_ps(vsf, vsa, 0x88);
        _mm256_storeu_ps((float *)(buffer + i), vsf);
    }
    if (i < count) {
        __m128 vsf = _mm_cvtph_ps(_mm_loadl_epi64((const __m128i *)(s + i)));
        __m128 vsa = _mm_permute_ps(vsf, _MM_SHUFFLE(3, 3, 3, 3));
        vsf = _mm_mul_ps(vsf, vsa);
        vsf = _mm_insert_ps(vsf, vsa, 0x30);
        _mm_store_ps((float *)(buffer + i), vsf);
    }
    return buffer;
}

void QT_FASTCALL storeRGBX16FFromRGBA32F_avx2(uchar *dest, const QRgbaFloat32 *src, int index, int count,
                                              const QList<QRgb> *, QDitherInfo *)
{
    quint64 *d = reinterpret_cast<quint64 *>(dest) + index;
    const __m128 *s = reinterpret_cast<const __m128 *>(src);
    const __m128 zero = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
    for (int i = 0; i < count; ++i) {
        __m128 vsf = _mm_load_ps(reinterpret_cast<const float *>(s + i));
        const __m128 vsa = _mm_permute_ps(vsf, _MM_SHUFFLE(3, 3, 3, 3));
        const float a = _mm_cvtss_f32(vsa);
        if (a == 1.0f)
        { }
        else if (a == 0.0f)
            vsf = zero;
        else {
            __m128 vsr = _mm_rcp_ps(vsa);
            vsr = _mm_sub_ps(_mm_add_ps(vsr, vsr), _mm_mul_ps(vsr, _mm_mul_ps(vsr, vsa)));
            vsf = _mm_mul_ps(vsf, vsr);
            vsf = _mm_insert_ps(vsf, _mm_set_ss(1.0f), 0x30);
        }
        _mm_storel_epi64((__m128i *)(d + i), _mm_cvtps_ph(vsf, 0));
    }
}

void QT_FASTCALL storeRGBA16FFromRGBA32F_avx2(uchar *dest, const QRgbaFloat32 *src, int index, int count,
                                              const QList<QRgb> *, QDitherInfo *)
{
    quint64 *d = reinterpret_cast<quint64 *>(dest) + index;
    const __m128 *s = reinterpret_cast<const __m128 *>(src);
    const __m128 zero = _mm_set1_ps(0.0f);
    for (int i = 0; i < count; ++i) {
        __m128 vsf = _mm_load_ps(reinterpret_cast<const float *>(s + i));
        const __m128 vsa = _mm_permute_ps(vsf, _MM_SHUFFLE(3, 3, 3, 3));
        const float a = _mm_cvtss_f32(vsa);
        if (a == 1.0f)
        { }
        else if (a == 0.0f)
            vsf = zero;
        else {
            __m128 vsr = _mm_rcp_ps(vsa);
            vsr = _mm_sub_ps(_mm_add_ps(vsr, vsr), _mm_mul_ps(vsr, _mm_mul_ps(vsr, vsa)));
            vsr = _mm_insert_ps(vsr, _mm_set_ss(1.0f), 0x30);
            vsf = _mm_mul_ps(vsf, vsr);
        }
        _mm_storel_epi64((__m128i *)(d + i), _mm_cvtps_ph(vsf, 0));
    }
}
#endif

QT_END_NAMESPACE

#endif
