// Copyright (C) 2016 The Qt Company Ltd.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only

#include <private/qdrawhelper_p.h>
#include <private/qdrawingprimitive_sse2_p.h>
#include <private/qpaintengine_raster_p.h>
#include <private/qpixellayout_p.h>

#if defined(QT_COMPILER_SUPPORTS_SSE4_1)

QT_BEGIN_NAMESPACE

13#ifndef __haswell__
14template<bool RGBA>
15static void convertARGBToARGB32PM_sse4(uint *buffer, const uint *src, int count)
16{
17 int i = 0;
18 const __m128i alphaMask = _mm_set1_epi32(i: 0xff000000);
19 const __m128i rgbaMask = _mm_setr_epi8(b0: 2, b1: 1, b2: 0, b3: 3, b4: 6, b5: 5, b6: 4, b7: 7, b8: 10, b9: 9, b10: 8, b11: 11, b12: 14, b13: 13, b14: 12, b15: 15);
20 const __m128i shuffleMask = _mm_setr_epi8(b0: 6, b1: 7, b2: 6, b3: 7, b4: 6, b5: 7, b6: 6, b7: 7, b8: 14, b9: 15, b10: 14, b11: 15, b12: 14, b13: 15, b14: 14, b15: 15);
21 const __m128i half = _mm_set1_epi16(w: 0x0080);
22 const __m128i zero = _mm_setzero_si128();
23
24 for (; i < count - 3; i += 4) {
25 __m128i srcVector = _mm_loadu_si128(p: (const __m128i *)&src[i]);
26 if (!_mm_testz_si128(M: srcVector, V: alphaMask)) {
27 if (!_mm_testc_si128(M: srcVector, V: alphaMask)) {
28 if (RGBA)
29 srcVector = _mm_shuffle_epi8(a: srcVector, b: rgbaMask);
30 __m128i src1 = _mm_unpacklo_epi8(a: srcVector, b: zero);
31 __m128i src2 = _mm_unpackhi_epi8(a: srcVector, b: zero);
32 __m128i alpha1 = _mm_shuffle_epi8(a: src1, b: shuffleMask);
33 __m128i alpha2 = _mm_shuffle_epi8(a: src2, b: shuffleMask);
34 src1 = _mm_mullo_epi16(a: src1, b: alpha1);
35 src2 = _mm_mullo_epi16(a: src2, b: alpha2);
36 src1 = _mm_add_epi16(a: src1, b: _mm_srli_epi16(a: src1, count: 8));
37 src2 = _mm_add_epi16(a: src2, b: _mm_srli_epi16(a: src2, count: 8));
38 src1 = _mm_add_epi16(a: src1, b: half);
39 src2 = _mm_add_epi16(a: src2, b: half);
40 src1 = _mm_srli_epi16(a: src1, count: 8);
41 src2 = _mm_srli_epi16(a: src2, count: 8);
42 src1 = _mm_blend_epi16(src1, alpha1, 0x88);
43 src2 = _mm_blend_epi16(src2, alpha2, 0x88);
44 srcVector = _mm_packus_epi16(a: src1, b: src2);
45 _mm_storeu_si128(p: (__m128i *)&buffer[i], b: srcVector);
46 } else {
47 if (RGBA)
48 _mm_storeu_si128(p: (__m128i *)&buffer[i], b: _mm_shuffle_epi8(a: srcVector, b: rgbaMask));
49 else if (buffer != src)
50 _mm_storeu_si128(p: (__m128i *)&buffer[i], b: srcVector);
51 }
52 } else {
53 _mm_storeu_si128(p: (__m128i *)&buffer[i], b: zero);
54 }
55 }
56
57 SIMD_EPILOGUE(i, count, 3) {
58 uint v = qPremultiply(x: src[i]);
59 buffer[i] = RGBA ? RGBA2ARGB(x: v) : v;
60 }
61}
62
63template<bool RGBA>
64static void convertARGBToRGBA64PM_sse4(QRgba64 *buffer, const uint *src, int count)
65{
66 int i = 0;
67 const __m128i alphaMask = _mm_set1_epi32(i: 0xff000000);
68 const __m128i rgbaMask = _mm_setr_epi8(b0: 2, b1: 1, b2: 0, b3: 3, b4: 6, b5: 5, b6: 4, b7: 7, b8: 10, b9: 9, b10: 8, b11: 11, b12: 14, b13: 13, b14: 12, b15: 15);
69 const __m128i shuffleMask = _mm_setr_epi8(b0: 6, b1: 7, b2: 6, b3: 7, b4: 6, b5: 7, b6: 6, b7: 7, b8: 14, b9: 15, b10: 14, b11: 15, b12: 14, b13: 15, b14: 14, b15: 15);
70 const __m128i zero = _mm_setzero_si128();
71
72 for (; i < count - 3; i += 4) {
73 __m128i srcVector = _mm_loadu_si128(p: (const __m128i *)&src[i]);
74 if (!_mm_testz_si128(M: srcVector, V: alphaMask)) {
75 bool cf = _mm_testc_si128(M: srcVector, V: alphaMask);
76
77 if (!RGBA)
78 srcVector = _mm_shuffle_epi8(a: srcVector, b: rgbaMask);
79 const __m128i src1 = _mm_unpacklo_epi8(a: srcVector, b: srcVector);
80 const __m128i src2 = _mm_unpackhi_epi8(a: srcVector, b: srcVector);
81 if (!cf) {
82 __m128i alpha1 = _mm_shuffle_epi8(a: src1, b: shuffleMask);
83 __m128i alpha2 = _mm_shuffle_epi8(a: src2, b: shuffleMask);
84 __m128i dst1 = _mm_mulhi_epu16(a: src1, b: alpha1);
85 __m128i dst2 = _mm_mulhi_epu16(a: src2, b: alpha2);
86 // Map 0->0xfffe to 0->0xffff
87 dst1 = _mm_add_epi16(a: dst1, b: _mm_srli_epi16(a: dst1, count: 15));
88 dst2 = _mm_add_epi16(a: dst2, b: _mm_srli_epi16(a: dst2, count: 15));
89 // correct alpha value:
90 dst1 = _mm_blend_epi16(dst1, src1, 0x88);
91 dst2 = _mm_blend_epi16(dst2, src2, 0x88);
92 _mm_storeu_si128(p: (__m128i *)&buffer[i], b: dst1);
93 _mm_storeu_si128(p: (__m128i *)&buffer[i + 2], b: dst2);
94 } else {
95 _mm_storeu_si128(p: (__m128i *)&buffer[i], b: src1);
96 _mm_storeu_si128(p: (__m128i *)&buffer[i + 2], b: src2);
97 }
98 } else {
99 _mm_storeu_si128(p: (__m128i *)&buffer[i], b: zero);
100 _mm_storeu_si128(p: (__m128i *)&buffer[i + 2], b: zero);
101 }
102 }
103
104 SIMD_EPILOGUE(i, count, 3) {
105 const uint s = RGBA ? RGBA2ARGB(x: src[i]) : src[i];
106 buffer[i] = QRgba64::fromArgb32(rgb: s).premultiplied();
107 }
108}
109#endif // __haswell__
110
111static inline __m128 Q_DECL_VECTORCALL reciprocal_mul_ps(__m128 a, float mul)
112{
113 __m128 ia = _mm_rcp_ps(a: a); // Approximate 1/a
114 // Improve precision of ia using Newton-Raphson
115 ia = _mm_sub_ps(a: _mm_add_ps(a: ia, b: ia), b: _mm_mul_ps(a: ia, b: _mm_mul_ps(a: ia, b: a)));
116 ia = _mm_mul_ps(a: ia, b: _mm_set1_ps(w: mul));
117 return ia;
118}
119
120template<bool RGBA, bool RGBx>
121static inline void convertARGBFromARGB32PM_sse4(uint *buffer, const uint *src, int count)
122{
123 int i = 0;
124 if ((_MM_GET_EXCEPTION_MASK() & _MM_MASK_INVALID) == 0) {
125 for (; i < count; ++i) {
126 uint v = qUnpremultiply(p: src[i]);
127 if (RGBx)
128 v = 0xff000000 | v;
129 if (RGBA)
130 v = ARGB2RGBA(x: v);
131 buffer[i] = v;
132 }
133 return;
134 }
135 const __m128i alphaMask = _mm_set1_epi32(i: 0xff000000);
136 const __m128i rgbaMask = _mm_setr_epi8(b0: 2, b1: 1, b2: 0, b3: 3, b4: 6, b5: 5, b6: 4, b7: 7, b8: 10, b9: 9, b10: 8, b11: 11, b12: 14, b13: 13, b14: 12, b15: 15);
137 const __m128i zero = _mm_setzero_si128();
138
139 for (; i < count - 3; i += 4) {
140 __m128i srcVector = _mm_loadu_si128(p: (const __m128i *)&src[i]);
141 if (!_mm_testz_si128(M: srcVector, V: alphaMask)) {
142 if (!_mm_testc_si128(M: srcVector, V: alphaMask)) {
143 __m128i srcVectorAlpha = _mm_srli_epi32(a: srcVector, count: 24);
144 if (RGBA)
145 srcVector = _mm_shuffle_epi8(a: srcVector, b: rgbaMask);
146 const __m128 a = _mm_cvtepi32_ps(a: srcVectorAlpha);
147 const __m128 ia = reciprocal_mul_ps(a, mul: 255.0f);
148 __m128i src1 = _mm_unpacklo_epi8(a: srcVector, b: zero);
149 __m128i src3 = _mm_unpackhi_epi8(a: srcVector, b: zero);
150 __m128i src2 = _mm_unpackhi_epi16(a: src1, b: zero);
151 __m128i src4 = _mm_unpackhi_epi16(a: src3, b: zero);
152 src1 = _mm_unpacklo_epi16(a: src1, b: zero);
153 src3 = _mm_unpacklo_epi16(a: src3, b: zero);
154 __m128 ia1 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(0, 0, 0, 0));
155 __m128 ia2 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(1, 1, 1, 1));
156 __m128 ia3 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(2, 2, 2, 2));
157 __m128 ia4 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(3, 3, 3, 3));
158 src1 = _mm_cvtps_epi32(a: _mm_mul_ps(a: _mm_cvtepi32_ps(a: src1), b: ia1));
159 src2 = _mm_cvtps_epi32(a: _mm_mul_ps(a: _mm_cvtepi32_ps(a: src2), b: ia2));
160 src3 = _mm_cvtps_epi32(a: _mm_mul_ps(a: _mm_cvtepi32_ps(a: src3), b: ia3));
161 src4 = _mm_cvtps_epi32(a: _mm_mul_ps(a: _mm_cvtepi32_ps(a: src4), b: ia4));
162 src1 = _mm_packus_epi32(V1: src1, V2: src2);
163 src3 = _mm_packus_epi32(V1: src3, V2: src4);
164 src1 = _mm_packus_epi16(a: src1, b: src3);
165 // Handle potential alpha == 0 values:
166 __m128i srcVectorAlphaMask = _mm_cmpeq_epi32(a: srcVectorAlpha, b: zero);
167 src1 = _mm_andnot_si128(a: srcVectorAlphaMask, b: src1);
168 // Fixup alpha values:
169 if (RGBx)
170 srcVector = _mm_or_si128(a: src1, b: alphaMask);
171 else
172 srcVector = _mm_blendv_epi8(V1: src1, V2: srcVector, M: alphaMask);
173 _mm_storeu_si128(p: (__m128i *)&buffer[i], b: srcVector);
174 } else {
175 if (RGBA)
176 _mm_storeu_si128(p: (__m128i *)&buffer[i], b: _mm_shuffle_epi8(a: srcVector, b: rgbaMask));
177 else if (buffer != src)
178 _mm_storeu_si128(p: (__m128i *)&buffer[i], b: srcVector);
179 }
180 } else {
181 if (RGBx)
182 _mm_storeu_si128(p: (__m128i *)&buffer[i], b: alphaMask);
183 else
184 _mm_storeu_si128(p: (__m128i *)&buffer[i], b: zero);
185 }
186 }
187
188 SIMD_EPILOGUE(i, count, 3) {
189 uint v = qUnpremultiply_sse4(p: src[i]);
190 if (RGBx)
191 v = 0xff000000 | v;
192 if (RGBA)
193 v = ARGB2RGBA(x: v);
194 buffer[i] = v;
195 }
196}
197
198template<bool RGBA>
199static inline void convertARGBFromRGBA64PM_sse4(uint *buffer, const QRgba64 *src, int count)
200{
201 int i = 0;
202 if ((_MM_GET_EXCEPTION_MASK() & _MM_MASK_INVALID) == 0) {
203 for (; i < count; ++i) {
204 const QRgba64 v = src[i].unpremultiplied();
205 buffer[i] = RGBA ? toRgba8888(rgba64: v) : toArgb32(rgba64: v);
206 }
207 return;
208 }
209 const __m128i alphaMask = _mm_set1_epi64x(q: qint64(Q_UINT64_C(0xffff) << 48));
210 const __m128i alphaMask32 = _mm_set1_epi32(i: 0xff000000);
211 const __m128i rgbaMask = _mm_setr_epi8(b0: 2, b1: 1, b2: 0, b3: 3, b4: 6, b5: 5, b6: 4, b7: 7, b8: 10, b9: 9, b10: 8, b11: 11, b12: 14, b13: 13, b14: 12, b15: 15);
212 const __m128i zero = _mm_setzero_si128();
213
214 for (; i < count - 3; i += 4) {
215 __m128i srcVector1 = _mm_loadu_si128(p: (const __m128i *)&src[i]);
216 __m128i srcVector2 = _mm_loadu_si128(p: (const __m128i *)&src[i + 2]);
217 bool transparent1 = _mm_testz_si128(M: srcVector1, V: alphaMask);
218 bool opaque1 = _mm_testc_si128(M: srcVector1, V: alphaMask);
219 bool transparent2 = _mm_testz_si128(M: srcVector2, V: alphaMask);
220 bool opaque2 = _mm_testc_si128(M: srcVector2, V: alphaMask);
221
222 if (!(transparent1 && transparent2)) {
223 if (!(opaque1 && opaque2)) {
224 __m128i srcVector1Alpha = _mm_srli_epi64(a: srcVector1, count: 48);
225 __m128i srcVector2Alpha = _mm_srli_epi64(a: srcVector2, count: 48);
226 __m128i srcVectorAlpha = _mm_packus_epi32(V1: srcVector1Alpha, V2: srcVector2Alpha);
227 const __m128 a = _mm_cvtepi32_ps(a: srcVectorAlpha);
228 // Convert srcVectorAlpha to final 8-bit alpha channel
229 srcVectorAlpha = _mm_add_epi32(a: srcVectorAlpha, b: _mm_set1_epi32(i: 128));
230 srcVectorAlpha = _mm_sub_epi32(a: srcVectorAlpha, b: _mm_srli_epi32(a: srcVectorAlpha, count: 8));
231 srcVectorAlpha = _mm_srli_epi32(a: srcVectorAlpha, count: 8);
232 srcVectorAlpha = _mm_slli_epi32(a: srcVectorAlpha, count: 24);
233 const __m128 ia = reciprocal_mul_ps(a, mul: 255.0f);
234 __m128i src1 = _mm_unpacklo_epi16(a: srcVector1, b: zero);
235 __m128i src2 = _mm_unpackhi_epi16(a: srcVector1, b: zero);
236 __m128i src3 = _mm_unpacklo_epi16(a: srcVector2, b: zero);
237 __m128i src4 = _mm_unpackhi_epi16(a: srcVector2, b: zero);
238 __m128 ia1 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(0, 0, 0, 0));
239 __m128 ia2 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(1, 1, 1, 1));
240 __m128 ia3 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(2, 2, 2, 2));
241 __m128 ia4 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(3, 3, 3, 3));
242 src1 = _mm_cvtps_epi32(a: _mm_mul_ps(a: _mm_cvtepi32_ps(a: src1), b: ia1));
243 src2 = _mm_cvtps_epi32(a: _mm_mul_ps(a: _mm_cvtepi32_ps(a: src2), b: ia2));
244 src3 = _mm_cvtps_epi32(a: _mm_mul_ps(a: _mm_cvtepi32_ps(a: src3), b: ia3));
245 src4 = _mm_cvtps_epi32(a: _mm_mul_ps(a: _mm_cvtepi32_ps(a: src4), b: ia4));
246 src1 = _mm_packus_epi32(V1: src1, V2: src2);
247 src3 = _mm_packus_epi32(V1: src3, V2: src4);
248 // Handle potential alpha == 0 values:
249 __m128i srcVector1AlphaMask = _mm_cmpeq_epi64(V1: srcVector1Alpha, V2: zero);
250 __m128i srcVector2AlphaMask = _mm_cmpeq_epi64(V1: srcVector2Alpha, V2: zero);
251 src1 = _mm_andnot_si128(a: srcVector1AlphaMask, b: src1);
252 src3 = _mm_andnot_si128(a: srcVector2AlphaMask, b: src3);
253 src1 = _mm_packus_epi16(a: src1, b: src3);
254 // Fixup alpha values:
255 src1 = _mm_blendv_epi8(V1: src1, V2: srcVectorAlpha, M: alphaMask32);
256 // Fix RGB order
257 if (!RGBA)
258 src1 = _mm_shuffle_epi8(a: src1, b: rgbaMask);
259 _mm_storeu_si128(p: (__m128i *)&buffer[i], b: src1);
260 } else {
261 __m128i src1 = _mm_unpacklo_epi16(a: srcVector1, b: zero);
262 __m128i src2 = _mm_unpackhi_epi16(a: srcVector1, b: zero);
263 __m128i src3 = _mm_unpacklo_epi16(a: srcVector2, b: zero);
264 __m128i src4 = _mm_unpackhi_epi16(a: srcVector2, b: zero);
265 src1 = _mm_add_epi32(a: src1, b: _mm_set1_epi32(i: 128));
266 src2 = _mm_add_epi32(a: src2, b: _mm_set1_epi32(i: 128));
267 src3 = _mm_add_epi32(a: src3, b: _mm_set1_epi32(i: 128));
268 src4 = _mm_add_epi32(a: src4, b: _mm_set1_epi32(i: 128));
269 src1 = _mm_sub_epi32(a: src1, b: _mm_srli_epi32(a: src1, count: 8));
270 src2 = _mm_sub_epi32(a: src2, b: _mm_srli_epi32(a: src2, count: 8));
271 src3 = _mm_sub_epi32(a: src3, b: _mm_srli_epi32(a: src3, count: 8));
272 src4 = _mm_sub_epi32(a: src4, b: _mm_srli_epi32(a: src4, count: 8));
273 src1 = _mm_srli_epi32(a: src1, count: 8);
274 src2 = _mm_srli_epi32(a: src2, count: 8);
275 src3 = _mm_srli_epi32(a: src3, count: 8);
276 src4 = _mm_srli_epi32(a: src4, count: 8);
277 src1 = _mm_packus_epi32(V1: src1, V2: src2);
278 src3 = _mm_packus_epi32(V1: src3, V2: src4);
279 src1 = _mm_packus_epi16(a: src1, b: src3);
280 if (!RGBA)
281 src1 = _mm_shuffle_epi8(a: src1, b: rgbaMask);
282 _mm_storeu_si128(p: (__m128i *)&buffer[i], b: src1);
283 }
284 } else {
285 _mm_storeu_si128(p: (__m128i *)&buffer[i], b: zero);
286 }
287 }
288
289 SIMD_EPILOGUE(i, count, 3) {
290 buffer[i] = qConvertRgba64ToRgb32_sse4<RGBA ? PixelOrderRGB : PixelOrderBGR>(src[i]);
291 }
292}
293
294template<bool mask>
295static inline void convertRGBA64FromRGBA64PM_sse4(QRgba64 *buffer, const QRgba64 *src, int count)
296{
297 int i = 0;
298 if ((_MM_GET_EXCEPTION_MASK() & _MM_MASK_INVALID) == 0) {
299 for (; i < count; ++i) {
300 QRgba64 v = src[i].unpremultiplied();
301 if (mask)
302 v.setAlpha(65535);
303 buffer[i] = v;
304 }
305 return;
306 }
307 const __m128i alphaMask = _mm_set1_epi64x(q: qint64(Q_UINT64_C(0xffff) << 48));
308 const __m128i zero = _mm_setzero_si128();
309
310 for (; i < count - 3; i += 4) {
311 __m128i srcVector1 = _mm_loadu_si128(p: (const __m128i *)&src[i + 0]);
312 __m128i srcVector2 = _mm_loadu_si128(p: (const __m128i *)&src[i + 2]);
313 bool transparent1 = _mm_testz_si128(M: srcVector1, V: alphaMask);
314 bool opaque1 = _mm_testc_si128(M: srcVector1, V: alphaMask);
315 bool transparent2 = _mm_testz_si128(M: srcVector2, V: alphaMask);
316 bool opaque2 = _mm_testc_si128(M: srcVector2, V: alphaMask);
317
318 if (!(transparent1 && transparent2)) {
319 if (!(opaque1 && opaque2)) {
320 __m128i srcVector1Alpha = _mm_srli_epi64(a: srcVector1, count: 48);
321 __m128i srcVector2Alpha = _mm_srli_epi64(a: srcVector2, count: 48);
322 __m128i srcVectorAlpha = _mm_packus_epi32(V1: srcVector1Alpha, V2: srcVector2Alpha);
323 const __m128 a = _mm_cvtepi32_ps(a: srcVectorAlpha);
324 const __m128 ia = reciprocal_mul_ps(a, mul: 65535.0f);
325 __m128i src1 = _mm_unpacklo_epi16(a: srcVector1, b: zero);
326 __m128i src2 = _mm_unpackhi_epi16(a: srcVector1, b: zero);
327 __m128i src3 = _mm_unpacklo_epi16(a: srcVector2, b: zero);
328 __m128i src4 = _mm_unpackhi_epi16(a: srcVector2, b: zero);
329 __m128 ia1 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(0, 0, 0, 0));
330 __m128 ia2 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(1, 1, 1, 1));
331 __m128 ia3 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(2, 2, 2, 2));
332 __m128 ia4 = _mm_shuffle_ps(ia, ia, _MM_SHUFFLE(3, 3, 3, 3));
333 src1 = _mm_cvtps_epi32(a: _mm_mul_ps(a: _mm_cvtepi32_ps(a: src1), b: ia1));
334 src2 = _mm_cvtps_epi32(a: _mm_mul_ps(a: _mm_cvtepi32_ps(a: src2), b: ia2));
335 src3 = _mm_cvtps_epi32(a: _mm_mul_ps(a: _mm_cvtepi32_ps(a: src3), b: ia3));
336 src4 = _mm_cvtps_epi32(a: _mm_mul_ps(a: _mm_cvtepi32_ps(a: src4), b: ia4));
337 src1 = _mm_packus_epi32(V1: src1, V2: src2);
338 src3 = _mm_packus_epi32(V1: src3, V2: src4);
339 // Handle potential alpha == 0 values:
340 __m128i srcVector1AlphaMask = _mm_cmpeq_epi64(V1: srcVector1Alpha, V2: zero);
341 __m128i srcVector2AlphaMask = _mm_cmpeq_epi64(V1: srcVector2Alpha, V2: zero);
342 src1 = _mm_andnot_si128(a: srcVector1AlphaMask, b: src1);
343 src3 = _mm_andnot_si128(a: srcVector2AlphaMask, b: src3);
344 // Fixup alpha values:
345 if (mask) {
346 src1 = _mm_or_si128(a: src1, b: alphaMask);
347 src3 = _mm_or_si128(a: src3, b: alphaMask);
348 } else {
349 src1 = _mm_blendv_epi8(V1: src1, V2: srcVector1, M: alphaMask);
350 src3 = _mm_blendv_epi8(V1: src3, V2: srcVector2, M: alphaMask);
351 }
352 _mm_storeu_si128(p: (__m128i *)&buffer[i + 0], b: src1);
353 _mm_storeu_si128(p: (__m128i *)&buffer[i + 2], b: src3);
354 } else {
355 if (mask) {
356 srcVector1 = _mm_or_si128(a: srcVector1, b: alphaMask);
357 srcVector2 = _mm_or_si128(a: srcVector2, b: alphaMask);
358 }
359 if (mask || src != buffer) {
360 _mm_storeu_si128(p: (__m128i *)&buffer[i + 0], b: srcVector1);
361 _mm_storeu_si128(p: (__m128i *)&buffer[i + 2], b: srcVector2);
362 }
363 }
364 } else {
365 _mm_storeu_si128(p: (__m128i *)&buffer[i + 0], b: zero);
366 _mm_storeu_si128(p: (__m128i *)&buffer[i + 2], b: zero);
367 }
368 }
369
370 SIMD_EPILOGUE(i, count, 3) {
371 QRgba64 v = src[i].unpremultiplied();
372 if (mask)
373 v.setAlpha(65535);
374 buffer[i] = v;
375 }
376}
377
378#ifndef __haswell__
379void QT_FASTCALL convertARGB32ToARGB32PM_sse4(uint *buffer, int count, const QList<QRgb> *)
380{
381 convertARGBToARGB32PM_sse4<false>(buffer, src: buffer, count);
382}
383
384void QT_FASTCALL convertRGBA8888ToARGB32PM_sse4(uint *buffer, int count, const QList<QRgb> *)
385{
386 convertARGBToARGB32PM_sse4<true>(buffer, src: buffer, count);
387}
388
389const QRgba64 * QT_FASTCALL convertARGB32ToRGBA64PM_sse4(QRgba64 *buffer, const uint *src, int count,
390 const QList<QRgb> *, QDitherInfo *)
391{
392 convertARGBToRGBA64PM_sse4<false>(buffer, src, count);
393 return buffer;
394}
395
396const QRgba64 * QT_FASTCALL convertRGBA8888ToRGBA64PM_sse4(QRgba64 *buffer, const uint *src, int count,
397 const QList<QRgb> *, QDitherInfo *)
398{
399 convertARGBToRGBA64PM_sse4<true>(buffer, src, count);
400 return buffer;
401}
402
403const uint *QT_FASTCALL fetchARGB32ToARGB32PM_sse4(uint *buffer, const uchar *src, int index, int count,
404 const QList<QRgb> *, QDitherInfo *)
405{
406 convertARGBToARGB32PM_sse4<false>(buffer, src: reinterpret_cast<const uint *>(src) + index, count);
407 return buffer;
408}
409
410const uint *QT_FASTCALL fetchRGBA8888ToARGB32PM_sse4(uint *buffer, const uchar *src, int index, int count,
411 const QList<QRgb> *, QDitherInfo *)
412{
413 convertARGBToARGB32PM_sse4<true>(buffer, src: reinterpret_cast<const uint *>(src) + index, count);
414 return buffer;
415}
416
417const QRgba64 *QT_FASTCALL fetchARGB32ToRGBA64PM_sse4(QRgba64 *buffer, const uchar *src, int index, int count,
418 const QList<QRgb> *, QDitherInfo *)
419{
420 convertARGBToRGBA64PM_sse4<false>(buffer, src: reinterpret_cast<const uint *>(src) + index, count);
421 return buffer;
422}
423
424const QRgba64 *QT_FASTCALL fetchRGBA8888ToRGBA64PM_sse4(QRgba64 *buffer, const uchar *src, int index, int count,
425 const QList<QRgb> *, QDitherInfo *)
426{
427 convertARGBToRGBA64PM_sse4<true>(buffer, src: reinterpret_cast<const uint *>(src) + index, count);
428 return buffer;
429}
430#endif // __haswell__
431
432void QT_FASTCALL storeRGB32FromARGB32PM_sse4(uchar *dest, const uint *src, int index, int count,
433 const QList<QRgb> *, QDitherInfo *)
434{
435 uint *d = reinterpret_cast<uint *>(dest) + index;
436 convertARGBFromARGB32PM_sse4<false,true>(buffer: d, src, count);
437}
438
439void QT_FASTCALL storeARGB32FromARGB32PM_sse4(uchar *dest, const uint *src, int index, int count,
440 const QList<QRgb> *, QDitherInfo *)
441{
442 uint *d = reinterpret_cast<uint *>(dest) + index;
443 convertARGBFromARGB32PM_sse4<false,false>(buffer: d, src, count);
444}
445
446void QT_FASTCALL storeRGBA8888FromARGB32PM_sse4(uchar *dest, const uint *src, int index, int count,
447 const QList<QRgb> *, QDitherInfo *)
448{
449 uint *d = reinterpret_cast<uint *>(dest) + index;
450 convertARGBFromARGB32PM_sse4<true,false>(buffer: d, src, count);
451}
452
453void QT_FASTCALL storeRGBXFromARGB32PM_sse4(uchar *dest, const uint *src, int index, int count,
454 const QList<QRgb> *, QDitherInfo *)
455{
456 uint *d = reinterpret_cast<uint *>(dest) + index;
457 convertARGBFromARGB32PM_sse4<true,true>(buffer: d, src, count);
458}
459
460template<QtPixelOrder PixelOrder>
461void QT_FASTCALL storeA2RGB30PMFromARGB32PM_sse4(uchar *dest, const uint *src, int index, int count,
462 const QList<QRgb> *, QDitherInfo *)
463{
464 uint *d = reinterpret_cast<uint *>(dest) + index;
465 for (int i = 0; i < count; ++i)
466 d[i] = qConvertArgb32ToA2rgb30_sse4<PixelOrder>(src[i]);
467}
468
469template
470void QT_FASTCALL storeA2RGB30PMFromARGB32PM_sse4<PixelOrderBGR>(uchar *dest, const uint *src, int index, int count,
471 const QList<QRgb> *, QDitherInfo *);
472template
473void QT_FASTCALL storeA2RGB30PMFromARGB32PM_sse4<PixelOrderRGB>(uchar *dest, const uint *src, int index, int count,
474 const QList<QRgb> *, QDitherInfo *);
475
476#if QT_CONFIG(raster_64bit)
477void QT_FASTCALL destStore64ARGB32_sse4(QRasterBuffer *rasterBuffer, int x, int y, const QRgba64 *buffer, int length)
478{
479 uint *dest = (uint*)rasterBuffer->scanLine(y) + x;
480 convertARGBFromRGBA64PM_sse4<false>(buffer: dest, src: buffer, count: length);
481}
482
483void QT_FASTCALL destStore64RGBA8888_sse4(QRasterBuffer *rasterBuffer, int x, int y, const QRgba64 *buffer, int length)
484{
485 uint *dest = (uint*)rasterBuffer->scanLine(y) + x;
486 convertARGBFromRGBA64PM_sse4<true>(buffer: dest, src: buffer, count: length);
487}
488#endif
489
490void QT_FASTCALL storeARGB32FromRGBA64PM_sse4(uchar *dest, const QRgba64 *src, int index, int count,
491 const QList<QRgb> *, QDitherInfo *)
492{
493 uint *d = (uint*)dest + index;
494 convertARGBFromRGBA64PM_sse4<false>(buffer: d, src, count);
495}
496
497void QT_FASTCALL storeRGBA8888FromRGBA64PM_sse4(uchar *dest, const QRgba64 *src, int index, int count,
498 const QList<QRgb> *, QDitherInfo *)
499{
500 uint *d = (uint*)dest + index;
501 convertARGBFromRGBA64PM_sse4<true>(buffer: d, src, count);
502}
503
504void QT_FASTCALL storeRGBA64FromRGBA64PM_sse4(uchar *dest, const QRgba64 *src, int index, int count,
505 const QList<QRgb> *, QDitherInfo *)
506{
507 QRgba64 *d = (QRgba64 *)dest + index;
508 convertRGBA64FromRGBA64PM_sse4<false>(buffer: d, src, count);
509}
510
511void QT_FASTCALL storeRGBx64FromRGBA64PM_sse4(uchar *dest, const QRgba64 *src, int index, int count,
512 const QList<QRgb> *, QDitherInfo *)
513{
514 QRgba64 *d = (QRgba64 *)dest + index;
515 convertRGBA64FromRGBA64PM_sse4<true>(buffer: d, src, count);
516}
517
518#if QT_CONFIG(raster_fp)
519const QRgbaFloat32 *QT_FASTCALL fetchRGBA32FToRGBA32F_sse4(QRgbaFloat32 *buffer, const uchar *src, int index, int count,
520 const QList<QRgb> *, QDitherInfo *)
521{
522 const QRgbaFloat32 *s = reinterpret_cast<const QRgbaFloat32 *>(src) + index;
523 for (int i = 0; i < count; ++i) {
524 __m128 vsf = _mm_load_ps(p: reinterpret_cast<const float *>(s + i));
525 __m128 vsa = _mm_shuffle_ps(vsf, vsf, _MM_SHUFFLE(3, 3, 3, 3));
526 vsf = _mm_mul_ps(a: vsf, b: vsa);
527 vsf = _mm_insert_ps(vsf, vsa, 0x30);
528 _mm_store_ps(p: reinterpret_cast<float *>(buffer + i), a: vsf);
529 }
530 return buffer;
531}
532
533void QT_FASTCALL storeRGBX32FFromRGBA32F_sse4(uchar *dest, const QRgbaFloat32 *src, int index, int count,
534 const QList<QRgb> *, QDitherInfo *)
535{
536 QRgbaFloat32 *d = reinterpret_cast<QRgbaFloat32 *>(dest) + index;
537 const __m128 zero = _mm_set_ps(z: 1.0f, y: 0.0f, x: 0.0f, w: 0.0f);
538 for (int i = 0; i < count; ++i) {
539 __m128 vsf = _mm_load_ps(p: reinterpret_cast<const float *>(src + i));
540 const __m128 vsa = _mm_shuffle_ps(vsf, vsf, _MM_SHUFFLE(3, 3, 3, 3));
541 const float a = _mm_cvtss_f32(a: vsa);
542 if (a == 1.0f)
543 { }
544 else if (a == 0.0f)
545 vsf = zero;
546 else {
547 __m128 vsr = _mm_rcp_ps(a: vsa);
548 vsr = _mm_sub_ps(a: _mm_add_ps(a: vsr, b: vsr), b: _mm_mul_ps(a: vsr, b: _mm_mul_ps(a: vsr, b: vsa)));
549 vsf = _mm_mul_ps(a: vsf, b: vsr);
550 vsf = _mm_insert_ps(vsf, _mm_set_ss(1.0f), 0x30);
551 }
552 _mm_store_ps(p: reinterpret_cast<float *>(d + i), a: vsf);
553 }
554}
555
556void QT_FASTCALL storeRGBA32FFromRGBA32F_sse4(uchar *dest, const QRgbaFloat32 *src, int index, int count,
557 const QList<QRgb> *, QDitherInfo *)
558{
559 QRgbaFloat32 *d = reinterpret_cast<QRgbaFloat32 *>(dest) + index;
560 const __m128 zero = _mm_set1_ps(w: 0.0f);
561 for (int i = 0; i < count; ++i) {
562 __m128 vsf = _mm_load_ps(p: reinterpret_cast<const float *>(src + i));
563 const __m128 vsa = _mm_shuffle_ps(vsf, vsf, _MM_SHUFFLE(3, 3, 3, 3));
564 const float a = _mm_cvtss_f32(a: vsa);
565 if (a == 1.0f)
566 { }
567 else if (a == 0.0f)
568 vsf = zero;
569 else {
570 __m128 vsr = _mm_rcp_ps(a: vsa);
571 vsr = _mm_sub_ps(a: _mm_add_ps(a: vsr, b: vsr), b: _mm_mul_ps(a: vsr, b: _mm_mul_ps(a: vsr, b: vsa)));
572 vsr = _mm_insert_ps(vsr, _mm_set_ss(1.0f), 0x30);
573 vsf = _mm_mul_ps(a: vsf, b: vsr);
574 }
575 _mm_store_ps(p: reinterpret_cast<float *>(d + i), a: vsf);
576 }
577}
578#endif
579
580
QT_END_NAMESPACE

#endif


// source code of qtbase/src/gui/painting/qdrawhelper_sse4.cpp