// Copyright (C) 2018 The Qt Company Ltd.
// Copyright (C) 2018 Intel Corporation.
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only

#include <private/qdrawhelper_x86_p.h>

#if defined(QT_COMPILER_SUPPORTS_SSSE3)

#include <private/qdrawingprimitive_sse2_p.h>

QT_BEGIN_NAMESPACE
/* The instruction palignr takes its shift count as an immediate operand, so we have to generate
   the code for each of the different shifts (4, 8, 12). Checking the alignment inside the loop
   is unfortunately way too slow.
 */
#define BLENDING_LOOP(palignrOffset, length)\
    for (; x-minusOffsetToAlignSrcOn16Bytes < length-7; x += 4) { \
        const __m128i srcVectorLastLoaded = _mm_load_si128((const __m128i *)&src[x - minusOffsetToAlignSrcOn16Bytes + 4]);\
        const __m128i srcVector = _mm_alignr_epi8(srcVectorLastLoaded, srcVectorPrevLoaded, palignrOffset); \
        const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask); \
        if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) { \
            _mm_store_si128((__m128i *)&dst[x], srcVector); \
        } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) { \
            __m128i alphaChannel = _mm_shuffle_epi8(srcVector, alphaShuffleMask); \
            alphaChannel = _mm_sub_epi16(one, alphaChannel); \
            const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]); \
            __m128i destMultipliedByOneMinusAlpha; \
            BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half); \
            const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha); \
            _mm_store_si128((__m128i *)&dst[x], result); \
        } \
        srcVectorPrevLoaded = srcVectorLastLoaded;\
    }
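
/* A minimal sketch (not used below) of the palignr step above, assuming the
   source is misaligned by one pixel, i.e. a shift immediate of 4: with lo
   holding pixels [p0 p1 p2 p3] and hi holding [p4 p5 p6 p7], the result is
   [p1 p2 p3 p4] -- four contiguous pixels assembled from two aligned loads.
   Since the byte count of _mm_alignr_epi8 must be a compile-time constant,
   BLENDING_LOOP is expanded once per possible offset in the switch below. */
[[maybe_unused]] static inline __m128i qAlignrByOnePixelExample(__m128i hi, __m128i lo)
{
    return _mm_alignr_epi8(hi, lo, 4);
}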


// Basically blend src over dst; this is the const_alpha == 256 path, so no constant-alpha vector is needed.
// nullVector, half, one, colorMask and alphaMask are constant across the whole image/texture, and should be defined as:
//const __m128i nullVector = _mm_set1_epi32(0);
//const __m128i half = _mm_set1_epi16(0x80);
//const __m128i one = _mm_set1_epi16(0xff);
//const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
//const __m128i alphaMask = _mm_set1_epi32(0xff000000);
//
// The computation being done is:
// result = s + d * (1-alpha)
// with shortcuts if fully opaque or fully transparent.
static inline void Q_DECL_VECTORCALL
BLEND_SOURCE_OVER_ARGB32_SSSE3(quint32 *dst, const quint32 *src, int length,
                               __m128i nullVector, __m128i half, __m128i one, __m128i colorMask, __m128i alphaMask)
{
    int x = 0;

    /* First, get dst aligned. */
    ALIGNMENT_PROLOGUE_16BYTES(dst, x, length) {
        blend_pixel(dst[x], src[x]);
    }

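    /* src[x - minusOffsetToAlignSrcOn16Bytes] is the closest 16-byte-aligned
       address at or before src[x]; the offset is measured in whole pixels
       (0..3), so the source loads below can all be aligned. */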
    const int minusOffsetToAlignSrcOn16Bytes = (reinterpret_cast<quintptr>(&(src[x])) >> 2) & 0x3;

    if (!minusOffsetToAlignSrcOn16Bytes) {
        /* src is aligned, usual algorithm but with aligned operations.
           See the SSE2 version for more documentation on the algorithm itself. */
        const __m128i alphaShuffleMask = _mm_set_epi8(char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3);
        for (; x < length-3; x += 4) {
            const __m128i srcVector = _mm_load_si128((const __m128i *)&src[x]);
            const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask);
            if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) {
                _mm_store_si128((__m128i *)&dst[x], srcVector);
            } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) != 0xffff) {
                __m128i alphaChannel = _mm_shuffle_epi8(srcVector, alphaShuffleMask);
                alphaChannel = _mm_sub_epi16(one, alphaChannel);
                const __m128i dstVector = _mm_load_si128((__m128i *)&dst[x]);
                __m128i destMultipliedByOneMinusAlpha;
                BYTE_MUL_SSE2(destMultipliedByOneMinusAlpha, dstVector, alphaChannel, colorMask, half);
                const __m128i result = _mm_add_epi8(srcVector, destMultipliedByOneMinusAlpha);
                _mm_store_si128((__m128i *)&dst[x], result);
            }
        } /* end for() */
    } else if ((length - x) >= 8) {
        /* We use two vectors to extract the src: prevLoaded for the first pixels, lastLoaded for the current pixels. */
        __m128i srcVectorPrevLoaded = _mm_load_si128((const __m128i *)&src[x - minusOffsetToAlignSrcOn16Bytes]);
        const int palignrOffset = minusOffsetToAlignSrcOn16Bytes << 2;

        const __m128i alphaShuffleMask = _mm_set_epi8(char(0xff),15,char(0xff),15,char(0xff),11,char(0xff),11,char(0xff),7,char(0xff),7,char(0xff),3,char(0xff),3);
        switch (palignrOffset) {
        case 4:
            BLENDING_LOOP(4, length)
            break;
        case 8:
            BLENDING_LOOP(8, length)
            break;
        case 12:
            BLENDING_LOOP(12, length)
            break;
        }
    }
    for (; x < length; ++x)
        blend_pixel(dst[x], src[x]);
}
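
/* For reference, a scalar sketch of the per-pixel operation the loops above
   vectorize. blendOnePixelExample is a hypothetical name for illustration,
   not a Qt API -- the real scalar paths use blend_pixel() -- and it assumes
   BYTE_MUL() from qdrawhelper_p.h is in scope. */
[[maybe_unused]] static inline quint32 blendOnePixelExample(quint32 d, quint32 s)
{
    const uint alpha = s >> 24;
    if (alpha == 255)       // fully opaque: the source wins outright
        return s;
    if (alpha == 0)         // fully transparent: the destination is kept
        return d;
    return s + BYTE_MUL(d, 255 - alpha); // result = s + d * (1 - alpha), premultiplied
}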

void qt_blend_argb32_on_argb32_ssse3(uchar *destPixels, int dbpl,
                                     const uchar *srcPixels, int sbpl,
                                     int w, int h,
                                     int const_alpha)
{
    const quint32 *src = (const quint32 *) srcPixels;
    quint32 *dst = (quint32 *) destPixels;
    if (const_alpha == 256) {
        const __m128i alphaMask = _mm_set1_epi32(0xff000000);
        const __m128i nullVector = _mm_setzero_si128();
        const __m128i half = _mm_set1_epi16(0x80);
        const __m128i one = _mm_set1_epi16(0xff);
        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);

        for (int y = 0; y < h; ++y) {
            BLEND_SOURCE_OVER_ARGB32_SSSE3(dst, src, w, nullVector, half, one, colorMask, alphaMask);
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    } else if (const_alpha != 0) {
        // dest = (s + d * sia) * ca + d * cia
        //      = s * ca + d * (sia * ca + cia)
        //      = s * ca + d * (1 - sa*ca)
        const_alpha = (const_alpha * 255) >> 8; // rescale from 0..256 to 0..255 for the byte multiplies
        const __m128i nullVector = _mm_setzero_si128();
        const __m128i half = _mm_set1_epi16(0x80);
        const __m128i one = _mm_set1_epi16(0xff);
        const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);
        const __m128i constAlphaVector = _mm_set1_epi16(const_alpha);
        for (int y = 0; y < h; ++y) {
            BLEND_SOURCE_OVER_ARGB32_WITH_CONST_ALPHA_SSE2(dst, src, w, nullVector, half, one, colorMask, constAlphaVector)
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    }
    // (const_alpha == 0 means the source is fully transparent: nothing to blend)
}

// Fetch count 24-bit pixels starting at src[index], widening each one to 32 bits.
const uint *QT_FASTCALL fetchPixelsBPP24_ssse3(uint *buffer, const uchar *src, int index, int count)
{
    const quint24 *s = reinterpret_cast<const quint24 *>(src);
    for (int i = 0; i < count; ++i)
        buffer[i] = s[index + i];
    return buffer;
}

extern void QT_FASTCALL qt_convert_rgb888_to_rgb32_ssse3(quint32 *dst, const uchar *src, int len);

const uint * QT_FASTCALL qt_fetchUntransformed_888_ssse3(uint *buffer, const Operator *, const QSpanData *data,
                                                         int y, int x, int length)
{
    const uchar *line = data->texture.scanLine(y) + x * 3;
    qt_convert_rgb888_to_rgb32_ssse3(buffer, line, length);
    return buffer;
}

void qt_memfill24_ssse3(quint24 *dest, quint24 color, qsizetype count)
{
    // LCM of 12 and 16 bytes is 48 bytes (16 px)
    quint32 v = color;
    __m128i m = _mm_cvtsi32_si128(v);
    quint24 *end = dest + count;

    constexpr uchar x = 2, y = 1, z = 0;
    alignas(__m128i) static const uchar
    shuffleMask[16 + 1] = { x, y, z, x, y, z, x, y, z, x, y, z, x, y, z, x, y };

    __m128i mval1 = _mm_shuffle_epi8(m, _mm_load_si128(reinterpret_cast<const __m128i *>(shuffleMask)));
    __m128i mval2 = _mm_shuffle_epi8(m, _mm_loadu_si128(reinterpret_cast<const __m128i *>(shuffleMask + 1)));
    __m128i mval3 = _mm_alignr_epi8(mval2, mval1, 2);
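    /* mval1, mval2 and mval3 hold the 3-byte pattern at the three phases it
       cycles through across 48 bytes: mval1 starts at byte 0 of a triple,
       mval2 at byte 1 (the mask is 16 + 1 bytes long precisely so that
       loading from shuffleMask + 1 yields this shifted pattern), and mval3
       (mval1:mval2 shifted right by two bytes) at byte 2. Together the
       three stores below paint 16 whole pixels. */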

    for ( ; dest + 16 <= end; dest += 16) {
#ifdef __AVX__
        // Store using 32-byte AVX instruction
        __m256 mval12 = _mm256_castps128_ps256(_mm_castsi128_ps(mval1));
        mval12 = _mm256_insertf128_ps(mval12, _mm_castsi128_ps(mval2), 1);
        _mm256_storeu_ps(reinterpret_cast<float *>(dest), mval12);
#else
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 0, mval1);
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 1, mval2);
#endif
        _mm_storeu_si128(reinterpret_cast<__m128i *>(dest) + 2, mval3);
    }

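    /* With fewer than 3 pixels (9 bytes) in total, the 8-byte tail store at
       the end of this function could start before the beginning of the
       buffer, so store these pixel by pixel instead. */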
    if (count < 3) {
        if (count > 1)
            end[-2] = v;
        if (count)
            end[-1] = v;
        return;
    }

    // less than 16px/48B left
    uchar *ptr = reinterpret_cast<uchar *>(dest);
    uchar *ptr_end = reinterpret_cast<uchar *>(end);
    qptrdiff left = ptr_end - ptr;
    if (left >= 24) {
        // 8px/24B or more left
        _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr) + 0, mval1);
        _mm_storel_epi64(reinterpret_cast<__m128i *>(ptr) + 1, mval2);
        ptr += 24;
        left -= 24;
    }

    // less than 8px/24B left

    if (left >= 16) {
        // but more than 5px/15B left
        _mm_storeu_si128(reinterpret_cast<__m128i *>(ptr), mval1);
    } else if (left >= 8) {
        // but more than 2px/6B left
        _mm_storel_epi64(reinterpret_cast<__m128i *>(ptr), mval1);
    }

    if (left) {
        // 1 or 2px left
        // store 8 bytes ending with the right values (will overwrite a bit)
        _mm_storel_epi64(reinterpret_cast<__m128i *>(ptr_end - 8), mval2);
    }
}
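
/* Usage sketch (hypothetical buffer and color, for illustration only):
       quint24 scanline[100];
       qt_memfill24_ssse3(scanline, quint24(0xff8800), 100);
   fills 100 RGB888 pixels, 16 pixels (48 bytes) per loop iteration. */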

void QT_FASTCALL rbSwap_888_ssse3(uchar *dst, const uchar *src, int count)
{
    int i = 0;

    const static __m128i shuffleMask1 = _mm_setr_epi8(2, 1, 0, 5, 4, 3, 8, 7, 6, 11, 10, 9, 14, 13, 12, /*!!*/15);
    const static __m128i shuffleMask2 = _mm_setr_epi8(0, /*!!*/1, 4, 3, 2, 7, 6, 5, 10, 9, 8, 13, 12, 11, /*!!*/14, 15);
    const static __m128i shuffleMask3 = _mm_setr_epi8(/*!!*/0, 3, 2, 1, 6, 5, 4, 9, 8, 7, 12, 11, 10, 15, 14, 13);
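
    /* The bytes marked !! above are left in place by the shuffles: their
       swap partner lives in the neighbouring 16-byte vector, which a
       lane-local pshufb cannot reach. They are patched up by the two
       std::swap() calls after the stores below. */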

    for (; i + 15 < count; i += 16) {
        __m128i s1 = _mm_loadu_si128((const __m128i *)src);
        __m128i s2 = _mm_loadu_si128((const __m128i *)(src + 16));
        __m128i s3 = _mm_loadu_si128((const __m128i *)(src + 32));
        s1 = _mm_shuffle_epi8(s1, shuffleMask1);
        s2 = _mm_shuffle_epi8(s2, shuffleMask2);
        s3 = _mm_shuffle_epi8(s3, shuffleMask3);
        _mm_storeu_si128((__m128i *)dst, s1);
        _mm_storeu_si128((__m128i *)(dst + 16), s2);
        _mm_storeu_si128((__m128i *)(dst + 32), s3);

        // Now fix the last four misplaced values
        std::swap(dst[15], dst[17]);
        std::swap(dst[30], dst[32]);

        src += 48;
        dst += 48;
    }

    if (src != dst) {
        SIMD_EPILOGUE(i, count, 15) {
            dst[0] = src[2];
            dst[1] = src[1];
            dst[2] = src[0];
            dst += 3;
            src += 3;
        }
    } else {
        SIMD_EPILOGUE(i, count, 15) {
            std::swap(dst[0], dst[2]);
            dst += 3;
        }
    }
}

QT_END_NAMESPACE

#endif // QT_COMPILER_SUPPORTS_SSSE3
