/****************************************************************************
**
** Copyright (C) 2016 The Qt Company Ltd.
** Contact: https://www.qt.io/licensing/
**
** This file is part of the QtGui module of the Qt Toolkit.
**
** $QT_BEGIN_LICENSE:LGPL$
** Commercial License Usage
** Licensees holding valid commercial Qt licenses may use this file in
** accordance with the commercial license agreement provided with the
** Software or, alternatively, in accordance with the terms contained in
** a written agreement between you and The Qt Company. For licensing terms
** and conditions see https://www.qt.io/terms-conditions. For further
** information use the contact form at https://www.qt.io/contact-us.
**
** GNU Lesser General Public License Usage
** Alternatively, this file may be used under the terms of the GNU Lesser
** General Public License version 3 as published by the Free Software
** Foundation and appearing in the file LICENSE.LGPL3 included in the
** packaging of this file. Please review the following information to
** ensure the GNU Lesser General Public License version 3 requirements
** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
**
** GNU General Public License Usage
** Alternatively, this file may be used under the terms of the GNU
** General Public License version 2.0 or (at your option) the GNU General
** Public license version 3 or any later version approved by the KDE Free
** Qt Foundation. The licenses are as published by the Free Software
** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
** included in the packaging of this file. Please review the following
** information to ensure the GNU General Public License requirements will
** be met: https://www.gnu.org/licenses/gpl-2.0.html and
** https://www.gnu.org/licenses/gpl-3.0.html.
**
** $QT_END_LICENSE$
**
****************************************************************************/

#include <private/qdrawhelper_neon_p.h>
#include <private/qblendfunctions_p.h>
#include <private/qmath_p.h>

#ifdef __ARM_NEON__

#include <private/qpaintengine_raster_p.h>

QT_BEGIN_NAMESPACE

void qt_memfill32(quint32 *dest, quint32 value, qsizetype count)
{
    const int epilogueSize = count % 16;
#if defined(Q_CC_GHS) || defined(Q_CC_MSVC)
    // inline assembler free version:
    if (count >= 16) {
        quint32 *const neonEnd = dest + count - epilogueSize;
        const uint32x4_t valueVector1 = vdupq_n_u32(value);
        const uint32x4x4_t valueVector4 = { valueVector1, valueVector1, valueVector1, valueVector1 };
        do {
            vst4q_u32(dest, valueVector4);
            dest += 16;
        } while (dest != neonEnd);
    }
#elif !defined(Q_PROCESSOR_ARM_64)
    if (count >= 16) {
        quint32 *const neonEnd = dest + count - epilogueSize;
        register uint32x4_t valueVector1 asm ("q0") = vdupq_n_u32(value);
        register uint32x4_t valueVector2 asm ("q1") = valueVector1;
        while (dest != neonEnd) {
            asm volatile (
                "vst2.32 { d0, d1, d2, d3 }, [%[DST]]!\n\t"
                "vst2.32 { d0, d1, d2, d3 }, [%[DST]]!\n\t"
                : [DST]"+r"(dest)
                : [VALUE1]"w"(valueVector1), [VALUE2]"w"(valueVector2)
                : "memory"
            );
        }
    }
#else
    if (count >= 16) {
        quint32 *const neonEnd = dest + count - epilogueSize;
        register uint32x4_t valueVector1 asm ("v0") = vdupq_n_u32(value);
        register uint32x4_t valueVector2 asm ("v1") = valueVector1;
        while (dest != neonEnd) {
            asm volatile (
                "st2 { v0.4s, v1.4s }, [%[DST]], #32 \n\t"
                "st2 { v0.4s, v1.4s }, [%[DST]], #32 \n\t"
                : [DST]"+r"(dest)
                : [VALUE1]"w"(valueVector1), [VALUE2]"w"(valueVector2)
                : "memory"
            );
        }
    }
#endif

    // unrolled, fallthrough tail for the remaining count % 16 pixels
    switch (epilogueSize)
    {
    case 15:    *dest++ = value; Q_FALLTHROUGH();
    case 14:    *dest++ = value; Q_FALLTHROUGH();
    case 13:    *dest++ = value; Q_FALLTHROUGH();
    case 12:    *dest++ = value; Q_FALLTHROUGH();
    case 11:    *dest++ = value; Q_FALLTHROUGH();
    case 10:    *dest++ = value; Q_FALLTHROUGH();
    case 9:     *dest++ = value; Q_FALLTHROUGH();
    case 8:     *dest++ = value; Q_FALLTHROUGH();
    case 7:     *dest++ = value; Q_FALLTHROUGH();
    case 6:     *dest++ = value; Q_FALLTHROUGH();
    case 5:     *dest++ = value; Q_FALLTHROUGH();
    case 4:     *dest++ = value; Q_FALLTHROUGH();
    case 3:     *dest++ = value; Q_FALLTHROUGH();
    case 2:     *dest++ = value; Q_FALLTHROUGH();
    case 1:     *dest++ = value;
    }
}
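
/*
    Illustrative sketch (comment only, not compiled): qt_memfill32 above is a
    vectorized version of this scalar loop. The NEON paths store 16 pixels
    per iteration, and the switch then jumps into an unrolled,
    Duff's-device-style tail for the leftover count % 16 pixels:

        static void memfill32_scalar(quint32 *dest, quint32 value, qsizetype count)
        {
            while (count--)
                *dest++ = value;
        }
*/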

static inline uint16x8_t qvdiv_255_u16(uint16x8_t x, uint16x8_t half)
{
    // result = (x + (x >> 8) + 0x80) >> 8

    const uint16x8_t temp = vshrq_n_u16(x, 8); // x >> 8
    const uint16x8_t sum_part = vaddq_u16(x, half); // x + 0x80
    const uint16x8_t sum = vaddq_u16(temp, sum_part);

    return vshrq_n_u16(sum, 8);
}
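
/*
    Sketch of the scalar identity that qvdiv_255_u16 vectorizes
    (illustrative, not compiled). For the 16-bit products of two 8-bit
    values that the blending code feeds it, (x + (x >> 8) + 0x80) >> 8 is
    the classic cheap substitute for a rounded division by 255; stated
    here as the usual approximation, not verified for every 16-bit input:

        static inline uint div_255(uint x) // intended for x <= 255 * 255
        {
            return (x + (x >> 8) + 0x80) >> 8;
        }
*/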

static inline uint16x8_t qvbyte_mul_u16(uint16x8_t x, uint16x8_t alpha, uint16x8_t half)
{
    // t = qRound(x * alpha / 255.0)

    const uint16x8_t t = vmulq_u16(x, alpha); // t
    return qvdiv_255_u16(t, half);
}

static inline uint16x8_t qvinterpolate_pixel_255(uint16x8_t x, uint16x8_t a, uint16x8_t y, uint16x8_t b, uint16x8_t half)
{
    // t = x * a + y * b

    const uint16x8_t ta = vmulq_u16(x, a);
    const uint16x8_t tb = vmulq_u16(y, b);

    return qvdiv_255_u16(vaddq_u16(ta, tb), half);
}

static inline uint16x8_t qvsource_over_u16(uint16x8_t src16, uint16x8_t dst16, uint16x8_t half, uint16x8_t full)
{
    const uint16x4_t alpha16_high = vdup_lane_u16(vget_high_u16(src16), 3);
    const uint16x4_t alpha16_low = vdup_lane_u16(vget_low_u16(src16), 3);

    const uint16x8_t alpha16 = vsubq_u16(full, vcombine_u16(alpha16_low, alpha16_high));

    return vaddq_u16(src16, qvbyte_mul_u16(dst16, alpha16, half));
}
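
/*
    qvsource_over_u16 applies the premultiplied Porter-Duff "source over"
    operator to eight 16-bit channel values at once. A scalar sketch of the
    same math, one channel at a time (illustrative only; div_255 is the
    hypothetical helper sketched above):

        // result = src + dst * (255 - srcAlpha) / 255, all premultiplied
        static inline uint source_over_channel(uint src, uint srcAlpha, uint dst)
        {
            return src + div_255(dst * (255 - srcAlpha));
        }

    The two vdup_lane_u16 calls broadcast each pixel's alpha lane across its
    four channels before the subtraction from 255 ("full").
*/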

#if defined(ENABLE_PIXMAN_DRAWHELPERS)
extern "C" void
pixman_composite_over_8888_0565_asm_neon (int32_t   w,
                                          int32_t   h,
                                          uint16_t *dst,
                                          int32_t   dst_stride,
                                          uint32_t *src,
                                          int32_t   src_stride);

extern "C" void
pixman_composite_over_8888_8888_asm_neon (int32_t   w,
                                          int32_t   h,
                                          uint32_t *dst,
                                          int32_t   dst_stride,
                                          uint32_t *src,
                                          int32_t   src_stride);

extern "C" void
pixman_composite_src_0565_8888_asm_neon (int32_t   w,
                                         int32_t   h,
                                         uint32_t *dst,
                                         int32_t   dst_stride,
                                         uint16_t *src,
                                         int32_t   src_stride);

extern "C" void
pixman_composite_over_n_8_0565_asm_neon (int32_t   w,
                                         int32_t   h,
                                         uint16_t *dst,
                                         int32_t   dst_stride,
                                         uint32_t  src,
                                         int32_t   unused,
                                         uint8_t  *mask,
                                         int32_t   mask_stride);

extern "C" void
pixman_composite_scanline_over_asm_neon (int32_t         w,
                                         const uint32_t *dst,
                                         const uint32_t *src);

extern "C" void
pixman_composite_src_0565_0565_asm_neon (int32_t   w,
                                         int32_t   h,
                                         uint16_t *dst,
                                         int32_t   dst_stride,
                                         uint16_t *src,
                                         int32_t   src_stride);
// qblendfunctions.cpp
void qt_blend_argb32_on_rgb16_const_alpha(uchar *destPixels, int dbpl,
                                          const uchar *srcPixels, int sbpl,
                                          int w, int h,
                                          int const_alpha);

void qt_blend_rgb16_on_argb32_neon(uchar *destPixels, int dbpl,
                                   const uchar *srcPixels, int sbpl,
                                   int w, int h,
                                   int const_alpha)
{
    dbpl /= 4;
    sbpl /= 2;

    quint32 *dst = (quint32 *) destPixels;
    quint16 *src = (quint16 *) srcPixels;

    if (const_alpha != 256) {
        quint8 a = (255 * const_alpha) >> 8;
        quint8 ia = 255 - a;

        while (h--) {
            for (int x=0; x<w; ++x)
                dst[x] = INTERPOLATE_PIXEL_255(qConvertRgb16To32(src[x]), a, dst[x], ia);
            dst += dbpl;
            src += sbpl;
        }
        return;
    }

    pixman_composite_src_0565_8888_asm_neon(w, h, dst, dbpl, src, sbpl);
}

// qblendfunctions.cpp
void qt_blend_rgb16_on_rgb16(uchar *dst, int dbpl,
                             const uchar *src, int sbpl,
                             int w, int h,
                             int const_alpha);


template <int N>
static inline void scanLineBlit16(quint16 *dst, quint16 *src, int dstride)
{
    if (N >= 2) {
        ((quint32 *)dst)[0] = ((quint32 *)src)[0];
        __builtin_prefetch(dst + dstride, 1, 0);
    }
    for (int i = 1; i < N/2; ++i)
        ((quint32 *)dst)[i] = ((quint32 *)src)[i];
    if (N & 1)
        dst[N-1] = src[N-1];
}

template <int Width>
static inline void blockBlit16(quint16 *dst, quint16 *src, int dstride, int sstride, int h)
{
    union {
        quintptr address;
        quint16 *pointer;
    } u;

    u.pointer = dst;

    if (u.address & 2) {
        while (h--) {
            // align dst
            dst[0] = src[0];
            if (Width > 1)
                scanLineBlit16<Width-1>(dst + 1, src + 1, dstride);
            dst += dstride;
            src += sstride;
        }
    } else {
        while (h--) {
            scanLineBlit16<Width>(dst, src, dstride);

            dst += dstride;
            src += sstride;
        }
    }
}
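
/*
    Usage sketch (illustrative): blockBlit16 is meant to be instantiated for
    small compile-time widths and selected on the runtime width, as the
    switch in qt_blend_rgb16_on_rgb16_neon below does:

        switch (w) {
        case 1: blockBlit16<1>(dst, src, dstride, sstride, h); return;
        // ... one case per specialized width, up to 15
        }

    The union exists to inspect the destination pointer's low bits: if dst is
    only 16-bit aligned, one leading pixel is copied so that the quint32
    stores in scanLineBlit16 land on 32-bit boundaries.
*/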

void qt_blend_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
                                  const uchar *srcPixels, int sbpl,
                                  int w, int h,
                                  int const_alpha)
{
    // testing shows that the default memcpy is faster for widths 150 and up
    if (const_alpha != 256 || w >= 150) {
        qt_blend_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
        return;
    }

    int dstride = dbpl / 2;
    int sstride = sbpl / 2;

    quint16 *dst = (quint16 *) destPixels;
    quint16 *src = (quint16 *) srcPixels;

    switch (w) {
#define BLOCKBLIT(n) case n: blockBlit16<n>(dst, src, dstride, sstride, h); return;
    BLOCKBLIT(1);
    BLOCKBLIT(2);
    BLOCKBLIT(3);
    BLOCKBLIT(4);
    BLOCKBLIT(5);
    BLOCKBLIT(6);
    BLOCKBLIT(7);
    BLOCKBLIT(8);
    BLOCKBLIT(9);
    BLOCKBLIT(10);
    BLOCKBLIT(11);
    BLOCKBLIT(12);
    BLOCKBLIT(13);
    BLOCKBLIT(14);
    BLOCKBLIT(15);
#undef BLOCKBLIT
    default:
        break;
    }

    pixman_composite_src_0565_0565_asm_neon (w, h, dst, dstride, src, sstride);
}

extern "C" void blend_8_pixels_argb32_on_rgb16_neon(quint16 *dst, const quint32 *src, int const_alpha);

void qt_blend_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
                                   const uchar *srcPixels, int sbpl,
                                   int w, int h,
                                   int const_alpha)
{
    quint16 *dst = (quint16 *) destPixels;
    quint32 *src = (quint32 *) srcPixels;

    if (const_alpha != 256) {
        for (int y=0; y<h; ++y) {
            int i = 0;
            for (; i < w-7; i += 8)
                blend_8_pixels_argb32_on_rgb16_neon(&dst[i], &src[i], const_alpha);

            if (i < w) {
                int tail = w - i;

                quint16 dstBuffer[8];
                quint32 srcBuffer[8];

                for (int j = 0; j < tail; ++j) {
                    dstBuffer[j] = dst[i + j];
                    srcBuffer[j] = src[i + j];
                }

                blend_8_pixels_argb32_on_rgb16_neon(dstBuffer, srcBuffer, const_alpha);

                for (int j = 0; j < tail; ++j)
                    dst[i + j] = dstBuffer[j];
            }

            dst = (quint16 *)(((uchar *) dst) + dbpl);
            src = (quint32 *)(((uchar *) src) + sbpl);
        }
        return;
    }

    pixman_composite_over_8888_0565_asm_neon(w, h, dst, dbpl / 2, src, sbpl / 4);
}
#endif

void qt_blend_argb32_on_argb32_scanline_neon(uint *dest, const uint *src, int length, uint const_alpha)
{
    if (const_alpha == 255) {
#if defined(ENABLE_PIXMAN_DRAWHELPERS)
        pixman_composite_scanline_over_asm_neon(length, dest, src);
#else
        qt_blend_argb32_on_argb32_neon((uchar *)dest, 4 * length, (uchar *)src, 4 * length, length, 1, 256);
#endif
    } else {
        qt_blend_argb32_on_argb32_neon((uchar *)dest, 4 * length, (uchar *)src, 4 * length, length, 1, (const_alpha * 256) / 255);
    }
}

void qt_blend_argb32_on_argb32_neon(uchar *destPixels, int dbpl,
                                    const uchar *srcPixels, int sbpl,
                                    int w, int h,
                                    int const_alpha)
{
    const uint *src = (const uint *) srcPixels;
    uint *dst = (uint *) destPixels;
    uint16x8_t half = vdupq_n_u16(0x80);
    uint16x8_t full = vdupq_n_u16(0xff);
    if (const_alpha == 256) {
#if defined(ENABLE_PIXMAN_DRAWHELPERS)
        pixman_composite_over_8888_8888_asm_neon(w, h, (uint32_t *)destPixels, dbpl / 4, (uint32_t *)srcPixels, sbpl / 4);
#else
        for (int y=0; y<h; ++y) {
            int x = 0;
            for (; x < w-3; x += 4) {
                if (src[x] | src[x+1] | src[x+2] | src[x+3]) {
                    uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
                    uint32x4_t dst32 = vld1q_u32((uint32_t *)&dst[x]);

                    const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
                    const uint8x16_t dst8 = vreinterpretq_u8_u32(dst32);

                    const uint8x8_t src8_low = vget_low_u8(src8);
                    const uint8x8_t dst8_low = vget_low_u8(dst8);

                    const uint8x8_t src8_high = vget_high_u8(src8);
                    const uint8x8_t dst8_high = vget_high_u8(dst8);

                    const uint16x8_t src16_low = vmovl_u8(src8_low);
                    const uint16x8_t dst16_low = vmovl_u8(dst8_low);

                    const uint16x8_t src16_high = vmovl_u8(src8_high);
                    const uint16x8_t dst16_high = vmovl_u8(dst8_high);

                    const uint16x8_t result16_low = qvsource_over_u16(src16_low, dst16_low, half, full);
                    const uint16x8_t result16_high = qvsource_over_u16(src16_high, dst16_high, half, full);

                    const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
                    const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));

                    vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
                }
            }
            for (; x<w; ++x) {
                uint s = src[x];
                if (s >= 0xff000000)
                    dst[x] = s;
                else if (s != 0)
                    dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
            }
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
#endif
    } else if (const_alpha != 0) {
        const_alpha = (const_alpha * 255) >> 8;
        uint16x8_t const_alpha16 = vdupq_n_u16(const_alpha);
        for (int y = 0; y < h; ++y) {
            int x = 0;
            for (; x < w-3; x += 4) {
                if (src[x] | src[x+1] | src[x+2] | src[x+3]) {
                    uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
                    uint32x4_t dst32 = vld1q_u32((uint32_t *)&dst[x]);

                    const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
                    const uint8x16_t dst8 = vreinterpretq_u8_u32(dst32);

                    const uint8x8_t src8_low = vget_low_u8(src8);
                    const uint8x8_t dst8_low = vget_low_u8(dst8);

                    const uint8x8_t src8_high = vget_high_u8(src8);
                    const uint8x8_t dst8_high = vget_high_u8(dst8);

                    const uint16x8_t src16_low = vmovl_u8(src8_low);
                    const uint16x8_t dst16_low = vmovl_u8(dst8_low);

                    const uint16x8_t src16_high = vmovl_u8(src8_high);
                    const uint16x8_t dst16_high = vmovl_u8(dst8_high);

                    const uint16x8_t srcalpha16_low = qvbyte_mul_u16(src16_low, const_alpha16, half);
                    const uint16x8_t srcalpha16_high = qvbyte_mul_u16(src16_high, const_alpha16, half);

                    const uint16x8_t result16_low = qvsource_over_u16(srcalpha16_low, dst16_low, half, full);
                    const uint16x8_t result16_high = qvsource_over_u16(srcalpha16_high, dst16_high, half, full);

                    const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
                    const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));

                    vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
                }
            }
            for (; x<w; ++x) {
                uint s = src[x];
                if (s != 0) {
                    s = BYTE_MUL(s, const_alpha);
                    dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s));
                }
            }
            dst = (quint32 *)(((uchar *) dst) + dbpl);
            src = (const quint32 *)(((const uchar *) src) + sbpl);
        }
    }
}
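
/*
    Scalar sketch of what the const_alpha branch above computes per pixel
    (illustrative; BYTE_MUL and qAlpha are the helpers the real epilogue
    loops use):

        uint s = BYTE_MUL(src[x], const_alpha); // dim source by constant alpha
        dst[x] = s + BYTE_MUL(dst[x], qAlpha(~s)); // then ordinary source-over

    Note that const_alpha arrives in the 0..256 range and is first rescaled
    to 0..255 with (const_alpha * 255) >> 8.
*/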

// qblendfunctions.cpp
void qt_blend_rgb32_on_rgb32(uchar *destPixels, int dbpl,
                             const uchar *srcPixels, int sbpl,
                             int w, int h,
                             int const_alpha);

void qt_blend_rgb32_on_rgb32_neon(uchar *destPixels, int dbpl,
                                  const uchar *srcPixels, int sbpl,
                                  int w, int h,
                                  int const_alpha)
{
    if (const_alpha != 256) {
        if (const_alpha != 0) {
            const uint *src = (const uint *) srcPixels;
            uint *dst = (uint *) destPixels;
            uint16x8_t half = vdupq_n_u16(0x80);
            const_alpha = (const_alpha * 255) >> 8;
            int one_minus_const_alpha = 255 - const_alpha;
            uint16x8_t const_alpha16 = vdupq_n_u16(const_alpha);
            uint16x8_t one_minus_const_alpha16 = vdupq_n_u16(255 - const_alpha);
            for (int y = 0; y < h; ++y) {
                int x = 0;
                for (; x < w-3; x += 4) {
                    uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
                    uint32x4_t dst32 = vld1q_u32((uint32_t *)&dst[x]);

                    const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
                    const uint8x16_t dst8 = vreinterpretq_u8_u32(dst32);

                    const uint8x8_t src8_low = vget_low_u8(src8);
                    const uint8x8_t dst8_low = vget_low_u8(dst8);

                    const uint8x8_t src8_high = vget_high_u8(src8);
                    const uint8x8_t dst8_high = vget_high_u8(dst8);

                    const uint16x8_t src16_low = vmovl_u8(src8_low);
                    const uint16x8_t dst16_low = vmovl_u8(dst8_low);

                    const uint16x8_t src16_high = vmovl_u8(src8_high);
                    const uint16x8_t dst16_high = vmovl_u8(dst8_high);

                    const uint16x8_t result16_low = qvinterpolate_pixel_255(src16_low, const_alpha16, dst16_low, one_minus_const_alpha16, half);
                    const uint16x8_t result16_high = qvinterpolate_pixel_255(src16_high, const_alpha16, dst16_high, one_minus_const_alpha16, half);

                    const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
                    const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));

                    vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
                }
                for (; x<w; ++x) {
                    dst[x] = INTERPOLATE_PIXEL_255(src[x], const_alpha, dst[x], one_minus_const_alpha);
                }
                dst = (quint32 *)(((uchar *) dst) + dbpl);
                src = (const quint32 *)(((const uchar *) src) + sbpl);
            }
        }
    } else {
        qt_blend_rgb32_on_rgb32(destPixels, dbpl, srcPixels, sbpl, w, h, const_alpha);
    }
}

#if defined(ENABLE_PIXMAN_DRAWHELPERS)
extern void qt_alphamapblit_quint16(QRasterBuffer *rasterBuffer,
                                    int x, int y, const QRgba64 &color,
                                    const uchar *map,
                                    int mapWidth, int mapHeight, int mapStride,
                                    const QClipData *clip, bool useGammaCorrection);

void qt_alphamapblit_quint16_neon(QRasterBuffer *rasterBuffer,
                                  int x, int y, const QRgba64 &color,
                                  const uchar *bitmap,
                                  int mapWidth, int mapHeight, int mapStride,
                                  const QClipData *clip, bool useGammaCorrection)
{
    if (clip || useGammaCorrection) {
        qt_alphamapblit_quint16(rasterBuffer, x, y, color, bitmap, mapWidth, mapHeight, mapStride, clip, useGammaCorrection);
        return;
    }

    quint16 *dest = reinterpret_cast<quint16*>(rasterBuffer->scanLine(y)) + x;
    const int destStride = rasterBuffer->bytesPerLine() / sizeof(quint16);

    uchar *mask = const_cast<uchar *>(bitmap);
    const uint c = color.toArgb32();

    pixman_composite_over_n_8_0565_asm_neon(mapWidth, mapHeight, dest, destStride, c, 0, mask, mapStride);
}

extern "C" void blend_8_pixels_rgb16_on_rgb16_neon(quint16 *dst, const quint16 *src, int const_alpha);

template <typename SRC, typename BlendFunc>
struct Blend_on_RGB16_SourceAndConstAlpha_Neon {
    Blend_on_RGB16_SourceAndConstAlpha_Neon(BlendFunc blender, int const_alpha)
        : m_index(0)
        , m_blender(blender)
        , m_const_alpha(const_alpha)
    {
    }

    inline void write(quint16 *dst, quint32 src)
    {
        srcBuffer[m_index++] = src;

        if (m_index == 8) {
            m_blender(dst - 7, srcBuffer, m_const_alpha);
            m_index = 0;
        }
    }

    inline void flush(quint16 *dst)
    {
        if (m_index > 0) {
            quint16 dstBuffer[8];
            for (int i = 0; i < m_index; ++i)
                dstBuffer[i] = dst[i - m_index];

            m_blender(dstBuffer, srcBuffer, m_const_alpha);

            for (int i = 0; i < m_index; ++i)
                dst[i - m_index] = dstBuffer[i];

            m_index = 0;
        }
    }

    SRC srcBuffer[8];

    int m_index;
    BlendFunc m_blender;
    int m_const_alpha;
};

template <typename SRC, typename BlendFunc>
Blend_on_RGB16_SourceAndConstAlpha_Neon<SRC, BlendFunc>
Blend_on_RGB16_SourceAndConstAlpha_Neon_create(BlendFunc blender, int const_alpha)
{
    return Blend_on_RGB16_SourceAndConstAlpha_Neon<SRC, BlendFunc>(blender, const_alpha);
}

void qt_scale_image_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
                                         const uchar *srcPixels, int sbpl, int srch,
                                         const QRectF &targetRect,
                                         const QRectF &sourceRect,
                                         const QRect &clip,
                                         int const_alpha)
{
    if (const_alpha == 0)
        return;

    qt_scale_image_16bit<quint32>(destPixels, dbpl, srcPixels, sbpl, srch, targetRect, sourceRect, clip,
                                  Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint32>(blend_8_pixels_argb32_on_rgb16_neon, const_alpha));
}

void qt_scale_image_rgb16_on_rgb16(uchar *destPixels, int dbpl,
                                   const uchar *srcPixels, int sbpl, int srch,
                                   const QRectF &targetRect,
                                   const QRectF &sourceRect,
                                   const QRect &clip,
                                   int const_alpha);

void qt_scale_image_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
                                        const uchar *srcPixels, int sbpl, int srch,
                                        const QRectF &targetRect,
                                        const QRectF &sourceRect,
                                        const QRect &clip,
                                        int const_alpha)
{
    if (const_alpha == 0)
        return;

    if (const_alpha == 256) {
        qt_scale_image_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, srch, targetRect, sourceRect, clip, const_alpha);
        return;
    }

    qt_scale_image_16bit<quint16>(destPixels, dbpl, srcPixels, sbpl, srch, targetRect, sourceRect, clip,
                                  Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint16>(blend_8_pixels_rgb16_on_rgb16_neon, const_alpha));
}

extern void qt_transform_image_rgb16_on_rgb16(uchar *destPixels, int dbpl,
                                              const uchar *srcPixels, int sbpl,
                                              const QRectF &targetRect,
                                              const QRectF &sourceRect,
                                              const QRect &clip,
                                              const QTransform &targetRectTransform,
                                              int const_alpha);

void qt_transform_image_rgb16_on_rgb16_neon(uchar *destPixels, int dbpl,
                                            const uchar *srcPixels, int sbpl,
                                            const QRectF &targetRect,
                                            const QRectF &sourceRect,
                                            const QRect &clip,
                                            const QTransform &targetRectTransform,
                                            int const_alpha)
{
    if (const_alpha == 0)
        return;

    if (const_alpha == 256) {
        qt_transform_image_rgb16_on_rgb16(destPixels, dbpl, srcPixels, sbpl, targetRect, sourceRect, clip, targetRectTransform, const_alpha);
        return;
    }

    qt_transform_image(reinterpret_cast<quint16 *>(destPixels), dbpl,
                       reinterpret_cast<const quint16 *>(srcPixels), sbpl, targetRect, sourceRect, clip, targetRectTransform,
                       Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint16>(blend_8_pixels_rgb16_on_rgb16_neon, const_alpha));
}

void qt_transform_image_argb32_on_rgb16_neon(uchar *destPixels, int dbpl,
                                             const uchar *srcPixels, int sbpl,
                                             const QRectF &targetRect,
                                             const QRectF &sourceRect,
                                             const QRect &clip,
                                             const QTransform &targetRectTransform,
                                             int const_alpha)
{
    if (const_alpha == 0)
        return;

    qt_transform_image(reinterpret_cast<quint16 *>(destPixels), dbpl,
                       reinterpret_cast<const quint32 *>(srcPixels), sbpl, targetRect, sourceRect, clip, targetRectTransform,
                       Blend_on_RGB16_SourceAndConstAlpha_Neon_create<quint32>(blend_8_pixels_argb32_on_rgb16_neon, const_alpha));
}

static inline void convert_8_pixels_rgb16_to_argb32(quint32 *dst, const quint16 *src)
{
    asm volatile (
        "vld1.16     { d0, d1 }, [%[SRC]]\n\t"

        /* convert 8 r5g6b5 pixel data from {d0, d1} to planar 8-bit format
           and put data into d4 - red, d3 - green, d2 - blue */
        "vshrn.u16   d4, q0, #8\n\t"
        "vshrn.u16   d3, q0, #3\n\t"
        "vsli.u16    q0, q0, #5\n\t"
        "vsri.u8     d4, d4, #5\n\t"
        "vsri.u8     d3, d3, #6\n\t"
        "vshrn.u16   d2, q0, #2\n\t"

        /* fill d5 - alpha with 0xff */
        "mov         r2, #255\n\t"
        "vdup.8      d5, r2\n\t"

        "vst4.8      { d2, d3, d4, d5 }, [%[DST]]"
        : : [DST]"r"(dst), [SRC]"r"(src)
        : "memory", "r2", "d0", "d1", "d2", "d3", "d4", "d5"
    );
}

uint * QT_FASTCALL qt_destFetchRGB16_neon(uint *buffer, QRasterBuffer *rasterBuffer, int x, int y, int length)
{
    const ushort *data = (const ushort *)rasterBuffer->scanLine(y) + x;

    int i = 0;
    for (; i < length - 7; i += 8)
        convert_8_pixels_rgb16_to_argb32(&buffer[i], &data[i]);

    if (i < length) {
        quint16 srcBuffer[8];
        quint32 dstBuffer[8];

        int tail = length - i;
        for (int j = 0; j < tail; ++j)
            srcBuffer[j] = data[i + j];

        convert_8_pixels_rgb16_to_argb32(dstBuffer, srcBuffer);

        for (int j = 0; j < tail; ++j)
            buffer[i + j] = dstBuffer[j];
    }

    return buffer;
}

static inline void convert_8_pixels_argb32_to_rgb16(quint16 *dst, const quint32 *src)
{
    asm volatile (
        "vld4.8      { d0, d1, d2, d3 }, [%[SRC]]\n\t"

        /* convert to r5g6b5 and store it into {d28, d29} */
        "vshll.u8    q14, d2, #8\n\t"
        "vshll.u8    q8, d1, #8\n\t"
        "vshll.u8    q9, d0, #8\n\t"
        "vsri.u16    q14, q8, #5\n\t"
        "vsri.u16    q14, q9, #11\n\t"

        "vst1.16     { d28, d29 }, [%[DST]]"
        : : [DST]"r"(dst), [SRC]"r"(src)
        : "memory", "d0", "d1", "d2", "d3", "d16", "d17", "d18", "d19", "d28", "d29"
    );
}

void QT_FASTCALL qt_destStoreRGB16_neon(QRasterBuffer *rasterBuffer, int x, int y, const uint *buffer, int length)
{
    quint16 *data = (quint16*)rasterBuffer->scanLine(y) + x;

    int i = 0;
    for (; i < length - 7; i += 8)
        convert_8_pixels_argb32_to_rgb16(&data[i], &buffer[i]);

    if (i < length) {
        quint32 srcBuffer[8];
        quint16 dstBuffer[8];

        int tail = length - i;
        for (int j = 0; j < tail; ++j)
            srcBuffer[j] = buffer[i + j];

        convert_8_pixels_argb32_to_rgb16(dstBuffer, srcBuffer);

        for (int j = 0; j < tail; ++j)
            data[i + j] = dstBuffer[j];
    }
}
#endif

void QT_FASTCALL comp_func_solid_SourceOver_neon(uint *destPixels, int length, uint color, uint const_alpha)
{
    if ((const_alpha & qAlpha(color)) == 255) {
        qt_memfill32(destPixels, color, length);
    } else {
        if (const_alpha != 255)
            color = BYTE_MUL(color, const_alpha);

        const quint32 minusAlphaOfColor = qAlpha(~color);
        int x = 0;

        uint32_t *dst = (uint32_t *) destPixels;
        const uint32x4_t colorVector = vdupq_n_u32(color);
        uint16x8_t half = vdupq_n_u16(0x80);
        const uint16x8_t minusAlphaOfColorVector = vdupq_n_u16(minusAlphaOfColor);

        for (; x < length-3; x += 4) {
            uint32x4_t dstVector = vld1q_u32(&dst[x]);

            const uint8x16_t dst8 = vreinterpretq_u8_u32(dstVector);

            const uint8x8_t dst8_low = vget_low_u8(dst8);
            const uint8x8_t dst8_high = vget_high_u8(dst8);

            const uint16x8_t dst16_low = vmovl_u8(dst8_low);
            const uint16x8_t dst16_high = vmovl_u8(dst8_high);

            const uint16x8_t result16_low = qvbyte_mul_u16(dst16_low, minusAlphaOfColorVector, half);
            const uint16x8_t result16_high = qvbyte_mul_u16(dst16_high, minusAlphaOfColorVector, half);

            const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result16_low));
            const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result16_high));

            uint32x4_t blendedPixels = vcombine_u32(result32_low, result32_high);
            uint32x4_t colorPlusBlendedPixels = vaddq_u32(colorVector, blendedPixels);
            vst1q_u32(&dst[x], colorPlusBlendedPixels);
        }

        SIMD_EPILOGUE(x, length, 3)
            destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);
    }
}
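
/*
    Scalar sketch of comp_func_solid_SourceOver_neon (illustrative): with a
    premultiplied solid color, source-over degenerates to one multiply-add
    per pixel, which the vector loop performs four pixels at a time:

        const quint32 minusAlphaOfColor = qAlpha(~color);
        for (int x = 0; x < length; ++x)
            destPixels[x] = color + BYTE_MUL(destPixels[x], minusAlphaOfColor);

    SIMD_EPILOGUE then covers the up-to-three pixels the vector loop leaves.
*/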

void QT_FASTCALL comp_func_Plus_neon(uint *dst, const uint *src, int length, uint const_alpha)
{
    if (const_alpha == 255) {
        uint *const end = dst + length;
        uint *const neonEnd = end - 3;

        while (dst < neonEnd) {
            uint8x16_t vs = vld1q_u8((const uint8_t*)src);
            const uint8x16_t vd = vld1q_u8((uint8_t*)dst);
            vs = vqaddq_u8(vs, vd);
            vst1q_u8((uint8_t*)dst, vs);
            src += 4;
            dst += 4;
        }

        while (dst != end) {
            *dst = comp_func_Plus_one_pixel(*dst, *src);
            ++dst;
            ++src;
        }
    } else {
        int x = 0;
        const int one_minus_const_alpha = 255 - const_alpha;
        const uint16x8_t constAlphaVector = vdupq_n_u16(const_alpha);
        const uint16x8_t oneMinusconstAlphaVector = vdupq_n_u16(one_minus_const_alpha);

        const uint16x8_t half = vdupq_n_u16(0x80);
        for (; x < length - 3; x += 4) {
            const uint32x4_t src32 = vld1q_u32((uint32_t *)&src[x]);
            const uint8x16_t src8 = vreinterpretq_u8_u32(src32);
            uint8x16_t dst8 = vld1q_u8((uint8_t *)&dst[x]);
            uint8x16_t result = vqaddq_u8(dst8, src8);

            uint16x8_t result_low = vmovl_u8(vget_low_u8(result));
            uint16x8_t result_high = vmovl_u8(vget_high_u8(result));

            uint16x8_t dst_low = vmovl_u8(vget_low_u8(dst8));
            uint16x8_t dst_high = vmovl_u8(vget_high_u8(dst8));

            result_low = qvinterpolate_pixel_255(result_low, constAlphaVector, dst_low, oneMinusconstAlphaVector, half);
            result_high = qvinterpolate_pixel_255(result_high, constAlphaVector, dst_high, oneMinusconstAlphaVector, half);

            const uint32x2_t result32_low = vreinterpret_u32_u8(vmovn_u16(result_low));
            const uint32x2_t result32_high = vreinterpret_u32_u8(vmovn_u16(result_high));
            vst1q_u32((uint32_t *)&dst[x], vcombine_u32(result32_low, result32_high));
        }

        SIMD_EPILOGUE(x, length, 3)
            dst[x] = comp_func_Plus_one_pixel_const_alpha(dst[x], src[x], const_alpha, one_minus_const_alpha);
    }
}

#if defined(ENABLE_PIXMAN_DRAWHELPERS)
static const int tileSize = 32;

extern "C" void qt_rotate90_16_neon(quint16 *dst, const quint16 *src, int sstride, int dstride, int count);

void qt_memrotate90_16_neon(const uchar *srcPixels, int w, int h, int sstride, uchar *destPixels, int dstride)
{
    const ushort *src = (const ushort *)srcPixels;
    ushort *dest = (ushort *)destPixels;

    sstride /= sizeof(ushort);
    dstride /= sizeof(ushort);

    const int pack = sizeof(quint32) / sizeof(ushort);
    const int unaligned =
        qMin(uint((quintptr(dest) & (sizeof(quint32)-1)) / sizeof(ushort)), uint(h));
    const int restX = w % tileSize;
    const int restY = (h - unaligned) % tileSize;
    const int unoptimizedY = restY % pack;
    const int numTilesX = w / tileSize + (restX > 0);
    const int numTilesY = (h - unaligned) / tileSize + (restY >= pack);

    for (int tx = 0; tx < numTilesX; ++tx) {
        const int startx = w - tx * tileSize - 1;
        const int stopx = qMax(startx - tileSize, 0);

        if (unaligned) {
            for (int x = startx; x >= stopx; --x) {
                ushort *d = dest + (w - x - 1) * dstride;
                for (int y = 0; y < unaligned; ++y) {
                    *d++ = src[y * sstride + x];
                }
            }
        }

        for (int ty = 0; ty < numTilesY; ++ty) {
            const int starty = ty * tileSize + unaligned;
            const int stopy = qMin(starty + tileSize, h - unoptimizedY);

            int x = startx;
            // qt_rotate90_16_neon writes to eight rows, four pixels at a time
            for (; x >= stopx + 7; x -= 8) {
                ushort *d = dest + (w - x - 1) * dstride + starty;
                const ushort *s = &src[starty * sstride + x - 7];
                qt_rotate90_16_neon(d, s, sstride * 2, dstride * 2, stopy - starty);
            }

            for (; x >= stopx; --x) {
                quint32 *d = reinterpret_cast<quint32*>(dest + (w - x - 1) * dstride + starty);
                for (int y = starty; y < stopy; y += pack) {
                    quint32 c = src[y * sstride + x];
                    for (int i = 1; i < pack; ++i) {
                        const int shift = (sizeof(int) * 8 / pack * i);
                        const ushort color = src[(y + i) * sstride + x];
                        c |= color << shift;
                    }
                    *d++ = c;
                }
            }
        }

        if (unoptimizedY) {
            const int starty = h - unoptimizedY;
            for (int x = startx; x >= stopx; --x) {
                ushort *d = dest + (w - x - 1) * dstride + starty;
                for (int y = starty; y < h; ++y) {
                    *d++ = src[y * sstride + x];
                }
            }
        }
    }
}

extern "C" void qt_rotate270_16_neon(quint16 *dst, const quint16 *src, int sstride, int dstride, int count);

void qt_memrotate270_16_neon(const uchar *srcPixels, int w, int h,
                             int sstride,
                             uchar *destPixels, int dstride)
{
    const ushort *src = (const ushort *)srcPixels;
    ushort *dest = (ushort *)destPixels;

    sstride /= sizeof(ushort);
    dstride /= sizeof(ushort);

    const int pack = sizeof(quint32) / sizeof(ushort);
    const int unaligned =
        qMin(uint((quintptr(dest) & (sizeof(quint32)-1)) / sizeof(ushort)), uint(h));
    const int restX = w % tileSize;
    const int restY = (h - unaligned) % tileSize;
    const int unoptimizedY = restY % pack;
    const int numTilesX = w / tileSize + (restX > 0);
    const int numTilesY = (h - unaligned) / tileSize + (restY >= pack);

    for (int tx = 0; tx < numTilesX; ++tx) {
        const int startx = tx * tileSize;
        const int stopx = qMin(startx + tileSize, w);

        if (unaligned) {
            for (int x = startx; x < stopx; ++x) {
                ushort *d = dest + x * dstride;
                for (int y = h - 1; y >= h - unaligned; --y) {
                    *d++ = src[y * sstride + x];
                }
            }
        }

        for (int ty = 0; ty < numTilesY; ++ty) {
            const int starty = h - 1 - unaligned - ty * tileSize;
            const int stopy = qMax(starty - tileSize, unoptimizedY);

            int x = startx;
            // qt_rotate90_16_neon writes to eight rows, four pixels at a time
            for (; x < stopx - 7; x += 8) {
                ushort *d = dest + x * dstride + h - 1 - starty;
                const ushort *s = &src[starty * sstride + x];
                qt_rotate90_16_neon(d + 7 * dstride, s, -sstride * 2, -dstride * 2, starty - stopy);
            }

            for (; x < stopx; ++x) {
                quint32 *d = reinterpret_cast<quint32*>(dest + x * dstride
                                                        + h - 1 - starty);
                for (int y = starty; y > stopy; y -= pack) {
                    quint32 c = src[y * sstride + x];
                    for (int i = 1; i < pack; ++i) {
                        const int shift = (sizeof(int) * 8 / pack * i);
                        const ushort color = src[(y - i) * sstride + x];
                        c |= color << shift;
                    }
                    *d++ = c;
                }
            }
        }
        if (unoptimizedY) {
            const int starty = unoptimizedY - 1;
            for (int x = startx; x < stopx; ++x) {
                ushort *d = dest + x * dstride + h - 1 - starty;
                for (int y = starty; y >= 0; --y) {
                    *d++ = src[y * sstride + x];
                }
            }
        }
    }
}
#endif

class QSimdNeon
{
public:
    typedef int32x4_t Int32x4;
    typedef float32x4_t Float32x4;

    union Vect_buffer_i { Int32x4 v; int i[4]; };
    union Vect_buffer_f { Float32x4 v; float f[4]; };

    static inline Float32x4 v_dup(double x) { return vdupq_n_f32(float(x)); }
    static inline Float32x4 v_dup(float x) { return vdupq_n_f32(x); }
    static inline Int32x4 v_dup(int x) { return vdupq_n_s32(x); }
    static inline Int32x4 v_dup(uint x) { return vdupq_n_s32(x); }

    static inline Float32x4 v_add(Float32x4 a, Float32x4 b) { return vaddq_f32(a, b); }
    static inline Int32x4 v_add(Int32x4 a, Int32x4 b) { return vaddq_s32(a, b); }

    static inline Float32x4 v_max(Float32x4 a, Float32x4 b) { return vmaxq_f32(a, b); }
    static inline Float32x4 v_min(Float32x4 a, Float32x4 b) { return vminq_f32(a, b); }
    static inline Int32x4 v_min_16(Int32x4 a, Int32x4 b) { return vminq_s32(a, b); }

    static inline Int32x4 v_and(Int32x4 a, Int32x4 b) { return vandq_s32(a, b); }

    static inline Float32x4 v_sub(Float32x4 a, Float32x4 b) { return vsubq_f32(a, b); }
    static inline Int32x4 v_sub(Int32x4 a, Int32x4 b) { return vsubq_s32(a, b); }

    static inline Float32x4 v_mul(Float32x4 a, Float32x4 b) { return vmulq_f32(a, b); }

    static inline Float32x4 v_sqrt(Float32x4 x) { Float32x4 y = vrsqrteq_f32(x); y = vmulq_f32(y, vrsqrtsq_f32(x, vmulq_f32(y, y))); return vmulq_f32(x, y); }

    static inline Int32x4 v_toInt(Float32x4 x) { return vcvtq_s32_f32(x); }

    static inline Int32x4 v_greaterOrEqual(Float32x4 a, Float32x4 b) { return vreinterpretq_s32_u32(vcgeq_f32(a, b)); }
};
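
/*
    QSimdNeon is a plug-in vector backend for the shared SIMD templates used
    by the gradient fetcher below; equivalent backends exist for other
    instruction sets (e.g. a QSimdSse2 in the SSE2 draw helpers). Note
    v_sqrt: AArch32 NEON has no packed float sqrt instruction, so it is
    built from the reciprocal-square-root estimate plus one Newton-Raphson
    refinement, roughly (illustrative reading of the code above):

        float32x4_t y = vrsqrteq_f32(x);                     // y ~= 1/sqrt(x)
        y = vmulq_f32(y, vrsqrtsq_f32(x, vmulq_f32(y, y)));  // refine estimate
        return vmulq_f32(x, y);                              // sqrt(x) = x * 1/sqrt(x)
*/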

const uint * QT_FASTCALL qt_fetch_radial_gradient_neon(uint *buffer, const Operator *op, const QSpanData *data,
                                                       int y, int x, int length)
{
    return qt_fetch_radial_gradient_template<QRadialFetchSimd<QSimdNeon>, uint>(buffer, op, data, y, x, length);
}

extern void QT_FASTCALL qt_convert_rgb888_to_rgb32_neon(quint32 *dst, const uchar *src, int len);

const uint * QT_FASTCALL qt_fetchUntransformed_888_neon(uint *buffer, const Operator *, const QSpanData *data,
                                                        int y, int x, int length)
{
    const uchar *line = data->texture.scanLine(y) + x * 3;
    qt_convert_rgb888_to_rgb32_neon(buffer, line, length);
    return buffer;
}

#if Q_BYTE_ORDER == Q_LITTLE_ENDIAN
static inline uint32x4_t vrgba2argb(uint32x4_t srcVector)
{
#if defined(Q_PROCESSOR_ARM_64)
    const uint8x16_t rgbaMask = { 2, 1, 0, 3, 6, 5, 4, 7, 10, 9, 8, 11, 14, 13, 12, 15 };
    srcVector = vreinterpretq_u32_u8(vqtbl1q_u8(vreinterpretq_u8_u32(srcVector), rgbaMask));
#else
    const uint8x8_t rgbaMask = { 2, 1, 0, 3, 6, 5, 4, 7 };
    // no vqtbl1q_u8, so use two vtbl1_u8
    const uint8x8_t low = vtbl1_u8(vreinterpret_u8_u32(vget_low_u32(srcVector)), rgbaMask);
    const uint8x8_t high = vtbl1_u8(vreinterpret_u8_u32(vget_high_u32(srcVector)), rgbaMask);
    srcVector = vcombine_u32(vreinterpret_u32_u8(low), vreinterpret_u32_u8(high));
#endif
    return srcVector;
}
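
/*
    vrgba2argb swaps the R and B bytes of four packed pixels in one shuffle.
    On little-endian, 0xAARRGGBB is laid out in memory as BB GG RR AA, so the
    byte-index mask { 2, 1, 0, 3, ... } exchanges bytes 0 and 2 of every
    32-bit lane. A scalar sketch of the same operation (illustrative only):

        static inline uint rgba2argb_scalar(uint x)
        {
            // keep G and A, swap the low and high color bytes
            return (x & 0xff00ff00) | ((x & 0x00ff0000) >> 16) | ((x & 0x000000ff) << 16);
        }
*/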

template<bool RGBA>
static inline void convertARGBToARGB32PM_neon(uint *buffer, const uint *src, int count)
{
    int i = 0;
    const uint8x8_t shuffleMask = { 3, 3, 3, 3, 7, 7, 7, 7 };
    const uint32x4_t blendMask = vdupq_n_u32(0xff000000);

    for (; i < count - 3; i += 4) {
        uint32x4_t srcVector = vld1q_u32(src + i);
        uint32x4_t alphaVector = vshrq_n_u32(srcVector, 24);
#if defined(Q_PROCESSOR_ARM_64)
        uint32_t alphaSum = vaddvq_u32(alphaVector);
#else
        // no vaddvq_u32
        uint32x2_t tmp = vpadd_u32(vget_low_u32(alphaVector), vget_high_u32(alphaVector));
        uint32_t alphaSum = vget_lane_u32(vpadd_u32(tmp, tmp), 0);
#endif
        if (alphaSum) {
            if (alphaSum != 255 * 4) {
                if (RGBA)
                    srcVector = vrgba2argb(srcVector);
                const uint8x8_t s1 = vreinterpret_u8_u32(vget_low_u32(srcVector));
                const uint8x8_t s2 = vreinterpret_u8_u32(vget_high_u32(srcVector));
                const uint8x8_t alpha1 = vtbl1_u8(s1, shuffleMask);
                const uint8x8_t alpha2 = vtbl1_u8(s2, shuffleMask);
                uint16x8_t src1 = vmull_u8(s1, alpha1);
                uint16x8_t src2 = vmull_u8(s2, alpha2);
                src1 = vsraq_n_u16(src1, src1, 8);
                src2 = vsraq_n_u16(src2, src2, 8);
                const uint8x8_t d1 = vrshrn_n_u16(src1, 8);
                const uint8x8_t d2 = vrshrn_n_u16(src2, 8);
                const uint32x4_t d = vbslq_u32(blendMask, srcVector, vreinterpretq_u32_u8(vcombine_u8(d1, d2)));
                vst1q_u32(buffer + i, d);
            } else {
                if (RGBA)
                    vst1q_u32(buffer + i, vrgba2argb(srcVector));
                else if (buffer != src)
                    vst1q_u32(buffer + i, srcVector);
            }
        } else {
            vst1q_u32(buffer + i, vdupq_n_u32(0));
        }
    }

    SIMD_EPILOGUE(i, count, 3) {
        uint v = qPremultiply(src[i]);
        buffer[i] = RGBA ? RGBA2ARGB(v) : v;
    }
}

template<bool RGBA>
static inline void convertARGB32ToRGBA64PM_neon(QRgba64 *buffer, const uint *src, int count)
{
    if (count <= 0)
        return;

    const uint8x8_t shuffleMask = { 3, 3, 3, 3, 7, 7, 7, 7 };
    const uint64x2_t blendMask = vdupq_n_u64(Q_UINT64_C(0xffff000000000000));

    int i = 0;
    for (; i < count-3; i += 4) {
        uint32x4_t vs32 = vld1q_u32(src + i);
        uint32x4_t alphaVector = vshrq_n_u32(vs32, 24);
#if defined(Q_PROCESSOR_ARM_64)
        uint32_t alphaSum = vaddvq_u32(alphaVector);
#else
        // no vaddvq_u32
        uint32x2_t tmp = vpadd_u32(vget_low_u32(alphaVector), vget_high_u32(alphaVector));
        uint32_t alphaSum = vget_lane_u32(vpadd_u32(tmp, tmp), 0);
#endif
        if (alphaSum) {
            if (!RGBA)
                vs32 = vrgba2argb(vs32);
            const uint8x16_t vs8 = vreinterpretq_u8_u32(vs32);
            const uint8x16x2_t v = vzipq_u8(vs8, vs8);
            if (alphaSum != 255 * 4) {
                const uint8x8_t s1 = vreinterpret_u8_u32(vget_low_u32(vs32));
                const uint8x8_t s2 = vreinterpret_u8_u32(vget_high_u32(vs32));
                const uint8x8_t alpha1 = vtbl1_u8(s1, shuffleMask);
                const uint8x8_t alpha2 = vtbl1_u8(s2, shuffleMask);
                uint16x8_t src1 = vmull_u8(s1, alpha1);
                uint16x8_t src2 = vmull_u8(s2, alpha2);
                // convert from 0->(255x255) to 0->(255x257)
                src1 = vsraq_n_u16(src1, src1, 7);
                src2 = vsraq_n_u16(src2, src2, 7);

                // now restore alpha from the trivial conversion
                const uint64x2_t d1 = vbslq_u64(blendMask, vreinterpretq_u64_u8(v.val[0]), vreinterpretq_u64_u16(src1));
                const uint64x2_t d2 = vbslq_u64(blendMask, vreinterpretq_u64_u8(v.val[1]), vreinterpretq_u64_u16(src2));

                vst1q_u16((uint16_t *)buffer, vreinterpretq_u16_u64(d1));
                buffer += 2;
                vst1q_u16((uint16_t *)buffer, vreinterpretq_u16_u64(d2));
                buffer += 2;
            } else {
                vst1q_u16((uint16_t *)buffer, vreinterpretq_u16_u8(v.val[0]));
                buffer += 2;
                vst1q_u16((uint16_t *)buffer, vreinterpretq_u16_u8(v.val[1]));
                buffer += 2;
            }
        } else {
            vst1q_u16((uint16_t *)buffer, vdupq_n_u16(0));
            buffer += 2;
            vst1q_u16((uint16_t *)buffer, vdupq_n_u16(0));
            buffer += 2;
        }
    }

    SIMD_EPILOGUE(i, count, 3) {
        uint s = src[i];
        if (RGBA)
            s = RGBA2ARGB(s);
        *buffer++ = QRgba64::fromArgb32(s).premultiplied();
    }
}

static inline float32x4_t reciprocal_mul_ps(float32x4_t a, float mul)
{
    float32x4_t ia = vrecpeq_f32(a); // estimate 1/a
    ia = vmulq_f32(vrecpsq_f32(a, ia), vmulq_n_f32(ia, mul)); // estimate improvement step * mul
    return ia;
}
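
/*
    reciprocal_mul_ps computes mul/a without a division: vrecpeq_f32 yields a
    low-precision estimate of 1/a, and vrecpsq_f32 supplies the
    Newton-Raphson factor (2 - a * estimate) that roughly doubles its
    precision. Per lane this amounts to (illustrative reading, assuming one
    refinement step suffices for 8-bit color):

        // ia = (2 - a * estimate) * (estimate * mul)  ~=  mul / a

    The unpremultiply code below calls it with mul = 255.0f, so each lane of
    ia holds approximately 255/alpha.
*/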

template<bool RGBA, bool RGBx>
static inline void convertARGBFromARGB32PM_neon(uint *buffer, const uint *src, int count)
{
    int i = 0;
    const uint32x4_t alphaMask = vdupq_n_u32(0xff000000);

    for (; i < count - 3; i += 4) {
        uint32x4_t srcVector = vld1q_u32(src + i);
        uint32x4_t alphaVector = vshrq_n_u32(srcVector, 24);
#if defined(Q_PROCESSOR_ARM_64)
        uint32_t alphaSum = vaddvq_u32(alphaVector);
#else
        // no vaddvq_u32
        uint32x2_t tmp = vpadd_u32(vget_low_u32(alphaVector), vget_high_u32(alphaVector));
        uint32_t alphaSum = vget_lane_u32(vpadd_u32(tmp, tmp), 0);
#endif
        if (alphaSum) {
            if (alphaSum != 255 * 4) {
                if (RGBA)
                    srcVector = vrgba2argb(srcVector);
                const float32x4_t a = vcvtq_f32_u32(alphaVector);
                const float32x4_t ia = reciprocal_mul_ps(a, 255.0f);
                // Convert 4x(4xU8) to 4x(4xF32)
                uint16x8_t tmp1 = vmovl_u8(vget_low_u8(vreinterpretq_u8_u32(srcVector)));
                uint16x8_t tmp3 = vmovl_u8(vget_high_u8(vreinterpretq_u8_u32(srcVector)));
                float32x4_t src1 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp1)));
                float32x4_t src2 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp1)));
                float32x4_t src3 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp3)));
                float32x4_t src4 = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp3)));
                src1 = vmulq_lane_f32(src1, vget_low_f32(ia), 0);
                src2 = vmulq_lane_f32(src2, vget_low_f32(ia), 1);
                src3 = vmulq_lane_f32(src3, vget_high_f32(ia), 0);
                src4 = vmulq_lane_f32(src4, vget_high_f32(ia), 1);
                // Convert 4x(4xF32) back to 4x(4xU8) (over a 8.1 fixed point format to get rounding)
                tmp1 = vcombine_u16(vrshrn_n_u32(vcvtq_n_u32_f32(src1, 1), 1),
                                    vrshrn_n_u32(vcvtq_n_u32_f32(src2, 1), 1));
                tmp3 = vcombine_u16(vrshrn_n_u32(vcvtq_n_u32_f32(src3, 1), 1),
                                    vrshrn_n_u32(vcvtq_n_u32_f32(src4, 1), 1));
                uint32x4_t dstVector = vreinterpretq_u32_u8(vcombine_u8(vmovn_u16(tmp1), vmovn_u16(tmp3)));
                // Overwrite any undefined results from alpha==0 with zeros:
#if defined(Q_PROCESSOR_ARM_64)
                uint32x4_t srcVectorAlphaMask = vceqzq_u32(alphaVector);
#else
                uint32x4_t srcVectorAlphaMask = vceqq_u32(alphaVector, vdupq_n_u32(0));
#endif
                dstVector = vbicq_u32(dstVector, srcVectorAlphaMask);
                // Restore or mask alpha values:
                if (RGBx)
                    dstVector = vorrq_u32(alphaMask, dstVector);
                else
                    dstVector = vbslq_u32(alphaMask, srcVector, dstVector);
                vst1q_u32(&buffer[i], dstVector);
            } else {
                // 4xAlpha==255, no change except if we are doing RGBA->ARGB:
                if (RGBA)
                    vst1q_u32(&buffer[i], vrgba2argb(srcVector));
                else if (buffer != src)
                    vst1q_u32(&buffer[i], srcVector);
            }
        } else {
            // 4xAlpha==0, always zero, except if output is RGBx:
            if (RGBx)
                vst1q_u32(&buffer[i], alphaMask);
            else
                vst1q_u32(&buffer[i], vdupq_n_u32(0));
        }
    }

    SIMD_EPILOGUE(i, count, 3) {
        uint v = qUnpremultiply(src[i]);
        if (RGBx)
            v = 0xff000000 | v;
        if (RGBA)
            v = ARGB2RGBA(v);
        buffer[i] = v;
    }
}

void QT_FASTCALL convertARGB32ToARGB32PM_neon(uint *buffer, int count, const QVector<QRgb> *)
{
    convertARGBToARGB32PM_neon<false>(buffer, buffer, count);
}

void QT_FASTCALL convertRGBA8888ToARGB32PM_neon(uint *buffer, int count, const QVector<QRgb> *)
{
    convertARGBToARGB32PM_neon<true>(buffer, buffer, count);
}

const uint *QT_FASTCALL fetchARGB32ToARGB32PM_neon(uint *buffer, const uchar *src, int index, int count,
                                                   const QVector<QRgb> *, QDitherInfo *)
{
    convertARGBToARGB32PM_neon<false>(buffer, reinterpret_cast<const uint *>(src) + index, count);
    return buffer;
}

const uint *QT_FASTCALL fetchRGBA8888ToARGB32PM_neon(uint *buffer, const uchar *src, int index, int count,
                                                     const QVector<QRgb> *, QDitherInfo *)
{
    convertARGBToARGB32PM_neon<true>(buffer, reinterpret_cast<const uint *>(src) + index, count);
    return buffer;
}

const QRgba64 * QT_FASTCALL convertARGB32ToRGBA64PM_neon(QRgba64 *buffer, const uint *src, int count,
                                                         const QVector<QRgb> *, QDitherInfo *)
{
    convertARGB32ToRGBA64PM_neon<false>(buffer, src, count);
    return buffer;
}

const QRgba64 * QT_FASTCALL convertRGBA8888ToRGBA64PM_neon(QRgba64 *buffer, const uint *src, int count,
                                                           const QVector<QRgb> *, QDitherInfo *)
{
    convertARGB32ToRGBA64PM_neon<true>(buffer, src, count);
    return buffer;
}

const QRgba64 *QT_FASTCALL fetchARGB32ToRGBA64PM_neon(QRgba64 *buffer, const uchar *src, int index, int count,
                                                      const QVector<QRgb> *, QDitherInfo *)
{
    convertARGB32ToRGBA64PM_neon<false>(buffer, reinterpret_cast<const uint *>(src) + index, count);
    return buffer;
}

const QRgba64 *QT_FASTCALL fetchRGBA8888ToRGBA64PM_neon(QRgba64 *buffer, const uchar *src, int index, int count,
                                                        const QVector<QRgb> *, QDitherInfo *)
{
    convertARGB32ToRGBA64PM_neon<true>(buffer, reinterpret_cast<const uint *>(src) + index, count);
    return buffer;
}

void QT_FASTCALL storeRGB32FromARGB32PM_neon(uchar *dest, const uint *src, int index, int count,
                                             const QVector<QRgb> *, QDitherInfo *)
{
    uint *d = reinterpret_cast<uint *>(dest) + index;
    convertARGBFromARGB32PM_neon<false, true>(d, src, count);
}

void QT_FASTCALL storeARGB32FromARGB32PM_neon(uchar *dest, const uint *src, int index, int count,
                                              const QVector<QRgb> *, QDitherInfo *)
{
    uint *d = reinterpret_cast<uint *>(dest) + index;
    convertARGBFromARGB32PM_neon<false, false>(d, src, count);
}

void QT_FASTCALL storeRGBA8888FromARGB32PM_neon(uchar *dest, const uint *src, int index, int count,
                                                const QVector<QRgb> *, QDitherInfo *)
{
    uint *d = reinterpret_cast<uint *>(dest) + index;
    convertARGBFromARGB32PM_neon<true, false>(d, src, count);
}

void QT_FASTCALL storeRGBXFromARGB32PM_neon(uchar *dest, const uint *src, int index, int count,
                                            const QVector<QRgb> *, QDitherInfo *)
{
    uint *d = reinterpret_cast<uint *>(dest) + index;
    convertARGBFromARGB32PM_neon<true, true>(d, src, count);
}

#endif // Q_BYTE_ORDER == Q_LITTLE_ENDIAN

QT_END_NAMESPACE

#endif // __ARM_NEON__