1// Copyright (C) 2022 The Qt Company Ltd.
2// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3
4#include "qcolortransform.h"
5#include "qcolortransform_p.h"
6
7#include "qcolormatrix_p.h"
8#include "qcolorspace_p.h"
9#include "qcolortrc_p.h"
10#include "qcolortrclut_p.h"
11
12#include <QtCore/qatomic.h>
13#include <QtCore/qmath.h>
14#include <QtGui/qcolor.h>
15#include <QtGui/qimage.h>
16#include <QtGui/qtransform.h>
17#include <QtCore/private/qsimd_p.h>
18
19#include <qdebug.h>
20
21QT_BEGIN_NAMESPACE
22
23std::shared_ptr<QColorTrcLut> lutFromTrc(const QColorTrc &trc)
24{
25 if (trc.m_type == QColorTrc::Type::Table)
26 return QColorTrcLut::fromTransferTable(transTable: trc.m_table);
27 if (trc.m_type == QColorTrc::Type::Function)
28 return QColorTrcLut::fromTransferFunction(transfn: trc.m_fun);
29 qWarning() << "TRC uninitialized";
30 return nullptr;
31}
32
33void QColorTransformPrivate::updateLutsIn() const
34{
35 if (colorSpaceIn->lut.generated.loadAcquire())
36 return;
37 QMutexLocker lock(&QColorSpacePrivate::s_lutWriteLock);
38 if (colorSpaceIn->lut.generated.loadRelaxed())
39 return;
40
41 for (int i = 0; i < 3; ++i) {
42 if (!colorSpaceIn->trc[i].isValid())
43 return;
44 }
45
46 if (colorSpaceIn->trc[0] == colorSpaceIn->trc[1] && colorSpaceIn->trc[0] == colorSpaceIn->trc[2]) {
47 colorSpaceIn->lut[0] = lutFromTrc(trc: colorSpaceIn->trc[0]);
48 colorSpaceIn->lut[1] = colorSpaceIn->lut[0];
49 colorSpaceIn->lut[2] = colorSpaceIn->lut[0];
50 } else {
51 for (int i = 0; i < 3; ++i)
52 colorSpaceIn->lut[i] = lutFromTrc(trc: colorSpaceIn->trc[i]);
53 }
54
55 colorSpaceIn->lut.generated.storeRelease(newValue: 1);
56}
57
58void QColorTransformPrivate::updateLutsOut() const
59{
60 if (colorSpaceOut->lut.generated.loadAcquire())
61 return;
62 QMutexLocker lock(&QColorSpacePrivate::s_lutWriteLock);
63 if (colorSpaceOut->lut.generated.loadRelaxed())
64 return;
65 for (int i = 0; i < 3; ++i) {
66 if (!colorSpaceOut->trc[i].isValid())
67 return;
68 }
69
70 if (colorSpaceOut->trc[0] == colorSpaceOut->trc[1] && colorSpaceOut->trc[0] == colorSpaceOut->trc[2]) {
71 colorSpaceOut->lut[0] = lutFromTrc(trc: colorSpaceOut->trc[0]);
72 colorSpaceOut->lut[1] = colorSpaceOut->lut[0];
73 colorSpaceOut->lut[2] = colorSpaceOut->lut[0];
74 } else {
75 for (int i = 0; i < 3; ++i)
76 colorSpaceOut->lut[i] = lutFromTrc(trc: colorSpaceOut->trc[i]);
77 }
78
79 colorSpaceOut->lut.generated.storeRelease(newValue: 1);
80}
81
/*!
    \class QColorTransform
    \brief The QColorTransform class is a transformation between color spaces.
    \since 5.14

    \ingroup painting
    \ingroup appearance
    \inmodule QtGui

    QColorTransform is an instantiation of a transformation between color spaces.
    It can be applied to colors and pixels to convert them from one color space
    to another.

    Setting up a QColorTransform takes some preprocessing, so keeping around
    QColorTransforms that you need often is recommended, instead of generating
    them on the fly.
*/
99
100
101QColorTransform::QColorTransform(const QColorTransform &colorTransform) noexcept = default;
102
103QColorTransform::~QColorTransform() = default;
104
105QT_DEFINE_QESDP_SPECIALIZATION_DTOR(QColorTransformPrivate)
106
107/*!
108 \since 6.4
109 Returns true if the color transform is the identity transform.
110*/
111bool QColorTransform::isIdentity() const noexcept
112{
113 return !d || d->isIdentity();
114}
115
116/*!
117 \fn bool QColorTransform::operator==(const QColorTransform &ct1, const QColorTransform &ct2)
118 \since 6.4
119 Returns true if \a ct1 defines the same color transformation as \a ct2.
120*/
121
122/*!
123 \fn bool QColorTransform::operator!=(const QColorTransform &ct1, const QColorTransform &ct2)
124 \since 6.4
125 Returns true if \a ct1 does not define the same transformation as \a ct2.
126*/
127
128/*! \internal
129*/
130bool QColorTransform::compare(const QColorTransform &other) const
131{
132 if (d == other.d)
133 return true;
134 if (bool(d) != bool(other.d))
135 return d ? d->isIdentity() : other.d->isIdentity();
136 if (d->colorMatrix != other.d->colorMatrix)
137 return false;
138 if (bool(d->colorSpaceIn) != bool(other.d->colorSpaceIn))
139 return false;
140 if (bool(d->colorSpaceOut) != bool(other.d->colorSpaceOut))
141 return false;
142 for (int i = 0; i < 3; ++i) {
143 if (d->colorSpaceIn && d->colorSpaceIn->trc[i] != other.d->colorSpaceIn->trc[i])
144 return false;
145 if (d->colorSpaceOut && d->colorSpaceOut->trc[i] != other.d->colorSpaceOut->trc[i])
146 return false;
147 }
148 return true;
149}
150
151/*!
152 Applies the color transformation on the QRgb value \a argb.
153
154 The input should be opaque or unpremultiplied.
155*/
156QRgb QColorTransform::map(QRgb argb) const
157{
158 if (!d)
159 return argb;
160 constexpr float f = 1.0f / 255.0f;
161 QColorVector c = { qRed(rgb: argb) * f, qGreen(rgb: argb) * f, qBlue(rgb: argb) * f };
162 if (d->colorSpaceIn->lut.generated.loadAcquire()) {
163 c.x = d->colorSpaceIn->lut[0]->toLinear(f: c.x);
164 c.y = d->colorSpaceIn->lut[1]->toLinear(f: c.y);
165 c.z = d->colorSpaceIn->lut[2]->toLinear(f: c.z);
166 } else {
167 c.x = d->colorSpaceIn->trc[0].apply(x: c.x);
168 c.y = d->colorSpaceIn->trc[1].apply(x: c.y);
169 c.z = d->colorSpaceIn->trc[2].apply(x: c.z);
170 }
171 c = d->colorMatrix.map(c);
172 c.x = std::max(a: 0.0f, b: std::min(a: 1.0f, b: c.x));
173 c.y = std::max(a: 0.0f, b: std::min(a: 1.0f, b: c.y));
174 c.z = std::max(a: 0.0f, b: std::min(a: 1.0f, b: c.z));
175 if (d->colorSpaceOut->lut.generated.loadAcquire()) {
176 c.x = d->colorSpaceOut->lut[0]->fromLinear(f: c.x);
177 c.y = d->colorSpaceOut->lut[1]->fromLinear(f: c.y);
178 c.z = d->colorSpaceOut->lut[2]->fromLinear(f: c.z);
179 } else {
180 c.x = d->colorSpaceOut->trc[0].applyInverse(x: c.x);
181 c.y = d->colorSpaceOut->trc[1].applyInverse(x: c.y);
182 c.z = d->colorSpaceOut->trc[2].applyInverse(x: c.z);
183 }
184
185 return qRgba(r: c.x * 255 + 0.5f, g: c.y * 255 + 0.5f, b: c.z * 255 + 0.5f, a: qAlpha(rgb: argb));
186}
187
188/*!
189 Applies the color transformation on the QRgba64 value \a rgba64.
190
191 The input should be opaque or unpremultiplied.
192*/
193QRgba64 QColorTransform::map(QRgba64 rgba64) const
194{
195 if (!d)
196 return rgba64;
197 constexpr float f = 1.0f / 65535.0f;
198 QColorVector c = { rgba64.red() * f, rgba64.green() * f, rgba64.blue() * f };
199 if (d->colorSpaceIn->lut.generated.loadAcquire()) {
200 c.x = d->colorSpaceIn->lut[0]->toLinear(f: c.x);
201 c.y = d->colorSpaceIn->lut[1]->toLinear(f: c.y);
202 c.z = d->colorSpaceIn->lut[2]->toLinear(f: c.z);
203 } else {
204 c.x = d->colorSpaceIn->trc[0].apply(x: c.x);
205 c.y = d->colorSpaceIn->trc[1].apply(x: c.y);
206 c.z = d->colorSpaceIn->trc[2].apply(x: c.z);
207 }
208 c = d->colorMatrix.map(c);
209 c.x = std::max(a: 0.0f, b: std::min(a: 1.0f, b: c.x));
210 c.y = std::max(a: 0.0f, b: std::min(a: 1.0f, b: c.y));
211 c.z = std::max(a: 0.0f, b: std::min(a: 1.0f, b: c.z));
212 if (d->colorSpaceOut->lut.generated.loadAcquire()) {
213 c.x = d->colorSpaceOut->lut[0]->fromLinear(f: c.x);
214 c.y = d->colorSpaceOut->lut[1]->fromLinear(f: c.y);
215 c.z = d->colorSpaceOut->lut[2]->fromLinear(f: c.z);
216 } else {
217 c.x = d->colorSpaceOut->trc[0].applyInverse(x: c.x);
218 c.y = d->colorSpaceOut->trc[1].applyInverse(x: c.y);
219 c.z = d->colorSpaceOut->trc[2].applyInverse(x: c.z);
220 }
221
222 return QRgba64::fromRgba64(red: c.x * 65535.f + 0.5f, green: c.y * 65535.f + 0.5f, blue: c.z * 65535.f + 0.5f, alpha: rgba64.alpha());
223}
224
225/*!
226 Applies the color transformation on the QRgbaFloat16 value \a rgbafp16.
227
228 The input should be opaque or unpremultiplied.
229 \since 6.4
230*/
231QRgbaFloat16 QColorTransform::map(QRgbaFloat16 rgbafp16) const
232{
233 if (!d)
234 return rgbafp16;
235 QColorVector c;
236 c.x = d->colorSpaceIn->trc[0].applyExtended(x: rgbafp16.r);
237 c.y = d->colorSpaceIn->trc[1].applyExtended(x: rgbafp16.g);
238 c.z = d->colorSpaceIn->trc[2].applyExtended(x: rgbafp16.b);
239 c = d->colorMatrix.map(c);
240 rgbafp16.r = qfloat16(d->colorSpaceOut->trc[0].applyInverseExtended(x: c.x));
241 rgbafp16.g = qfloat16(d->colorSpaceOut->trc[1].applyInverseExtended(x: c.y));
242 rgbafp16.b = qfloat16(d->colorSpaceOut->trc[2].applyInverseExtended(x: c.z));
243 return rgbafp16;
244}
245
246/*!
247 Applies the color transformation on the QRgbaFloat32 value \a rgbafp32.
248
249 The input should be opaque or unpremultiplied.
250 \since 6.4
251*/
252QRgbaFloat32 QColorTransform::map(QRgbaFloat32 rgbafp32) const
253{
254 if (!d)
255 return rgbafp32;
256 QColorVector c;
257 c.x = d->colorSpaceIn->trc[0].applyExtended(x: rgbafp32.r);
258 c.y = d->colorSpaceIn->trc[1].applyExtended(x: rgbafp32.g);
259 c.z = d->colorSpaceIn->trc[2].applyExtended(x: rgbafp32.b);
260 c = d->colorMatrix.map(c);
261 rgbafp32.r = d->colorSpaceOut->trc[0].applyInverseExtended(x: c.x);
262 rgbafp32.g = d->colorSpaceOut->trc[1].applyInverseExtended(x: c.y);
263 rgbafp32.b = d->colorSpaceOut->trc[2].applyInverseExtended(x: c.z);
264 return rgbafp32;
265}
266
267/*!
268 Applies the color transformation on the QColor value \a color.
269
270*/
271QColor QColorTransform::map(const QColor &color) const
272{
273 if (!d)
274 return color;
275 QColor clr = color;
276 if (color.spec() != QColor::ExtendedRgb || color.spec() != QColor::Rgb)
277 clr = clr.toRgb();
278
279 QColorVector c = { (float)clr.redF(), (float)clr.greenF(), (float)clr.blueF() };
280 if (clr.spec() == QColor::ExtendedRgb) {
281 c.x = d->colorSpaceIn->trc[0].applyExtended(x: c.x);
282 c.y = d->colorSpaceIn->trc[1].applyExtended(x: c.y);
283 c.z = d->colorSpaceIn->trc[2].applyExtended(x: c.z);
284 } else {
285 c.x = d->colorSpaceIn->trc[0].apply(x: c.x);
286 c.y = d->colorSpaceIn->trc[1].apply(x: c.y);
287 c.z = d->colorSpaceIn->trc[2].apply(x: c.z);
288 }
289 c = d->colorMatrix.map(c);
290 bool inGamut = c.x >= 0.0f && c.x <= 1.0f && c.y >= 0.0f && c.y <= 1.0f && c.z >= 0.0f && c.z <= 1.0f;
291 if (inGamut) {
292 if (d->colorSpaceOut->lut.generated.loadAcquire()) {
293 c.x = d->colorSpaceOut->lut[0]->fromLinear(f: c.x);
294 c.y = d->colorSpaceOut->lut[1]->fromLinear(f: c.y);
295 c.z = d->colorSpaceOut->lut[2]->fromLinear(f: c.z);
296 } else {
297 c.x = d->colorSpaceOut->trc[0].applyInverse(x: c.x);
298 c.y = d->colorSpaceOut->trc[1].applyInverse(x: c.y);
299 c.z = d->colorSpaceOut->trc[2].applyInverse(x: c.z);
300 }
301 } else {
302 c.x = d->colorSpaceOut->trc[0].applyInverseExtended(x: c.x);
303 c.y = d->colorSpaceOut->trc[1].applyInverseExtended(x: c.y);
304 c.z = d->colorSpaceOut->trc[2].applyInverseExtended(x: c.z);
305 }
306 QColor out;
307 out.setRgbF(r: c.x, g: c.y, b: c.z, a: color.alphaF());
308 return out;
309}
310
311// Optimized sub-routines for fast block based conversion:
312
313template<bool DoClamp = true>
314static void applyMatrix(QColorVector *buffer, const qsizetype len, const QColorMatrix &colorMatrix)
315{
316#if defined(__SSE2__)
317 const __m128 minV = _mm_set1_ps(w: 0.0f);
318 const __m128 maxV = _mm_set1_ps(w: 1.0f);
319 const __m128 xMat = _mm_loadu_ps(p: &colorMatrix.r.x);
320 const __m128 yMat = _mm_loadu_ps(p: &colorMatrix.g.x);
321 const __m128 zMat = _mm_loadu_ps(p: &colorMatrix.b.x);
322 for (qsizetype j = 0; j < len; ++j) {
323 __m128 c = _mm_loadu_ps(p: &buffer[j].x);
324 __m128 cx = _mm_shuffle_ps(c, c, _MM_SHUFFLE(0, 0, 0, 0));
325 __m128 cy = _mm_shuffle_ps(c, c, _MM_SHUFFLE(1, 1, 1, 1));
326 __m128 cz = _mm_shuffle_ps(c, c, _MM_SHUFFLE(2, 2, 2, 2));
327 cx = _mm_mul_ps(a: cx, b: xMat);
328 cy = _mm_mul_ps(a: cy, b: yMat);
329 cz = _mm_mul_ps(a: cz, b: zMat);
330 cx = _mm_add_ps(a: cx, b: cy);
331 cx = _mm_add_ps(a: cx, b: cz);
332 // Clamp:
333 if (DoClamp) {
334 cx = _mm_min_ps(a: cx, b: maxV);
335 cx = _mm_max_ps(a: cx, b: minV);
336 }
337 _mm_storeu_ps(p: &buffer[j].x, a: cx);
338 }
339#elif defined(__ARM_NEON__)
340 const float32x4_t minV = vdupq_n_f32(0.0f);
341 const float32x4_t maxV = vdupq_n_f32(1.0f);
342 const float32x4_t xMat = vld1q_f32(&colorMatrix.r.x);
343 const float32x4_t yMat = vld1q_f32(&colorMatrix.g.x);
344 const float32x4_t zMat = vld1q_f32(&colorMatrix.b.x);
345 for (qsizetype j = 0; j < len; ++j) {
346 float32x4_t c = vld1q_f32(&buffer[j].x);
347 float32x4_t cx = vmulq_n_f32(xMat, vgetq_lane_f32(c, 0));
348 float32x4_t cy = vmulq_n_f32(yMat, vgetq_lane_f32(c, 1));
349 float32x4_t cz = vmulq_n_f32(zMat, vgetq_lane_f32(c, 2));
350 cx = vaddq_f32(cx, cy);
351 cx = vaddq_f32(cx, cz);
352 // Clamp:
353 if (DoClamp) {
354 cx = vminq_f32(cx, maxV);
355 cx = vmaxq_f32(cx, minV);
356 }
357 vst1q_f32(&buffer[j].x, cx);
358 }
359#else
360 for (int j = 0; j < len; ++j) {
361 const QColorVector cv = colorMatrix.map(buffer[j]);
362 if (DoClamp) {
363 buffer[j].x = std::max(0.0f, std::min(1.0f, cv.x));
364 buffer[j].y = std::max(0.0f, std::min(1.0f, cv.y));
365 buffer[j].z = std::max(0.0f, std::min(1.0f, cv.z));
366 } else {
367 buffer[j] = cv;
368 }
369 }
370#endif
371}
372
373#if defined(__SSE2__) || defined(__ARM_NEON__)
374template<typename T>
375static constexpr inline bool isArgb();
376template<>
377constexpr inline bool isArgb<QRgb>() { return true; }
378template<>
379constexpr inline bool isArgb<QRgba64>() { return false; }
380
381template<typename T>
382static inline int getAlpha(const T &p);
383template<>
384inline int getAlpha<QRgb>(const QRgb &p)
385{ return qAlpha(rgb: p); }
386template<>
387inline int getAlpha<QRgba64>(const QRgba64 &p)
388{ return p.alpha(); }
389#endif
390
391template<typename T>
392static void loadPremultiplied(QColorVector *buffer, const T *src, const qsizetype len, const QColorTransformPrivate *d_ptr);
393template<typename T>
394static void loadUnpremultiplied(QColorVector *buffer, const T *src, const qsizetype len, const QColorTransformPrivate *d_ptr);
395
396#if defined(__SSE2__)
397// Load to [0-alpha] in 4x32 SIMD
398template<typename T>
399static inline void loadP(const T &p, __m128i &v);
400
401template<>
402inline void loadP<QRgb>(const QRgb &p, __m128i &v)
403{
404 v = _mm_cvtsi32_si128(a: p);
405#if defined(__SSE4_1__)
406 v = _mm_cvtepu8_epi32(v);
407#else
408 v = _mm_unpacklo_epi8(a: v, b: _mm_setzero_si128());
409 v = _mm_unpacklo_epi16(a: v, b: _mm_setzero_si128());
410#endif
411}
412
413template<>
414inline void loadP<QRgba64>(const QRgba64 &p, __m128i &v)
415{
416 v = _mm_loadl_epi64(p: (const __m128i *)&p);
417#if defined(__SSE4_1__)
418 v = _mm_cvtepu16_epi32(v);
419#else
420 v = _mm_unpacklo_epi16(a: v, b: _mm_setzero_si128());
421#endif
422}
423
424template<typename T>
425static void loadPremultiplied(QColorVector *buffer, const T *src, const qsizetype len, const QColorTransformPrivate *d_ptr)
426{
427 const __m128 v4080 = _mm_set1_ps(w: 4080.f);
428 const __m128 iFF00 = _mm_set1_ps(w: 1.0f / (255 * 256));
429 constexpr bool isARGB = isArgb<T>();
430 for (qsizetype i = 0; i < len; ++i) {
431 __m128i v;
432 loadP<T>(src[i], v);
433 __m128 vf = _mm_cvtepi32_ps(a: v);
434 // Approximate 1/a:
435 __m128 va = _mm_shuffle_ps(vf, vf, _MM_SHUFFLE(3, 3, 3, 3));
436 __m128 via = _mm_rcp_ps(a: va);
437 via = _mm_sub_ps(a: _mm_add_ps(a: via, b: via), b: _mm_mul_ps(a: via, b: _mm_mul_ps(a: via, b: va)));
438 // v * (1/a)
439 vf = _mm_mul_ps(a: vf, b: via);
440
441 // Handle zero alpha
442 __m128 vAlphaMask = _mm_cmpeq_ps(a: va, b: _mm_set1_ps(w: 0.0f));
443 vf = _mm_andnot_ps(a: vAlphaMask, b: vf);
444
445 // LUT
446 v = _mm_cvtps_epi32(a: _mm_mul_ps(a: vf, b: v4080));
447 const int ridx = isARGB ? _mm_extract_epi16(v, 4) : _mm_extract_epi16(v, 0);
448 const int gidx = _mm_extract_epi16(v, 2);
449 const int bidx = isARGB ? _mm_extract_epi16(v, 0) : _mm_extract_epi16(v, 4);
450 v = _mm_insert_epi16(v, d_ptr->colorSpaceIn->lut[0]->m_toLinear[ridx], 0);
451 v = _mm_insert_epi16(v, d_ptr->colorSpaceIn->lut[1]->m_toLinear[gidx], 2);
452 v = _mm_insert_epi16(v, d_ptr->colorSpaceIn->lut[2]->m_toLinear[bidx], 4);
453 vf = _mm_mul_ps(a: _mm_cvtepi32_ps(a: v), b: iFF00);
454
455 _mm_storeu_ps(p: &buffer[i].x, a: vf);
456 }
457}
458
459template<>
460void loadPremultiplied<QRgbaFloat32>(QColorVector *buffer, const QRgbaFloat32 *src, const qsizetype len, const QColorTransformPrivate *d_ptr)
461{
462 const __m128 v4080 = _mm_set1_ps(w: 4080.f);
463 const __m128 viFF00 = _mm_set1_ps(w: 1.0f / (255 * 256));
464 const __m128 vZero = _mm_set1_ps(w: 0.0f);
465 const __m128 vOne = _mm_set1_ps(w: 1.0f);
466 for (qsizetype i = 0; i < len; ++i) {
467 __m128 vf = _mm_loadu_ps(p: &src[i].r);
468 // Approximate 1/a:
469 __m128 va = _mm_shuffle_ps(vf, vf, _MM_SHUFFLE(3, 3, 3, 3));
470 __m128 via = _mm_rcp_ps(a: va);
471 via = _mm_sub_ps(a: _mm_add_ps(a: via, b: via), b: _mm_mul_ps(a: via, b: _mm_mul_ps(a: via, b: va)));
472 // v * (1/a)
473 vf = _mm_mul_ps(a: vf, b: via);
474
475 // Handle zero alpha
476 __m128 vAlphaMask = _mm_cmpeq_ps(a: va, b: vZero);
477 vf = _mm_andnot_ps(a: vAlphaMask, b: vf);
478
479 // LUT
480 const __m128 under = _mm_cmplt_ps(a: vf, b: vZero);
481 const __m128 over = _mm_cmpgt_ps(a: vf, b: vOne);
482 if (_mm_movemask_ps(a: _mm_or_ps(a: under, b: over)) == 0) {
483 // Within gamut
484 __m128i v = _mm_cvtps_epi32(a: _mm_mul_ps(a: vf, b: v4080));
485 const int ridx = _mm_extract_epi16(v, 0);
486 const int gidx = _mm_extract_epi16(v, 2);
487 const int bidx = _mm_extract_epi16(v, 4);
488 v = _mm_insert_epi16(v, d_ptr->colorSpaceIn->lut[0]->m_toLinear[ridx], 0);
489 v = _mm_insert_epi16(v, d_ptr->colorSpaceIn->lut[1]->m_toLinear[gidx], 2);
490 v = _mm_insert_epi16(v, d_ptr->colorSpaceIn->lut[2]->m_toLinear[bidx], 4);
491 vf = _mm_mul_ps(a: _mm_cvtepi32_ps(a: v), b: viFF00);
492 _mm_storeu_ps(p: &buffer[i].x, a: vf);
493 } else {
494 // Outside 0.0->1.0 gamut
495 _mm_storeu_ps(p: &buffer[i].x, a: vf);
496 buffer[i].x = d_ptr->colorSpaceIn->trc[0].applyExtended(x: buffer[i].x);
497 buffer[i].y = d_ptr->colorSpaceIn->trc[1].applyExtended(x: buffer[i].y);
498 buffer[i].z = d_ptr->colorSpaceIn->trc[2].applyExtended(x: buffer[i].z);
499 }
500 }
501}
502
503// Load to [0-4080] in 4x32 SIMD
504template<typename T>
505static inline void loadPU(const T &p, __m128i &v);
506
507template<>
508inline void loadPU<QRgb>(const QRgb &p, __m128i &v)
509{
510 v = _mm_cvtsi32_si128(a: p);
511#if defined(__SSE4_1__)
512 v = _mm_cvtepu8_epi32(v);
513#else
514 v = _mm_unpacklo_epi8(a: v, b: _mm_setzero_si128());
515 v = _mm_unpacklo_epi16(a: v, b: _mm_setzero_si128());
516#endif
517 v = _mm_slli_epi32(a: v, count: 4);
518}
519
520template<>
521inline void loadPU<QRgba64>(const QRgba64 &p, __m128i &v)
522{
523 v = _mm_loadl_epi64(p: (const __m128i *)&p);
524 v = _mm_sub_epi16(a: v, b: _mm_srli_epi16(a: v, count: 8));
525#if defined(__SSE4_1__)
526 v = _mm_cvtepu16_epi32(v);
527#else
528 v = _mm_unpacklo_epi16(a: v, b: _mm_setzero_si128());
529#endif
530 v = _mm_srli_epi32(a: v, count: 4);
531}
532
533template<typename T>
534void loadUnpremultiplied(QColorVector *buffer, const T *src, const qsizetype len, const QColorTransformPrivate *d_ptr)
535{
536 constexpr bool isARGB = isArgb<T>();
537 const __m128 iFF00 = _mm_set1_ps(w: 1.0f / (255 * 256));
538 for (qsizetype i = 0; i < len; ++i) {
539 __m128i v;
540 loadPU<T>(src[i], v);
541 const int ridx = isARGB ? _mm_extract_epi16(v, 4) : _mm_extract_epi16(v, 0);
542 const int gidx = _mm_extract_epi16(v, 2);
543 const int bidx = isARGB ? _mm_extract_epi16(v, 0) : _mm_extract_epi16(v, 4);
544 v = _mm_insert_epi16(v, d_ptr->colorSpaceIn->lut[0]->m_toLinear[ridx], 0);
545 v = _mm_insert_epi16(v, d_ptr->colorSpaceIn->lut[1]->m_toLinear[gidx], 2);
546 v = _mm_insert_epi16(v, d_ptr->colorSpaceIn->lut[2]->m_toLinear[bidx], 4);
547 __m128 vf = _mm_mul_ps(a: _mm_cvtepi32_ps(a: v), b: iFF00);
548 _mm_storeu_ps(p: &buffer[i].x, a: vf);
549 }
550}
551
552template<>
553void loadUnpremultiplied<QRgbaFloat32>(QColorVector *buffer, const QRgbaFloat32 *src, const qsizetype len, const QColorTransformPrivate *d_ptr)
554{
555 const __m128 v4080 = _mm_set1_ps(w: 4080.f);
556 const __m128 iFF00 = _mm_set1_ps(w: 1.0f / (255 * 256));
557 const __m128 vZero = _mm_set1_ps(w: 0.0f);
558 const __m128 vOne = _mm_set1_ps(w: 1.0f);
559 for (qsizetype i = 0; i < len; ++i) {
560 __m128 vf = _mm_loadu_ps(p: &src[i].r);
561 const __m128 under = _mm_cmplt_ps(a: vf, b: vZero);
562 const __m128 over = _mm_cmpgt_ps(a: vf, b: vOne);
563 if (_mm_movemask_ps(a: _mm_or_ps(a: under, b: over)) == 0) {
564 // Within gamut
565 __m128i v = _mm_cvtps_epi32(a: _mm_mul_ps(a: vf, b: v4080));
566 const int ridx = _mm_extract_epi16(v, 0);
567 const int gidx = _mm_extract_epi16(v, 2);
568 const int bidx = _mm_extract_epi16(v, 4);
569 v = _mm_insert_epi16(v, d_ptr->colorSpaceIn->lut[0]->m_toLinear[ridx], 0);
570 v = _mm_insert_epi16(v, d_ptr->colorSpaceIn->lut[1]->m_toLinear[gidx], 2);
571 v = _mm_insert_epi16(v, d_ptr->colorSpaceIn->lut[2]->m_toLinear[bidx], 4);
572 vf = _mm_mul_ps(a: _mm_cvtepi32_ps(a: v), b: iFF00);
573 _mm_storeu_ps(p: &buffer[i].x, a: vf);
574 } else {
575 // Outside 0.0->1.0 gamut
576 buffer[i].x = d_ptr->colorSpaceIn->trc[0].applyExtended(x: src[i].r);
577 buffer[i].y = d_ptr->colorSpaceIn->trc[1].applyExtended(x: src[i].g);
578 buffer[i].z = d_ptr->colorSpaceIn->trc[2].applyExtended(x: src[i].b);
579 }
580 }
581}
582
583#elif defined(__ARM_NEON__)
584// Load to [0-alpha] in 4x32 SIMD
585template<typename T>
586static inline void loadP(const T &p, uint32x4_t &v);
587
588template<>
589inline void loadP<QRgb>(const QRgb &p, uint32x4_t &v)
590{
591 v = vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_u32(vmov_n_u32(p)))));
592}
593
594template<>
595inline void loadP<QRgba64>(const QRgba64 &p, uint32x4_t &v)
596{
597 v = vmovl_u16(vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&p))));
598}
599
600template<typename T>
601static void loadPremultiplied(QColorVector *buffer, const T *src, const qsizetype len, const QColorTransformPrivate *d_ptr)
602{
603 constexpr bool isARGB = isArgb<T>();
604 const float iFF00 = 1.0f / (255 * 256);
605 for (qsizetype i = 0; i < len; ++i) {
606 uint32x4_t v;
607 loadP<T>(src[i], v);
608 float32x4_t vf = vcvtq_f32_u32(v);
609 // Approximate 1/a:
610 float32x4_t va = vdupq_n_f32(vgetq_lane_f32(vf, 3));
611 float32x4_t via = vrecpeq_f32(va); // estimate 1/a
612 via = vmulq_f32(vrecpsq_f32(va, via), via);
613
614 // v * (1/a)
615 vf = vmulq_f32(vf, via);
616
617 // Handle zero alpha
618#if defined(Q_PROCESSOR_ARM_64)
619 uint32x4_t vAlphaMask = vceqzq_f32(va);
620#else
621 uint32x4_t vAlphaMask = vceqq_f32(va, vdupq_n_f32(0.0));
622#endif
623 vf = vreinterpretq_f32_u32(vbicq_u32(vreinterpretq_u32_f32(vf), vAlphaMask));
624
625 // LUT
626 v = vcvtq_u32_f32(vaddq_f32(vmulq_n_f32(vf, 4080.f), vdupq_n_f32(0.5f)));
627 const int ridx = isARGB ? vgetq_lane_u32(v, 2) : vgetq_lane_u32(v, 0);
628 const int gidx = vgetq_lane_u32(v, 1);
629 const int bidx = isARGB ? vgetq_lane_u32(v, 0) : vgetq_lane_u32(v, 2);
630 v = vsetq_lane_u32(d_ptr->colorSpaceIn->lut[0]->m_toLinear[ridx], v, 0);
631 v = vsetq_lane_u32(d_ptr->colorSpaceIn->lut[1]->m_toLinear[gidx], v, 1);
632 v = vsetq_lane_u32(d_ptr->colorSpaceIn->lut[2]->m_toLinear[bidx], v, 2);
633 vf = vmulq_n_f32(vcvtq_f32_u32(v), iFF00);
634
635 vst1q_f32(&buffer[i].x, vf);
636 }
637}
638
639// Load to [0-4080] in 4x32 SIMD
640template<typename T>
641static inline void loadPU(const T &p, uint32x4_t &v);
642
643template<>
644inline void loadPU<QRgb>(const QRgb &p, uint32x4_t &v)
645{
646 v = vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_u32(vmov_n_u32(p)))));
647 v = vshlq_n_u32(v, 4);
648}
649
650template<>
651inline void loadPU<QRgba64>(const QRgba64 &p, uint32x4_t &v)
652{
653 uint16x4_t v16 = vreinterpret_u16_u64(vld1_u64(reinterpret_cast<const uint64_t *>(&p)));
654 v16 = vsub_u16(v16, vshr_n_u16(v16, 8));
655 v = vmovl_u16(v16);
656 v = vshrq_n_u32(v, 4);
657}
658
659template<typename T>
660void loadUnpremultiplied(QColorVector *buffer, const T *src, const qsizetype len, const QColorTransformPrivate *d_ptr)
661{
662 constexpr bool isARGB = isArgb<T>();
663 const float iFF00 = 1.0f / (255 * 256);
664 for (qsizetype i = 0; i < len; ++i) {
665 uint32x4_t v;
666 loadPU<T>(src[i], v);
667 const int ridx = isARGB ? vgetq_lane_u32(v, 2) : vgetq_lane_u32(v, 0);
668 const int gidx = vgetq_lane_u32(v, 1);
669 const int bidx = isARGB ? vgetq_lane_u32(v, 0) : vgetq_lane_u32(v, 2);
670 v = vsetq_lane_u32(d_ptr->colorSpaceIn->lut[0]->m_toLinear[ridx], v, 0);
671 v = vsetq_lane_u32(d_ptr->colorSpaceIn->lut[1]->m_toLinear[gidx], v, 1);
672 v = vsetq_lane_u32(d_ptr->colorSpaceIn->lut[2]->m_toLinear[bidx], v, 2);
673 float32x4_t vf = vmulq_n_f32(vcvtq_f32_u32(v), iFF00);
674 vst1q_f32(&buffer[i].x, vf);
675 }
676}
677#else
678template<>
679void loadPremultiplied<QRgb>(QColorVector *buffer, const QRgb *src, const qsizetype len, const QColorTransformPrivate *d_ptr)
680{
681 for (qsizetype i = 0; i < len; ++i) {
682 const uint p = src[i];
683 const int a = qAlpha(p);
684 if (a) {
685 const float ia = 4080.0f / a;
686 const int ridx = int(qRed(p) * ia + 0.5f);
687 const int gidx = int(qGreen(p) * ia + 0.5f);
688 const int bidx = int(qBlue(p) * ia + 0.5f);
689 buffer[i].x = d_ptr->colorSpaceIn->lut[0]->m_toLinear[ridx] * (1.0f / (255 * 256));
690 buffer[i].y = d_ptr->colorSpaceIn->lut[1]->m_toLinear[gidx] * (1.0f / (255 * 256));
691 buffer[i].z = d_ptr->colorSpaceIn->lut[2]->m_toLinear[bidx] * (1.0f / (255 * 256));
692 } else {
693 buffer[i].x = buffer[i].y = buffer[i].z = 0.0f;
694 }
695 }
696}
697
698template<>
699void loadPremultiplied<QRgba64>(QColorVector *buffer, const QRgba64 *src, const qsizetype len, const QColorTransformPrivate *d_ptr)
700{
701 for (qsizetype i = 0; i < len; ++i) {
702 const QRgba64 &p = src[i];
703 const int a = p.alpha();
704 if (a) {
705 const float ia = 4080.0f / a;
706 const int ridx = int(p.red() * ia + 0.5f);
707 const int gidx = int(p.green() * ia + 0.5f);
708 const int bidx = int(p.blue() * ia + 0.5f);
709 buffer[i].x = d_ptr->colorSpaceIn->lut[0]->m_toLinear[ridx] * (1.0f / (255 * 256));
710 buffer[i].y = d_ptr->colorSpaceIn->lut[1]->m_toLinear[gidx] * (1.0f / (255 * 256));
711 buffer[i].z = d_ptr->colorSpaceIn->lut[2]->m_toLinear[bidx] * (1.0f / (255 * 256));
712 } else {
713 buffer[i].x = buffer[i].y = buffer[i].z = 0.0f;
714 }
715 }
716}
717
718template<>
719void loadUnpremultiplied<QRgb>(QColorVector *buffer, const QRgb *src, const qsizetype len, const QColorTransformPrivate *d_ptr)
720{
721 for (qsizetype i = 0; i < len; ++i) {
722 const uint p = src[i];
723 buffer[i].x = d_ptr->colorSpaceIn->lut[0]->u8ToLinearF32(qRed(p));
724 buffer[i].y = d_ptr->colorSpaceIn->lut[1]->u8ToLinearF32(qGreen(p));
725 buffer[i].z = d_ptr->colorSpaceIn->lut[2]->u8ToLinearF32(qBlue(p));
726 }
727}
728
729template<>
730void loadUnpremultiplied<QRgba64>(QColorVector *buffer, const QRgba64 *src, const qsizetype len, const QColorTransformPrivate *d_ptr)
731{
732 for (qsizetype i = 0; i < len; ++i) {
733 const QRgba64 &p = src[i];
734 buffer[i].x = d_ptr->colorSpaceIn->lut[0]->u16ToLinearF32(p.red());
735 buffer[i].y = d_ptr->colorSpaceIn->lut[1]->u16ToLinearF32(p.green());
736 buffer[i].z = d_ptr->colorSpaceIn->lut[2]->u16ToLinearF32(p.blue());
737 }
738}
739#endif
740#if !defined(__SSE2__)
741template<>
742void loadPremultiplied<QRgbaFloat32>(QColorVector *buffer, const QRgbaFloat32 *src, const qsizetype len, const QColorTransformPrivate *d_ptr)
743{
744 for (qsizetype i = 0; i < len; ++i) {
745 const QRgbaFloat32 &p = src[i];
746 const float a = p.a;
747 if (a) {
748 const float ia = 1.0f / a;
749 buffer[i].x = d_ptr->colorSpaceIn->trc[0].applyExtended(p.r * ia);
750 buffer[i].y = d_ptr->colorSpaceIn->trc[1].applyExtended(p.g * ia);
751 buffer[i].z = d_ptr->colorSpaceIn->trc[2].applyExtended(p.b * ia);
752 } else {
753 buffer[i].x = buffer[i].y = buffer[i].z = 0.0f;
754 }
755 }
756}
757
758template<>
759void loadUnpremultiplied<QRgbaFloat32>(QColorVector *buffer, const QRgbaFloat32 *src, const qsizetype len, const QColorTransformPrivate *d_ptr)
760{
761 for (qsizetype i = 0; i < len; ++i) {
762 const QRgbaFloat32 &p = src[i];
763 buffer[i].x = d_ptr->colorSpaceIn->trc[0].applyExtended(p.r);
764 buffer[i].y = d_ptr->colorSpaceIn->trc[1].applyExtended(p.g);
765 buffer[i].z = d_ptr->colorSpaceIn->trc[2].applyExtended(p.b);
766 }
767}
768#endif
769
770#if defined(__SSE2__)
771template<typename T>
772static inline void storeP(T &p, __m128i &v, int a);
773template<>
774inline void storeP<QRgb>(QRgb &p, __m128i &v, int a)
775{
776 v = _mm_packs_epi32(a: v, b: v);
777 v = _mm_insert_epi16(v, a, 3);
778 p = _mm_cvtsi128_si32(a: _mm_packus_epi16(a: v, b: v));
779}
780template<>
781inline void storeP<QRgba64>(QRgba64 &p, __m128i &v, int a)
782{
783#if defined(__SSE4_1__)
784 v = _mm_packus_epi32(v, v);
785 v = _mm_insert_epi16(v, a, 3);
786 _mm_storel_epi64((__m128i *)&p, v);
787#else
788 const int r = _mm_extract_epi16(v, 0);
789 const int g = _mm_extract_epi16(v, 2);
790 const int b = _mm_extract_epi16(v, 4);
791 p = qRgba64(r, g, b, a);
792#endif
793}
794
795template<typename T>
796static void storePremultiplied(T *dst, const T *src, const QColorVector *buffer, const qsizetype len,
797 const QColorTransformPrivate *d_ptr)
798{
799 const __m128 v4080 = _mm_set1_ps(w: 4080.f);
800 const __m128 iFF00 = _mm_set1_ps(w: 1.0f / (255 * 256));
801 constexpr bool isARGB = isArgb<T>();
802 for (qsizetype i = 0; i < len; ++i) {
803 const int a = getAlpha<T>(src[i]);
804 __m128 vf = _mm_loadu_ps(p: &buffer[i].x);
805 __m128i v = _mm_cvtps_epi32(a: _mm_mul_ps(a: vf, b: v4080));
806 __m128 va = _mm_mul_ps(a: _mm_set1_ps(w: a), b: iFF00);
807 const int ridx = _mm_extract_epi16(v, 0);
808 const int gidx = _mm_extract_epi16(v, 2);
809 const int bidx = _mm_extract_epi16(v, 4);
810 v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[0]->m_fromLinear[ridx], isARGB ? 4 : 0);
811 v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[1]->m_fromLinear[gidx], 2);
812 v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[2]->m_fromLinear[bidx], isARGB ? 0 : 4);
813 vf = _mm_cvtepi32_ps(a: v);
814 vf = _mm_mul_ps(a: vf, b: va);
815 v = _mm_cvtps_epi32(a: vf);
816 storeP<T>(dst[i], v, a);
817 }
818}
819
820template<>
821void storePremultiplied<QRgbaFloat32>(QRgbaFloat32 *dst, const QRgbaFloat32 *src,
822 const QColorVector *buffer, const qsizetype len,
823 const QColorTransformPrivate *d_ptr)
824{
825 const __m128 v4080 = _mm_set1_ps(w: 4080.f);
826 const __m128 vZero = _mm_set1_ps(w: 0.0f);
827 const __m128 vOne = _mm_set1_ps(w: 1.0f);
828 const __m128 viFF00 = _mm_set1_ps(w: 1.0f / (255 * 256));
829 for (qsizetype i = 0; i < len; ++i) {
830 const float a = src[i].a;
831 __m128 va = _mm_set1_ps(w: a);
832 __m128 vf = _mm_loadu_ps(p: &buffer[i].x);
833 const __m128 under = _mm_cmplt_ps(a: vf, b: vZero);
834 const __m128 over = _mm_cmpgt_ps(a: vf, b: vOne);
835 if (_mm_movemask_ps(a: _mm_or_ps(a: under, b: over)) == 0) {
836 // Within gamut
837 va = _mm_mul_ps(a: va, b: viFF00);
838 __m128i v = _mm_cvtps_epi32(a: _mm_mul_ps(a: vf, b: v4080));
839 const int ridx = _mm_extract_epi16(v, 0);
840 const int gidx = _mm_extract_epi16(v, 2);
841 const int bidx = _mm_extract_epi16(v, 4);
842 v = _mm_setzero_si128();
843 v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[0]->m_fromLinear[ridx], 0);
844 v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[1]->m_fromLinear[gidx], 2);
845 v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[2]->m_fromLinear[bidx], 4);
846 vf = _mm_mul_ps(a: _mm_cvtepi32_ps(a: v), b: va);
847 _mm_store_ps(p: &dst[i].r, a: vf);
848 } else {
849 dst[i].r = d_ptr->colorSpaceOut->trc[0].applyInverseExtended(x: buffer[i].x);
850 dst[i].g = d_ptr->colorSpaceOut->trc[1].applyInverseExtended(x: buffer[i].y);
851 dst[i].b = d_ptr->colorSpaceOut->trc[2].applyInverseExtended(x: buffer[i].z);
852 vf = _mm_mul_ps(a: _mm_load_ps(p: &dst[i].r), b: va);
853 _mm_store_ps(p: &dst[i].r, a: vf);
854 }
855 dst[i].a = a;
856 }
857}
858
859template<typename T>
860static inline void storePU(T &p, __m128i &v, int a);
861template<>
862inline void storePU<QRgb>(QRgb &p, __m128i &v, int a)
863{
864 v = _mm_add_epi16(a: v, b: _mm_set1_epi16(w: 0x80));
865 v = _mm_srli_epi16(a: v, count: 8);
866 v = _mm_insert_epi16(v, a, 3);
867 p = _mm_cvtsi128_si32(a: _mm_packus_epi16(a: v, b: v));
868}
869template<>
870inline void storePU<QRgba64>(QRgba64 &p, __m128i &v, int a)
871{
872 v = _mm_add_epi16(a: v, b: _mm_srli_epi16(a: v, count: 8));
873 v = _mm_insert_epi16(v, a, 3);
874 _mm_storel_epi64(p: (__m128i *)&p, a: v);
875}
876
877template<typename T>
878static void storeUnpremultiplied(T *dst, const T *src, const QColorVector *buffer, const qsizetype len,
879 const QColorTransformPrivate *d_ptr)
880{
881 const __m128 v4080 = _mm_set1_ps(w: 4080.f);
882 constexpr bool isARGB = isArgb<T>();
883 for (qsizetype i = 0; i < len; ++i) {
884 const int a = getAlpha<T>(src[i]);
885 __m128 vf = _mm_loadu_ps(p: &buffer[i].x);
886 __m128i v = _mm_cvtps_epi32(a: _mm_mul_ps(a: vf, b: v4080));
887 const int ridx = _mm_extract_epi16(v, 0);
888 const int gidx = _mm_extract_epi16(v, 2);
889 const int bidx = _mm_extract_epi16(v, 4);
890 v = _mm_setzero_si128();
891 v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[0]->m_fromLinear[ridx], isARGB ? 2 : 0);
892 v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[1]->m_fromLinear[gidx], 1);
893 v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[2]->m_fromLinear[bidx], isARGB ? 0 : 2);
894 storePU<T>(dst[i], v, a);
895 }
896}
897
898template<>
899void storeUnpremultiplied<QRgbaFloat32>(QRgbaFloat32 *dst, const QRgbaFloat32 *src,
900 const QColorVector *buffer, const qsizetype len,
901 const QColorTransformPrivate *d_ptr)
902{
903 const __m128 v4080 = _mm_set1_ps(w: 4080.f);
904 const __m128 vZero = _mm_set1_ps(w: 0.0f);
905 const __m128 vOne = _mm_set1_ps(w: 1.0f);
906 const __m128 viFF00 = _mm_set1_ps(w: 1.0f / (255 * 256));
907 for (qsizetype i = 0; i < len; ++i) {
908 const float a = src[i].a;
909 __m128 vf = _mm_loadu_ps(p: &buffer[i].x);
910 const __m128 under = _mm_cmplt_ps(a: vf, b: vZero);
911 const __m128 over = _mm_cmpgt_ps(a: vf, b: vOne);
912 if (_mm_movemask_ps(a: _mm_or_ps(a: under, b: over)) == 0) {
913 // Within gamut
914 __m128i v = _mm_cvtps_epi32(a: _mm_mul_ps(a: vf, b: v4080));
915 const int ridx = _mm_extract_epi16(v, 0);
916 const int gidx = _mm_extract_epi16(v, 2);
917 const int bidx = _mm_extract_epi16(v, 4);
918 v = _mm_setzero_si128();
919 v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[0]->m_fromLinear[ridx], 0);
920 v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[1]->m_fromLinear[gidx], 2);
921 v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[2]->m_fromLinear[bidx], 4);
922 vf = _mm_mul_ps(a: _mm_cvtepi32_ps(a: v), b: viFF00);
923 _mm_storeu_ps(p: &dst[i].r, a: vf);
924 } else {
925 dst[i].r = d_ptr->colorSpaceOut->trc[0].applyInverseExtended(x: buffer[i].x);
926 dst[i].g = d_ptr->colorSpaceOut->trc[1].applyInverseExtended(x: buffer[i].y);
927 dst[i].b = d_ptr->colorSpaceOut->trc[2].applyInverseExtended(x: buffer[i].z);
928 }
929 dst[i].a = a;
930 }
931}
932
933template<typename T>
934static void storeOpaque(T *dst, const T *src, const QColorVector *buffer, const qsizetype len,
935 const QColorTransformPrivate *d_ptr)
936{
937 Q_UNUSED(src);
938 const __m128 v4080 = _mm_set1_ps(w: 4080.f);
939 constexpr bool isARGB = isArgb<T>();
940 for (qsizetype i = 0; i < len; ++i) {
941 __m128 vf = _mm_loadu_ps(p: &buffer[i].x);
942 __m128i v = _mm_cvtps_epi32(a: _mm_mul_ps(a: vf, b: v4080));
943 const int ridx = _mm_extract_epi16(v, 0);
944 const int gidx = _mm_extract_epi16(v, 2);
945 const int bidx = _mm_extract_epi16(v, 4);
946 v = _mm_setzero_si128();
947 v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[0]->m_fromLinear[ridx], isARGB ? 2 : 0);
948 v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[1]->m_fromLinear[gidx], 1);
949 v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[2]->m_fromLinear[bidx], isARGB ? 0 : 2);
950 storePU<T>(dst[i], v, isARGB ? 255 : 0xffff);
951 }
952}
953
954template<>
955void storeOpaque<QRgbaFloat32>(QRgbaFloat32 *dst, const QRgbaFloat32 *src,
956 const QColorVector *buffer, const qsizetype len,
957 const QColorTransformPrivate *d_ptr)
958{
959 Q_UNUSED(src);
960 const __m128 v4080 = _mm_set1_ps(w: 4080.f);
961 const __m128 vZero = _mm_set1_ps(w: 0.0f);
962 const __m128 vOne = _mm_set1_ps(w: 1.0f);
963 const __m128 viFF00 = _mm_set1_ps(w: 1.0f / (255 * 256));
964 for (qsizetype i = 0; i < len; ++i) {
965 __m128 vf = _mm_loadu_ps(p: &buffer[i].x);
966 const __m128 under = _mm_cmplt_ps(a: vf, b: vZero);
967 const __m128 over = _mm_cmpgt_ps(a: vf, b: vOne);
968 if (_mm_movemask_ps(a: _mm_or_ps(a: under, b: over)) == 0) {
969 // Within gamut
970 __m128i v = _mm_cvtps_epi32(a: _mm_mul_ps(a: vf, b: v4080));
971 const int ridx = _mm_extract_epi16(v, 0);
972 const int gidx = _mm_extract_epi16(v, 2);
973 const int bidx = _mm_extract_epi16(v, 4);
974 v = _mm_setzero_si128();
975 v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[0]->m_fromLinear[ridx], 0);
976 v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[1]->m_fromLinear[gidx], 2);
977 v = _mm_insert_epi16(v, d_ptr->colorSpaceOut->lut[2]->m_fromLinear[bidx], 4);
978 vf = _mm_mul_ps(a: _mm_cvtepi32_ps(a: v), b: viFF00);
979 _mm_store_ps(p: &dst[i].r, a: vf);
980 } else {
981 dst[i].r = d_ptr->colorSpaceOut->trc[0].applyInverseExtended(x: buffer[i].x);
982 dst[i].g = d_ptr->colorSpaceOut->trc[1].applyInverseExtended(x: buffer[i].y);
983 dst[i].b = d_ptr->colorSpaceOut->trc[2].applyInverseExtended(x: buffer[i].z);
984 }
985 dst[i].a = 1.0f;
986 }
987}
988
989#elif defined(__ARM_NEON__)
990template<typename T>
991static inline void storeP(T &p, const uint16x4_t &v);
992template<>
993inline void storeP<QRgb>(QRgb &p, const uint16x4_t &v)
994{
995 p = vget_lane_u32(vreinterpret_u32_u8(vmovn_u16(vcombine_u16(v, v))), 0);
996}
997template<>
998inline void storeP<QRgba64>(QRgba64 &p, const uint16x4_t &v)
999{
1000 vst1_u16((uint16_t *)&p, v);
1001}
1002
1003template<typename T>
1004static void storePremultiplied(T *dst, const T *src, const QColorVector *buffer, const qsizetype len,
1005 const QColorTransformPrivate *d_ptr)
1006{
1007 const float iFF00 = 1.0f / (255 * 256);
1008 constexpr bool isARGB = isArgb<T>();
1009 for (qsizetype i = 0; i < len; ++i) {
1010 const int a = getAlpha<T>(src[i]);
1011 float32x4_t vf = vld1q_f32(&buffer[i].x);
1012 uint32x4_t v = vcvtq_u32_f32(vaddq_f32(vmulq_n_f32(vf, 4080.f), vdupq_n_f32(0.5f)));
1013 const int ridx = vgetq_lane_u32(v, 0);
1014 const int gidx = vgetq_lane_u32(v, 1);
1015 const int bidx = vgetq_lane_u32(v, 2);
1016 v = vsetq_lane_u32(d_ptr->colorSpaceOut->lut[0]->m_fromLinear[ridx], v, isARGB ? 2 : 0);
1017 v = vsetq_lane_u32(d_ptr->colorSpaceOut->lut[1]->m_fromLinear[gidx], v, 1);
1018 v = vsetq_lane_u32(d_ptr->colorSpaceOut->lut[2]->m_fromLinear[bidx], v, isARGB ? 0 : 2);
1019 vf = vcvtq_f32_u32(v);
1020 vf = vmulq_n_f32(vf, a * iFF00);
1021 vf = vaddq_f32(vf, vdupq_n_f32(0.5f));
1022 v = vcvtq_u32_f32(vf);
1023 uint16x4_t v16 = vmovn_u32(v);
1024 v16 = vset_lane_u16(a, v16, 3);
1025 storeP<T>(dst[i], v16);
1026 }
1027}
1028
1029template<typename T>
1030static inline void storePU(T &p, uint16x4_t &v, int a);
1031template<>
1032inline void storePU<QRgb>(QRgb &p, uint16x4_t &v, int a)
1033{
1034 v = vadd_u16(v, vdup_n_u16(0x80));
1035 v = vshr_n_u16(v, 8);
1036 v = vset_lane_u16(a, v, 3);
1037 p = vget_lane_u32(vreinterpret_u32_u8(vmovn_u16(vcombine_u16(v, v))), 0);
1038}
1039template<>
1040inline void storePU<QRgba64>(QRgba64 &p, uint16x4_t &v, int a)
1041{
1042 v = vadd_u16(v, vshr_n_u16(v, 8));
1043 v = vset_lane_u16(a, v, 3);
1044 vst1_u16((uint16_t *)&p, v);
1045}
1046
1047template<typename T>
1048static void storeUnpremultiplied(T *dst, const T *src, const QColorVector *buffer, const qsizetype len,
1049 const QColorTransformPrivate *d_ptr)
1050{
1051 constexpr bool isARGB = isArgb<T>();
1052 for (qsizetype i = 0; i < len; ++i) {
1053 const int a = getAlpha<T>(src[i]);
1054 float32x4_t vf = vld1q_f32(&buffer[i].x);
1055 uint16x4_t v = vmovn_u32(vcvtq_u32_f32(vaddq_f32(vmulq_n_f32(vf, 4080.f), vdupq_n_f32(0.5f))));
1056 const int ridx = vget_lane_u16(v, 0);
1057 const int gidx = vget_lane_u16(v, 1);
1058 const int bidx = vget_lane_u16(v, 2);
1059 v = vset_lane_u16(d_ptr->colorSpaceOut->lut[0]->m_fromLinear[ridx], v, isARGB ? 2 : 0);
1060 v = vset_lane_u16(d_ptr->colorSpaceOut->lut[1]->m_fromLinear[gidx], v, 1);
1061 v = vset_lane_u16(d_ptr->colorSpaceOut->lut[2]->m_fromLinear[bidx], v, isARGB ? 0 : 2);
1062 storePU<T>(dst[i], v, a);
1063 }
1064}
1065
1066template<typename T>
1067static void storeOpaque(T *dst, const T *src, const QColorVector *buffer, const qsizetype len,
1068 const QColorTransformPrivate *d_ptr)
1069{
1070 Q_UNUSED(src);
1071 constexpr bool isARGB = isArgb<T>();
1072 for (qsizetype i = 0; i < len; ++i) {
1073 float32x4_t vf = vld1q_f32(&buffer[i].x);
1074 uint16x4_t v = vmovn_u32(vcvtq_u32_f32(vaddq_f32(vmulq_n_f32(vf, 4080.f), vdupq_n_f32(0.5f))));
1075 const int ridx = vget_lane_u16(v, 0);
1076 const int gidx = vget_lane_u16(v, 1);
1077 const int bidx = vget_lane_u16(v, 2);
1078 v = vset_lane_u16(d_ptr->colorSpaceOut->lut[0]->m_fromLinear[ridx], v, isARGB ? 2 : 0);
1079 v = vset_lane_u16(d_ptr->colorSpaceOut->lut[1]->m_fromLinear[gidx], v, 1);
1080 v = vset_lane_u16(d_ptr->colorSpaceOut->lut[2]->m_fromLinear[bidx], v, isARGB ? 0 : 2);
1081 storePU<T>(dst[i], v, isARGB ? 255 : 0xffff);
1082 }
1083}
1084#else
1085static void storePremultiplied(QRgb *dst, const QRgb *src, const QColorVector *buffer, const qsizetype len,
1086 const QColorTransformPrivate *d_ptr)
1087{
1088 for (qsizetype i = 0; i < len; ++i) {
1089 const int a = qAlpha(src[i]);
1090 const float fa = a / (255.0f * 256.0f);
1091 const float r = d_ptr->colorSpaceOut->lut[0]->m_fromLinear[int(buffer[i].x * 4080.0f + 0.5f)];
1092 const float g = d_ptr->colorSpaceOut->lut[1]->m_fromLinear[int(buffer[i].y * 4080.0f + 0.5f)];
1093 const float b = d_ptr->colorSpaceOut->lut[2]->m_fromLinear[int(buffer[i].z * 4080.0f + 0.5f)];
1094 dst[i] = qRgba(r * fa + 0.5f, g * fa + 0.5f, b * fa + 0.5f, a);
1095 }
1096}
1097
1098static void storeUnpremultiplied(QRgb *dst, const QRgb *src, const QColorVector *buffer, const qsizetype len,
1099 const QColorTransformPrivate *d_ptr)
1100{
1101 for (qsizetype i = 0; i < len; ++i) {
1102 const int r = d_ptr->colorSpaceOut->lut[0]->u8FromLinearF32(buffer[i].x);
1103 const int g = d_ptr->colorSpaceOut->lut[1]->u8FromLinearF32(buffer[i].y);
1104 const int b = d_ptr->colorSpaceOut->lut[2]->u8FromLinearF32(buffer[i].z);
1105 dst[i] = (src[i] & 0xff000000) | (r << 16) | (g << 8) | (b << 0);
1106 }
1107}
1108
1109static void storeOpaque(QRgb *dst, const QRgb *src, const QColorVector *buffer, const qsizetype len,
1110 const QColorTransformPrivate *d_ptr)
1111{
1112 Q_UNUSED(src);
1113 for (qsizetype i = 0; i < len; ++i) {
1114 const int r = d_ptr->colorSpaceOut->lut[0]->u8FromLinearF32(buffer[i].x);
1115 const int g = d_ptr->colorSpaceOut->lut[1]->u8FromLinearF32(buffer[i].y);
1116 const int b = d_ptr->colorSpaceOut->lut[2]->u8FromLinearF32(buffer[i].z);
1117 dst[i] = 0xff000000 | (r << 16) | (g << 8) | (b << 0);
1118 }
1119}
1120
1121static void storePremultiplied(QRgba64 *dst, const QRgba64 *src, const QColorVector *buffer, const qsizetype len,
1122 const QColorTransformPrivate *d_ptr)
1123{
1124 for (qsizetype i = 0; i < len; ++i) {
1125 const int a = src[i].alpha();
1126 const float fa = a / (255.0f * 256.0f);
1127 const float r = d_ptr->colorSpaceOut->lut[0]->m_fromLinear[int(buffer[i].x * 4080.0f + 0.5f)];
1128 const float g = d_ptr->colorSpaceOut->lut[1]->m_fromLinear[int(buffer[i].y * 4080.0f + 0.5f)];
1129 const float b = d_ptr->colorSpaceOut->lut[2]->m_fromLinear[int(buffer[i].z * 4080.0f + 0.5f)];
1130 dst[i] = qRgba64(r * fa + 0.5f, g * fa + 0.5f, b * fa + 0.5f, a);
1131 }
1132}
1133
1134static void storeUnpremultiplied(QRgba64 *dst, const QRgba64 *src, const QColorVector *buffer, const qsizetype len,
1135 const QColorTransformPrivate *d_ptr)
1136{
1137 for (qsizetype i = 0; i < len; ++i) {
1138 const int r = d_ptr->colorSpaceOut->lut[0]->u16FromLinearF32(buffer[i].x);
1139 const int g = d_ptr->colorSpaceOut->lut[1]->u16FromLinearF32(buffer[i].y);
1140 const int b = d_ptr->colorSpaceOut->lut[2]->u16FromLinearF32(buffer[i].z);
1141 dst[i] = qRgba64(r, g, b, src[i].alpha());
1142 }
1143}
1144
1145static void storeOpaque(QRgba64 *dst, const QRgba64 *src, const QColorVector *buffer, const qsizetype len,
1146 const QColorTransformPrivate *d_ptr)
1147{
1148 Q_UNUSED(src);
1149 for (qsizetype i = 0; i < len; ++i) {
1150 const int r = d_ptr->colorSpaceOut->lut[0]->u16FromLinearF32(buffer[i].x);
1151 const int g = d_ptr->colorSpaceOut->lut[1]->u16FromLinearF32(buffer[i].y);
1152 const int b = d_ptr->colorSpaceOut->lut[2]->u16FromLinearF32(buffer[i].z);
1153 dst[i] = qRgba64(r, g, b, 0xFFFF);
1154 }
1155}
1156#endif
1157#if !defined(__SSE2__)
1158static void storePremultiplied(QRgbaFloat32 *dst, const QRgbaFloat32 *src, const QColorVector *buffer,
1159 const qsizetype len, const QColorTransformPrivate *d_ptr)
1160{
1161 for (qsizetype i = 0; i < len; ++i) {
1162 const float a = src[i].a;
1163 dst[i].r = d_ptr->colorSpaceOut->trc[0].applyInverseExtended(buffer[i].x) * a;
1164 dst[i].g = d_ptr->colorSpaceOut->trc[1].applyInverseExtended(buffer[i].y) * a;
1165 dst[i].b = d_ptr->colorSpaceOut->trc[2].applyInverseExtended(buffer[i].z) * a;
1166 dst[i].a = a;
1167 }
1168}
1169
1170static void storeUnpremultiplied(QRgbaFloat32 *dst, const QRgbaFloat32 *src, const QColorVector *buffer,
1171 const qsizetype len, const QColorTransformPrivate *d_ptr)
1172{
1173 for (qsizetype i = 0; i < len; ++i) {
1174 const float a = src[i].a;
1175 dst[i].r = d_ptr->colorSpaceOut->trc[0].applyInverseExtended(buffer[i].x);
1176 dst[i].g = d_ptr->colorSpaceOut->trc[1].applyInverseExtended(buffer[i].y);
1177 dst[i].b = d_ptr->colorSpaceOut->trc[2].applyInverseExtended(buffer[i].z);
1178 dst[i].a = a;
1179 }
1180}
1181
1182static void storeOpaque(QRgbaFloat32 *dst, const QRgbaFloat32 *src, const QColorVector *buffer, const qsizetype len,
1183 const QColorTransformPrivate *d_ptr)
1184{
1185 Q_UNUSED(src);
1186 for (qsizetype i = 0; i < len; ++i) {
1187 dst[i].r = d_ptr->colorSpaceOut->trc[0].applyInverseExtended(buffer[i].x);
1188 dst[i].g = d_ptr->colorSpaceOut->trc[1].applyInverseExtended(buffer[i].y);
1189 dst[i].b = d_ptr->colorSpaceOut->trc[2].applyInverseExtended(buffer[i].z);
1190 dst[i].a = 1.0f;
1191 }
1192}
1193#endif
1194static void storeGray(quint8 *dst, const QRgb *src, const QColorVector *buffer, const qsizetype len,
1195 const QColorTransformPrivate *d_ptr)
1196{
1197 Q_UNUSED(src);
1198 for (qsizetype i = 0; i < len; ++i)
1199 dst[i] = d_ptr->colorSpaceOut->lut[1]->u8FromLinearF32(f: buffer[i].y);
1200}
1201
1202static void storeGray(quint16 *dst, const QRgba64 *src, const QColorVector *buffer, const qsizetype len,
1203 const QColorTransformPrivate *d_ptr)
1204{
1205 Q_UNUSED(src);
1206 for (qsizetype i = 0; i < len; ++i)
1207 dst[i] = d_ptr->colorSpaceOut->lut[1]->u16FromLinearF32(f: buffer[i].y);
1208}
1209
1210static constexpr qsizetype WorkBlockSize = 256;
1211
// Raw storage with the size and alignment of Count objects of type T. No T is
// ever constructed or destroyed by this class; it merely converts to a T *
// pointing at the start of the storage.
template <typename T, int Count = 1>
class QUninitialized
{
    alignas(T) char data[sizeof(T) * Count];

public:
    // The storage is the only member, so it starts at the object's own address.
    operator T *() { return reinterpret_cast<T *>(data); }
};
1220
1221template<typename T>
1222void QColorTransformPrivate::apply(T *dst, const T *src, qsizetype count, TransformFlags flags) const
1223{
1224 if (!colorMatrix.isValid())
1225 return;
1226
1227 updateLutsIn();
1228 updateLutsOut();
1229
1230 bool doApplyMatrix = !colorMatrix.isIdentity();
1231 constexpr bool DoClip = !std::is_same_v<T, QRgbaFloat16> && !std::is_same_v<T, QRgbaFloat32>;
1232
1233 QUninitialized<QColorVector, WorkBlockSize> buffer;
1234
1235 qsizetype i = 0;
1236 while (i < count) {
1237 const qsizetype len = qMin(a: count - i, b: WorkBlockSize);
1238 if (flags & InputPremultiplied)
1239 loadPremultiplied(buffer, src + i, len, this);
1240 else
1241 loadUnpremultiplied(buffer, src + i, len, this);
1242
1243 if (doApplyMatrix)
1244 applyMatrix<DoClip>(buffer, len, colorMatrix);
1245
1246 if (flags & InputOpaque)
1247 storeOpaque(dst + i, src + i, buffer, len, this);
1248 else if (flags & OutputPremultiplied)
1249 storePremultiplied(dst + i, src + i, buffer, len, this);
1250 else
1251 storeUnpremultiplied(dst + i, src + i, buffer, len, this);
1252
1253 i += len;
1254 }
1255}
1256
1257template<typename D, typename S>
1258void QColorTransformPrivate::applyReturnGray(D *dst, const S *src, qsizetype count, TransformFlags flags) const
1259{
1260 if (!colorMatrix.isValid())
1261 return;
1262
1263 updateLutsIn();
1264 updateLutsOut();
1265
1266 QUninitialized<QColorVector, WorkBlockSize> buffer;
1267
1268 qsizetype i = 0;
1269 while (i < count) {
1270 const qsizetype len = qMin(a: count - i, b: WorkBlockSize);
1271 if (flags & InputPremultiplied)
1272 loadPremultiplied(buffer, src + i, len, this);
1273 else
1274 loadUnpremultiplied(buffer, src + i, len, this);
1275
1276 applyMatrix(buffer, len, colorMatrix);
1277
1278 storeGray(dst + i, src + i, buffer, len, this);
1279
1280 i += len;
1281 }
1282}
1283
1284/*!
1285 \internal
1286 \enum QColorTransformPrivate::TransformFlag
1287
1288 Defines how the transform is to be applied.
1289
1290 \value Unpremultiplied The input and output should both be unpremultiplied.
1291 \value InputOpaque The input is guaranteed to be opaque.
1292 \value InputPremultiplied The input is premultiplied.
1293 \value OutputPremultiplied The output should be premultiplied.
1294 \value Premultiplied Both input and output should be premultiplied.
1295*/
1296
1297/*!
1298 \internal
1299 Prepares a color transformation for fast application. You do not need to
1300 call this explicitly as it will be called implicitly on the first transforms, but
1301 if you want predictable performance on the first transforms, you can perform it
1302 in advance.
1303
1304 \sa QColorTransform::map(), apply()
1305*/
1306void QColorTransformPrivate::prepare()
1307{
1308 updateLutsIn();
1309 updateLutsOut();
1310}
1311
1312/*!
1313 \internal
1314 Applies the color transformation on \a count QRgb pixels starting from
1315 \a src and stores the result in \a dst.
1316
1317 Thread-safe if prepare() has been called first.
1318
1319 Assumes unpremultiplied data by default. Set \a flags to change defaults.
1320
1321 \sa prepare()
1322*/
1323void QColorTransformPrivate::apply(QRgb *dst, const QRgb *src, qsizetype count, TransformFlags flags) const
1324{
1325 apply<QRgb>(dst, src, count, flags);
1326}
1327
1328/*!
1329 \internal
1330 Applies the color transformation on \a count QRgba64 pixels starting from
1331 \a src and stores the result in \a dst.
1332
1333 Thread-safe if prepare() has been called first.
1334
1335 Assumes unpremultiplied data by default. Set \a flags to change defaults.
1336
1337 \sa prepare()
1338*/
1339void QColorTransformPrivate::apply(QRgba64 *dst, const QRgba64 *src, qsizetype count, TransformFlags flags) const
1340{
1341 apply<QRgba64>(dst, src, count, flags);
1342}
1343
1344/*!
1345 \internal
1346 Applies the color transformation on \a count QRgbaFloat32 pixels starting from
1347 \a src and stores the result in \a dst.
1348
1349 Thread-safe if prepare() has been called first.
1350
1351 Assumes unpremultiplied data by default. Set \a flags to change defaults.
1352
1353 \sa prepare()
1354*/
1355void QColorTransformPrivate::apply(QRgbaFloat32 *dst, const QRgbaFloat32 *src, qsizetype count,
1356 TransformFlags flags) const
1357{
1358 apply<QRgbaFloat32>(dst, src, count, flags);
1359}
1360
1361/*!
1362 \internal
1363 Is to be called on a color-transform to XYZ, returns only luminance values.
1364
1365*/
1366void QColorTransformPrivate::apply(quint8 *dst, const QRgb *src, qsizetype count, TransformFlags flags) const
1367{
1368 applyReturnGray<quint8, QRgb>(dst, src, count, flags);
1369}
1370
1371/*!
1372 \internal
1373 Is to be called on a color-transform to XYZ, returns only luminance values.
1374
1375*/
1376void QColorTransformPrivate::apply(quint16 *dst, const QRgba64 *src, qsizetype count, TransformFlags flags) const
1377{
1378 applyReturnGray<quint16, QRgba64>(dst, src, count, flags);
1379}
1380
1381
1382/*!
1383 \internal
1384*/
1385bool QColorTransformPrivate::isIdentity() const
1386{
1387 if (!colorMatrix.isIdentity())
1388 return false;
1389 if (colorSpaceIn && colorSpaceOut) {
1390 if (colorSpaceIn->transferFunction != colorSpaceOut->transferFunction)
1391 return false;
1392 if (colorSpaceIn->transferFunction == QColorSpace::TransferFunction::Custom) {
1393 return colorSpaceIn->trc[0] == colorSpaceOut->trc[0]
1394 && colorSpaceIn->trc[1] == colorSpaceOut->trc[1]
1395 && colorSpaceIn->trc[2] == colorSpaceOut->trc[2];
1396 }
1397 } else {
1398 if (colorSpaceIn && colorSpaceIn->transferFunction != QColorSpace::TransferFunction::Linear)
1399 return false;
1400 if (colorSpaceOut && colorSpaceOut->transferFunction != QColorSpace::TransferFunction::Linear)
1401 return false;
1402 }
1403 return true;
1404}
1405
1406QT_END_NAMESPACE
1407

source code of qtbase/src/gui/painting/qcolortransform.cpp