// Copyright (C) 2016 Paul Lemire <paul.lemire350@gmail.com>
// SPDX-License-Identifier: LicenseRef-Qt-Commercial OR LGPL-3.0-only OR GPL-2.0-only OR GPL-3.0-only
3
4#ifndef QT3DCORE_MATRIX4X4_SSE_P_H
5#define QT3DCORE_MATRIX4X4_SSE_P_H
6
//
//  W A R N I N G
//  -------------
//
// This file is not part of the Qt3D API.  It exists purely as an
// implementation detail.  This header file may change from version to
// version without notice, or even be removed.
//
// We mean it.
//
17
18#include <Qt3DCore/private/vector4d_p.h>
19#include <Qt3DCore/private/vector3d_p.h>
20#include <private/qsimd_p.h>
21#include <QMatrix4x4>
22
23#if defined(__AVX2__)
24#include "matrix4x4_avx2_p.h"
25#elif defined(__SSE2__)
26
27QT_BEGIN_NAMESPACE
28
29namespace Qt3DCore {
30
31class Matrix4x4_SSE
32{
33public:
34
35 Q_ALWAYS_INLINE Matrix4x4_SSE() { setToIdentity(); }
36 explicit Q_ALWAYS_INLINE Matrix4x4_SSE(Qt::Initialization) {}
37
38 // QMatrix4x4::constData returns in column major order
39 explicit Q_ALWAYS_INLINE Matrix4x4_SSE(const QMatrix4x4 &mat)
40 {
41 // data may not be properly aligned, using unaligned loads
42 const float *data = mat.constData();
43 m_col1 = _mm_loadu_ps(p: data);
44 m_col2 = _mm_loadu_ps(p: data + 4);
45 m_col3 = _mm_loadu_ps(p: data + 8);
46 m_col4 = _mm_loadu_ps(p: data + 12);
47 }
48
49 // Assumes data is 16 bytes aligned (and in column major order)
50 explicit Q_ALWAYS_INLINE Matrix4x4_SSE(float *data)
51 {
52 m_col1 = _mm_load_ps(p: data);
53 m_col2 = _mm_load_ps(p: data + 4);
54 m_col3 = _mm_load_ps(p: data + 8);
55 m_col4 = _mm_load_ps(p: data + 12);
56 }
57
58 // In (row major) but we store in column major order
59 explicit Q_ALWAYS_INLINE Matrix4x4_SSE(float m11, float m12, float m13, float m14,
60 float m21, float m22, float m23, float m24,
61 float m31, float m32, float m33, float m34,
62 float m41, float m42, float m43, float m44)
63 {
64 m_col1 = _mm_set_ps(z: m41, y: m31, x: m21, w: m11);
65 m_col2 = _mm_set_ps(z: m42, y: m32, x: m22, w: m12);
66 m_col3 = _mm_set_ps(z: m43, y: m33, x: m23, w: m13);
67 m_col4 = _mm_set_ps(z: m44, y: m34, x: m24, w: m14);
68 }
69
70 Q_ALWAYS_INLINE void setToIdentity()
71 {
72 m_col1 = _mm_set_ss(w: 1.0f);
73 m_col2 = _mm_set_ps(z: 0.0f, y: 0.0f, x: 1.0f, w: 0.0f);
74 m_col3 = _mm_set_ps(z: 0.0f, y: 1.0f, x: 0.0f, w: 0.0f);
75 m_col4 = _mm_set_ps(z: 1.0f, y: 0.0f, x: 0.0f, w: 0.0f);
76 }
77
78 Q_ALWAYS_INLINE Matrix4x4_SSE operator*(const Matrix4x4_SSE &other) const
79 {
80 Matrix4x4_SSE c(Qt::Uninitialized);
81
82 const __m128 c1 = m_col1;
83 const __m128 c2 = m_col2;
84 const __m128 c3 = m_col3;
85 const __m128 c4 = m_col4;
86
87 // c11, c21, c31, c41
88 // 1) (m11 x n11), (m11 x n21), (m11 x n31), (m11 x n41)
89 // 2) (m11 x n11) + (m21 x n12), (m11 x n21) + (m21 x n22), (m11 x n31) + (m21 x n32), (m11 x n41) + (m21 x n42)
90 // 3) (m11 x n11) + (m21 x n21) + (m31 x n13), (m11 x n21) + (m21 x n22) + (m31 x n 23), (m11 x n31) + (m21 x n32) + (m31 x n33), (m11 x n41) + (m21 x n42) (m31 x n43)
91 // 4) (m11 x n11) + (m21 x n21) + (m31 x n13) + (m41 x n14), (m11 x n21) + (m21 x n22) + (m31 x n 23) + (m41 x n24), (m11 x n31) + (m21 x n32) + (m31 x n33) + (m41 x n34), (m11 x n41) + (m21 x n42) (m31 x n43) + (m41 x n44)
92 __m128 tmp = _mm_mul_ps(a: _mm_set1_ps(w: other.m11()), b: c1);
93 tmp = _mm_add_ps(a: _mm_mul_ps(a: _mm_set1_ps(w: other.m21()), b: c2), b: tmp);
94 tmp = _mm_add_ps(a: _mm_mul_ps(a: _mm_set1_ps(w: other.m31()), b: c3), b: tmp);
95 c.m_col1 = _mm_add_ps(a: _mm_mul_ps(a: _mm_set1_ps(w: other.m41()), b: c4), b: tmp);
96
97 // c21, c22, c23, c24
98 tmp = _mm_mul_ps(a: _mm_set1_ps(w: other.m12()), b: c1);
99 tmp = _mm_add_ps(a: _mm_mul_ps(a: _mm_set1_ps(w: other.m22()), b: c2), b: tmp);
100 tmp = _mm_add_ps(a: _mm_mul_ps(a: _mm_set1_ps(w: other.m32()), b: c3), b: tmp);
101 c.m_col2 = _mm_add_ps(a: _mm_mul_ps(a: _mm_set1_ps(w: other.m42()), b: c4), b: tmp);
102
103 // c31, c32, c33, c34
104 tmp = _mm_mul_ps(a: _mm_set1_ps(w: other.m13()), b: c1);
105 tmp = _mm_add_ps(a: _mm_mul_ps(a: _mm_set1_ps(w: other.m23()), b: c2), b: tmp);
106 tmp = _mm_add_ps(a: _mm_mul_ps(a: _mm_set1_ps(w: other.m33()), b: c3), b: tmp);
107 c.m_col3 = _mm_add_ps(a: _mm_mul_ps(a: _mm_set1_ps(w: other.m43()), b: c4), b: tmp);
108
109 // c41, c42, c43, c44
110 tmp = _mm_mul_ps(a: _mm_set1_ps(w: other.m14()), b: c1);
111 tmp = _mm_add_ps(a: _mm_mul_ps(a: _mm_set1_ps(w: other.m24()), b: c2), b: tmp);
112 tmp = _mm_add_ps(a: _mm_mul_ps(a: _mm_set1_ps(w: other.m34()), b: c3), b: tmp);
113 c.m_col4 = _mm_add_ps(a: _mm_mul_ps(a: _mm_set1_ps(w: other.m44()), b: c4), b: tmp);
114
115 return c;
116 }
117
118 Q_ALWAYS_INLINE Matrix4x4_SSE operator-(const Matrix4x4_SSE &other) const
119 {
120 Matrix4x4_SSE c(Qt::Uninitialized);
121
122 c.m_col1 = _mm_sub_ps(a: m_col1, b: other.m_col1);
123 c.m_col2 = _mm_sub_ps(a: m_col2, b: other.m_col2);
124 c.m_col3 = _mm_sub_ps(a: m_col3, b: other.m_col3);
125 c.m_col4 = _mm_sub_ps(a: m_col4, b: other.m_col4);
126
127 return c;
128 }
129
130 Q_ALWAYS_INLINE Matrix4x4_SSE operator+(const Matrix4x4_SSE &other) const
131 {
132 Matrix4x4_SSE c(Qt::Uninitialized);
133
134 c.m_col1 = _mm_add_ps(a: m_col1, b: other.m_col1);
135 c.m_col2 = _mm_add_ps(a: m_col2, b: other.m_col2);
136 c.m_col3 = _mm_add_ps(a: m_col3, b: other.m_col3);
137 c.m_col4 = _mm_add_ps(a: m_col4, b: other.m_col4);
138
139 return c;
140 }
141
142 Q_ALWAYS_INLINE Matrix4x4_SSE &operator*=(const Matrix4x4_SSE &other)
143 {
144 *this = *this * other;
145 return *this;
146 }
147
148 Q_ALWAYS_INLINE Matrix4x4_SSE &operator-=(const Matrix4x4_SSE &other)
149 {
150 *this = *this - other;
151 return *this;
152 }
153
154 Q_ALWAYS_INLINE Matrix4x4_SSE &operator+=(const Matrix4x4_SSE &other)
155 {
156 *this = *this + other;
157 return *this;
158 }
159
160 Q_ALWAYS_INLINE Matrix4x4_SSE transposed() const
161 {
162 Matrix4x4_SSE c(Qt::Uninitialized);
163
164 // ~113 instructions
165 // 0b11011101 == 0xdd
166 // 0b10001000 == 0x88
167 const __m128 tmp1 = _mm_shuffle_ps(m_col1, m_col2, 0xdd);
168 const __m128 tmp2 = _mm_shuffle_ps(m_col1, m_col2, 0x88);
169 const __m128 tmp3 = _mm_shuffle_ps(m_col3, m_col4, 0xdd);
170 const __m128 tmp4 = _mm_shuffle_ps(m_col3, m_col4, 0x88);
171 c.m_col1 = _mm_shuffle_ps(tmp2, tmp4, 0x88);
172 c.m_col2 = _mm_shuffle_ps(tmp1, tmp3, 0x88);
173 c.m_col3 = _mm_shuffle_ps(tmp2, tmp4, 0xdd);
174 c.m_col4 = _mm_shuffle_ps(tmp1, tmp3, 0xdd);
175
176 return c;
177 }
178
179 Q_ALWAYS_INLINE Matrix4x4_SSE inverted() const
180 {
181 // TO DO: Optimize
182 const QMatrix4x4 mat = toQMatrix4x4();
183 return Matrix4x4_SSE(mat.inverted());
184 }
185
186 Q_ALWAYS_INLINE bool operator==(const Matrix4x4_SSE &other) const
187 {
188 // 0b1111 == 0xf
189 return (_mm_movemask_ps(a: _mm_cmpeq_ps(a: m_col1, b: other.m_col1)) == 0xf &&
190 _mm_movemask_ps(a: _mm_cmpeq_ps(a: m_col2, b: other.m_col2)) == 0xf &&
191 _mm_movemask_ps(a: _mm_cmpeq_ps(a: m_col3, b: other.m_col3)) == 0xf &&
192 _mm_movemask_ps(a: _mm_cmpeq_ps(a: m_col4, b: other.m_col4)) == 0xf);
193 }
194
195 Q_ALWAYS_INLINE bool operator!=(const Matrix4x4_SSE &other) const
196 {
197 return !(*this == other);
198 }
199
200 Q_ALWAYS_INLINE float m11() const { return _mm_cvtss_f32(a: m_col1); }
201 Q_ALWAYS_INLINE float m12() const { return _mm_cvtss_f32(a: m_col2); }
202 Q_ALWAYS_INLINE float m13() const { return _mm_cvtss_f32(a: m_col3); }
203 Q_ALWAYS_INLINE float m14() const { return _mm_cvtss_f32(a: m_col4); }
204
205 Q_ALWAYS_INLINE float m21() const
206 {
207 // 0b01010101 = 0x55
208 return _mm_cvtss_f32(_mm_shuffle_ps(m_col1, m_col1, 0x55));
209 }
210 Q_ALWAYS_INLINE float m22() const
211 {
212 // 0b01010101 = 0x55
213 return _mm_cvtss_f32(_mm_shuffle_ps(m_col2, m_col2, 0x55));
214 }
215 Q_ALWAYS_INLINE float m23() const
216 {
217 // 0b01010101 = 0x55
218 return _mm_cvtss_f32(_mm_shuffle_ps(m_col3, m_col3, 0x55));
219 }
220 Q_ALWAYS_INLINE float m24() const
221 {
222 // 0b01010101 = 0x55
223 return _mm_cvtss_f32(_mm_shuffle_ps(m_col4, m_col4, 0x55));
224 }
225
226 Q_ALWAYS_INLINE float m31() const
227 {
228 // 0b10101010 = 0xaa
229 return _mm_cvtss_f32(_mm_shuffle_ps(m_col1, m_col1, 0xaa));
230 }
231 Q_ALWAYS_INLINE float m32() const
232 {
233 // 0b10101010 = 0xaa
234 return _mm_cvtss_f32(_mm_shuffle_ps(m_col2, m_col2, 0xaa));
235 }
236 Q_ALWAYS_INLINE float m33() const
237 {
238 // 0b10101010 = 0xaa
239 return _mm_cvtss_f32(_mm_shuffle_ps(m_col3, m_col3, 0xaa));
240 }
241 Q_ALWAYS_INLINE float m34() const
242 {
243 // 0b10101010 = 0xaa
244 return _mm_cvtss_f32(_mm_shuffle_ps(m_col4, m_col4, 0xaa));
245 }
246
247 Q_ALWAYS_INLINE float m41() const
248 {
249 // 0b11111111 = 0xff
250 return _mm_cvtss_f32(_mm_shuffle_ps(m_col1, m_col1, 0xff));
251 }
252 Q_ALWAYS_INLINE float m42() const
253 {
254 // 0b11111111 = 0xff
255 return _mm_cvtss_f32(_mm_shuffle_ps(m_col2, m_col2, 0xff));
256 }
257 Q_ALWAYS_INLINE float m43() const
258 {
259 // 0b11111111 = 0xff
260 return _mm_cvtss_f32(_mm_shuffle_ps(m_col3, m_col3, 0xff));
261 }
262 Q_ALWAYS_INLINE float m44() const
263 {
264 // 0b11111111 = 0xff
265 return _mm_cvtss_f32(_mm_shuffle_ps(m_col4, m_col4, 0xff));
266 }
267
268 Q_ALWAYS_INLINE Vector4D row(int index) const
269 {
270 switch (index) {
271 case 0:
272 return Vector4D(m11(), m12(), m13(), m14());
273 case 1:
274 return Vector4D(m21(), m22(), m23(), m24());
275 case 2:
276 return Vector4D(m31(), m32(), m33(), m34());
277 case 3:
278 return Vector4D(m41(), m42(), m43(), m44());
279 default:
280 Q_UNREACHABLE_RETURN(Vector4D());
281 }
282 }
283
284 Q_ALWAYS_INLINE Vector4D column(int index) const
285 {
286 Vector4D c(Qt::Uninitialized);
287 switch (index) {
288 case 0:
289 c.m_xyzw = m_col1;
290 break;
291 case 1:
292 c.m_xyzw = m_col2;
293 break;
294 case 2:
295 c.m_xyzw = m_col3;
296 break;
297 case 3:
298 c.m_xyzw = m_col4;
299 break;
300 default:
301 Q_UNREACHABLE_RETURN(Vector4D());
302 }
303 return c;
304 }
305
306 Q_ALWAYS_INLINE float operator()(int row, int column) const {
307 return this->row(index: row)[column];
308 }
309
310 Q_ALWAYS_INLINE QMatrix4x4 toQMatrix4x4() const { return QMatrix4x4(m11(), m12(), m13(), m14(),
311 m21(), m22(), m23(), m24(),
312 m31(), m32(), m33(), m34(),
313 m41(), m42(), m43(), m44()); }
314
315 Q_ALWAYS_INLINE Vector3D_SSE map(const Vector3D_SSE &point) const
316 {
317 return *this * point;
318 }
319
320 Q_ALWAYS_INLINE Vector4D_SSE map(const Vector4D_SSE &point) const
321 {
322 return *this * point;
323 }
324
325 Q_ALWAYS_INLINE Vector3D_SSE mapVector(const Vector3D_SSE &vector) const
326 {
327 const Vector3D_SSE row1(m11(), m12(), m13());
328 const Vector3D_SSE row2(m21(), m22(), m23());
329 const Vector3D_SSE row3(m31(), m32(), m33());
330
331 return Vector3D(Vector3D_SSE::dotProduct(a: row1, b: vector),
332 Vector3D_SSE::dotProduct(a: row2, b: vector),
333 Vector3D_SSE::dotProduct(a: row3, b: vector));
334 }
335
336 friend Q_ALWAYS_INLINE Vector4D operator*(const Vector4D &vector, const Matrix4x4_SSE &matrix);
337 friend Q_ALWAYS_INLINE Vector4D operator*(const Matrix4x4_SSE &matrix, const Vector4D &vector);
338
339 friend Q_ALWAYS_INLINE Vector3D operator*(const Vector3D &vector, const Matrix4x4_SSE &matrix);
340 friend Q_ALWAYS_INLINE Vector3D operator*(const Matrix4x4_SSE &matrix, const Vector3D &vector);
341
342 friend Q_3DCORE_PRIVATE_EXPORT QDebug operator<<(QDebug dbg, const Matrix4x4_SSE &m);
343
344private:
345 // Internally we will store the matrix as indicated below
346 // Q_DECL_ALIGN(16) // aligned on 16 bytes boundary for SSE (column major)
347 // struct
348 // {
349 // float m_m11, m_m21, m_m31, m_m41;
350 // float m_m12, m_m22, m_m32, m_m42;
351 // float m_m13, m_m23, m_m33, m_m43;
352 // float m_m14, m_m24, m_m34, m_m44;
353 // };
354 // struct
355 // {
356 // float m[16];
357 // };
358 __m128 m_col1;
359 __m128 m_col2;
360 __m128 m_col3;
361 __m128 m_col4;
362};
363
364Q_ALWAYS_INLINE Vector4D operator*(const Vector4D &vector, const Matrix4x4_SSE &matrix)
365{
366 const __m128 vCol1 = _mm_mul_ps(a: matrix.m_col1, b: vector.m_xyzw);
367 const __m128 vCol2 = _mm_mul_ps(a: matrix.m_col2, b: vector.m_xyzw);
368 const __m128 vCol3 = _mm_mul_ps(a: matrix.m_col3, b: vector.m_xyzw);
369 const __m128 vCol4 = _mm_mul_ps(a: matrix.m_col4, b: vector.m_xyzw);
370
371
372 // 0b01000100 == 0x44
373 // 0b11101110 == 0xee
374
375 // vCol1.x, vCol1.y, vCol2.x, vCol2.y
376 __m128 tmp1 = _mm_shuffle_ps(vCol1, vCol2, 0x44);
377 // vCol1.z, vCol1.w, vCol2.z, vCol2.w
378 __m128 tmp2 = _mm_shuffle_ps(vCol1, vCol2, 0xee);
379
380 // vCol1.x + vCol1.z, vCol1.y + vCol1.w, vCol2.x + vCol2.z, vCol2.y + vCol2.w,
381 const __m128 tmpSum01 = _mm_add_ps(a: tmp1, b: tmp2);
382
383 // vCol3.x, vCol3.y, vCol4.x, vCol4.y
384 tmp1 = _mm_shuffle_ps(vCol3, vCol4, 0x44);
385 // vCol3.z, vCol3.w, vCol4.z, vCol4.w
386 tmp2 = _mm_shuffle_ps(vCol3, vCol4, 0xee);
387
388 // vCol3.x + vCol3.z, vCol3.y + vCol3.w, vCol4.x + vCol4.z, vCol4.y + vCol4.w,
389 const __m128 tmpSum02 = _mm_add_ps(a: tmp1, b: tmp2);
390
391 // 0b10001000 == 0x88
392 // 0b11011101 == 0xdd
393
394 // vCol1.x + vCol1.z, vCol2.x + vCol2.z, vCol3.x + vCol3.z, vCol4.x + vCol4.z,
395 tmp1 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0x88);
396 // vCol1.y + vCol1.w, vCol2.y + vCol2.w, vCol3.y + vCol3.w, vCol4.y + vCol4.w,
397 tmp2 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0xdd);
398
399 Vector4D v(Qt::Uninitialized);
400 v.m_xyzw = _mm_add_ps(a: tmp1, b: tmp2);
401 return v;
402}
403
404Q_ALWAYS_INLINE Vector4D operator*(const Matrix4x4_SSE &matrix, const Vector4D &vector)
405{
406 const Matrix4x4_SSE transposed = matrix.transposed();
407 return vector * transposed;
408}
409
410Q_ALWAYS_INLINE Vector3D operator*(const Vector3D &vector, const Matrix4x4_SSE &matrix)
411{
412 const __m128 vec4 = _mm_set_ps(z: 1.0f, y: vector.z(), x: vector.y(), w: vector.x());
413
414 const __m128 vCol1 = _mm_mul_ps(a: matrix.m_col1, b: vec4);
415 const __m128 vCol2 = _mm_mul_ps(a: matrix.m_col2, b: vec4);
416 const __m128 vCol3 = _mm_mul_ps(a: matrix.m_col3, b: vec4);
417 const __m128 vCol4 = _mm_mul_ps(a: matrix.m_col4, b: vec4);
418
419 // 0b01000100 == 0x44
420 // 0b11101110 == 0xee
421
422 // vCol1.x, vCol1.y, vCol2.x, vCol2.y
423 __m128 tmp1 = _mm_shuffle_ps(vCol1, vCol2, 0x44);
424 // vCol1.z, vCol1.w, vCol2.z, vCol2.w
425 __m128 tmp2 = _mm_shuffle_ps(vCol1, vCol2, 0xee);
426
427 // vCol1.x + vCol1.z, vCol1.y + vCol1.w, vCol2.x + vCol2.z, vCol2.y + vCol2.w,
428 const __m128 tmpSum01 = _mm_add_ps(a: tmp1, b: tmp2);
429
430 // vCol3.x, vCol3.y, vCol4.x, vCol4.y
431 tmp1 = _mm_shuffle_ps(vCol3, vCol4, 0x44);
432 // vCol3.z, vCol3.w, vCol4.z, vCol4.w
433 tmp2 = _mm_shuffle_ps(vCol3, vCol4, 0xee);
434
435 // vCol3.x + vCol3.z, vCol3.y + vCol3.w, vCol4.x + vCol4.z, vCol4.y + vCol4.w,
436 const __m128 tmpSum02 = _mm_add_ps(a: tmp1, b: tmp2);
437
438 // 0b10001000 == 0x88
439 // 0b11011101 == 0xdd
440
441 // vCol1.x + vCol1.z, vCol2.x + vCol2.z, vCol3.x + vCol3.z, vCol4.x + vCol4.z,
442 tmp1 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0x88);
443 // vCol1.y + vCol1.w, vCol2.y + vCol2.w, vCol3.y + vCol3.w, vCol4.y + vCol4.w,
444 tmp2 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0xdd);
445
446 const __m128 result = _mm_add_ps(a: tmp1, b: tmp2);
447 // 0b11111111 = 0xff
448 const __m128 divisor = _mm_shuffle_ps(result, result, 0xff);
449 Vector3D v(Qt::Uninitialized);
450 v.m_xyzw = _mm_div_ps(a: result, b: divisor);
451 return v;
452}
453
454Q_ALWAYS_INLINE Vector3D operator*(const Matrix4x4_SSE &matrix, const Vector3D &vector)
455{
456 const Matrix4x4_SSE transposed = matrix.transposed();
457 return vector * transposed;
458}
459
460} // Qt3DCore
461
462
463Q_DECLARE_TYPEINFO(Qt3DCore::Matrix4x4_SSE, Q_PRIMITIVE_TYPE);
464
465QT_END_NAMESPACE
466
467Q_DECLARE_METATYPE(Qt3DCore::Matrix4x4_SSE)
468
469#endif // __SSE2__
470
471#endif // QT3DCORE_MATRIX4X4_SSE_P_H
472

// Source: qt3d/src/core/transforms/matrix4x4_sse_p.h