1/****************************************************************************
2**
3** Copyright (C) 2016 Paul Lemire <paul.lemire350@gmail.com>
4** Contact: https://www.qt.io/licensing/
5**
6** This file is part of the Qt3D module of the Qt Toolkit.
7**
8** $QT_BEGIN_LICENSE:LGPL$
9** Commercial License Usage
10** Licensees holding valid commercial Qt licenses may use this file in
11** accordance with the commercial license agreement provided with the
12** Software or, alternatively, in accordance with the terms contained in
13** a written agreement between you and The Qt Company. For licensing terms
14** and conditions see https://www.qt.io/terms-conditions. For further
15** information use the contact form at https://www.qt.io/contact-us.
16**
17** GNU Lesser General Public License Usage
18** Alternatively, this file may be used under the terms of the GNU Lesser
19** General Public License version 3 as published by the Free Software
20** Foundation and appearing in the file LICENSE.LGPL3 included in the
21** packaging of this file. Please review the following information to
22** ensure the GNU Lesser General Public License version 3 requirements
23** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
24**
25** GNU General Public License Usage
26** Alternatively, this file may be used under the terms of the GNU
27** General Public License version 2.0 or (at your option) the GNU General
28** Public license version 3 or any later version approved by the KDE Free
29** Qt Foundation. The licenses are as published by the Free Software
30** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
31** included in the packaging of this file. Please review the following
32** information to ensure the GNU General Public License requirements will
33** be met: https://www.gnu.org/licenses/gpl-2.0.html and
34** https://www.gnu.org/licenses/gpl-3.0.html.
35**
36** $QT_END_LICENSE$
37**
38****************************************************************************/
39
40#ifndef QT3DCORE_MATRIX4X4_SSE_P_H
41#define QT3DCORE_MATRIX4X4_SSE_P_H
42
43//
44// W A R N I N G
45// -------------
46//
47// This file is not part of the Qt3D API. It exists purely as an
48// implementation detail. This header file may change from version to
49// version without notice, or even be removed.
50//
51// We mean it.
52//
53
54#include <Qt3DCore/private/vector4d_p.h>
55#include <Qt3DCore/private/vector3d_p.h>
56#include <private/qsimd_p.h>
57#include <QMatrix4x4>
58
59#ifdef QT_COMPILER_SUPPORTS_SSE2
60
61QT_BEGIN_NAMESPACE
62
63namespace Qt3DCore {
64
65class Matrix4x4_SSE
66{
67public:
68
69 Q_ALWAYS_INLINE Matrix4x4_SSE() { setToIdentity(); }
70 explicit Q_ALWAYS_INLINE Matrix4x4_SSE(Qt::Initialization) {}
71
72 // QMatrix4x4::constData returns in column major order
73 explicit Q_ALWAYS_INLINE Matrix4x4_SSE(const QMatrix4x4 &mat)
74 {
75 // data may not be properly aligned, using unaligned loads
76 const float *data = mat.constData();
77 m_col1 = _mm_loadu_ps(data);
78 m_col2 = _mm_loadu_ps(data + 4);
79 m_col3 = _mm_loadu_ps(data + 8);
80 m_col4 = _mm_loadu_ps(data + 12);
81 }
82
83 // Assumes data is 16 bytes aligned (and in column major order)
84 explicit Q_ALWAYS_INLINE Matrix4x4_SSE(float *data)
85 {
86 m_col1 = _mm_load_ps(data);
87 m_col2 = _mm_load_ps(data + 4);
88 m_col3 = _mm_load_ps(data + 8);
89 m_col4 = _mm_load_ps(data + 12);
90 }
91
92 // In (row major) but we store in column major order
93 explicit Q_ALWAYS_INLINE Matrix4x4_SSE(float m11, float m12, float m13, float m14,
94 float m21, float m22, float m23, float m24,
95 float m31, float m32, float m33, float m34,
96 float m41, float m42, float m43, float m44)
97 {
98 m_col1 = _mm_set_ps(m41, m31, m21, m11);
99 m_col2 = _mm_set_ps(m42, m32, m22, m12);
100 m_col3 = _mm_set_ps(m43, m33, m23, m13);
101 m_col4 = _mm_set_ps(m44, m34, m24, m14);
102 }
103
104 Q_ALWAYS_INLINE void setToIdentity()
105 {
106 m_col1 = _mm_set_ss(1.0f);
107 m_col2 = _mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f);
108 m_col3 = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f);
109 m_col4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f);
110 }
111
112 Q_ALWAYS_INLINE Matrix4x4_SSE operator*(const Matrix4x4_SSE &other) const
113 {
114 Matrix4x4_SSE c(Qt::Uninitialized);
115
116 const __m128 c1 = m_col1;
117 const __m128 c2 = m_col2;
118 const __m128 c3 = m_col3;
119 const __m128 c4 = m_col4;
120
121 // c11, c21, c31, c41
122 // 1) (m11 x n11), (m11 x n21), (m11 x n31), (m11 x n41)
123 // 2) (m11 x n11) + (m21 x n12), (m11 x n21) + (m21 x n22), (m11 x n31) + (m21 x n32), (m11 x n41) + (m21 x n42)
124 // 3) (m11 x n11) + (m21 x n21) + (m31 x n13), (m11 x n21) + (m21 x n22) + (m31 x n 23), (m11 x n31) + (m21 x n32) + (m31 x n33), (m11 x n41) + (m21 x n42) (m31 x n43)
125 // 4) (m11 x n11) + (m21 x n21) + (m31 x n13) + (m41 x n14), (m11 x n21) + (m21 x n22) + (m31 x n 23) + (m41 x n24), (m11 x n31) + (m21 x n32) + (m31 x n33) + (m41 x n34), (m11 x n41) + (m21 x n42) (m31 x n43) + (m41 x n44)
126 __m128 tmp = _mm_mul_ps(_mm_set1_ps(other.m11()), c1);
127 tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m21()), c2), tmp);
128 tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m31()), c3), tmp);
129 c.m_col1 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m41()), c4), tmp);
130
131 // c21, c22, c23, c24
132 tmp = _mm_mul_ps(_mm_set1_ps(other.m12()), c1);
133 tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m22()), c2), tmp);
134 tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m32()), c3), tmp);
135 c.m_col2 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m42()), c4), tmp);
136
137 // c31, c32, c33, c34
138 tmp = _mm_mul_ps(_mm_set1_ps(other.m13()), c1);
139 tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m23()), c2), tmp);
140 tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m33()), c3), tmp);
141 c.m_col3 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m43()), c4), tmp);
142
143 // c41, c42, c43, c44
144 tmp = _mm_mul_ps(_mm_set1_ps(other.m14()), c1);
145 tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m24()), c2), tmp);
146 tmp = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m34()), c3), tmp);
147 c.m_col4 = _mm_add_ps(_mm_mul_ps(_mm_set1_ps(other.m44()), c4), tmp);
148
149 return c;
150 }
151
152 Q_ALWAYS_INLINE Matrix4x4_SSE operator-(const Matrix4x4_SSE &other) const
153 {
154 Matrix4x4_SSE c(Qt::Uninitialized);
155
156 c.m_col1 = _mm_sub_ps(m_col1, other.m_col1);
157 c.m_col2 = _mm_sub_ps(m_col2, other.m_col2);
158 c.m_col3 = _mm_sub_ps(m_col3, other.m_col3);
159 c.m_col4 = _mm_sub_ps(m_col4, other.m_col4);
160
161 return c;
162 }
163
164 Q_ALWAYS_INLINE Matrix4x4_SSE operator+(const Matrix4x4_SSE &other) const
165 {
166 Matrix4x4_SSE c(Qt::Uninitialized);
167
168 c.m_col1 = _mm_add_ps(m_col1, other.m_col1);
169 c.m_col2 = _mm_add_ps(m_col2, other.m_col2);
170 c.m_col3 = _mm_add_ps(m_col3, other.m_col3);
171 c.m_col4 = _mm_add_ps(m_col4, other.m_col4);
172
173 return c;
174 }
175
176 Q_ALWAYS_INLINE Matrix4x4_SSE &operator*=(const Matrix4x4_SSE &other)
177 {
178 *this = *this * other;
179 return *this;
180 }
181
182 Q_ALWAYS_INLINE Matrix4x4_SSE &operator-=(const Matrix4x4_SSE &other)
183 {
184 *this = *this - other;
185 return *this;
186 }
187
188 Q_ALWAYS_INLINE Matrix4x4_SSE &operator+=(const Matrix4x4_SSE &other)
189 {
190 *this = *this + other;
191 return *this;
192 }
193
194 Q_ALWAYS_INLINE Matrix4x4_SSE transposed() const
195 {
196 Matrix4x4_SSE c(Qt::Uninitialized);
197
198 // ~113 instructions
199 // 0b11011101 == 0xdd
200 // 0b10001000 == 0x88
201 const __m128 tmp1 = _mm_shuffle_ps(m_col1, m_col2, 0xdd);
202 const __m128 tmp2 = _mm_shuffle_ps(m_col1, m_col2, 0x88);
203 const __m128 tmp3 = _mm_shuffle_ps(m_col3, m_col4, 0xdd);
204 const __m128 tmp4 = _mm_shuffle_ps(m_col3, m_col4, 0x88);
205 c.m_col1 = _mm_shuffle_ps(tmp2, tmp4, 0x88);
206 c.m_col2 = _mm_shuffle_ps(tmp1, tmp3, 0x88);
207 c.m_col3 = _mm_shuffle_ps(tmp2, tmp4, 0xdd);
208 c.m_col4 = _mm_shuffle_ps(tmp1, tmp3, 0xdd);
209
210 return c;
211 }
212
213 Q_ALWAYS_INLINE Matrix4x4_SSE inverted() const
214 {
215 // TO DO: Optimize
216 const QMatrix4x4 mat = toQMatrix4x4();
217 return Matrix4x4_SSE(mat.inverted());
218 }
219
220 Q_ALWAYS_INLINE bool operator==(const Matrix4x4_SSE &other) const
221 {
222 // 0b1111 == 0xf
223 return (_mm_movemask_ps(_mm_cmpeq_ps(m_col1, other.m_col1)) == 0xf &&
224 _mm_movemask_ps(_mm_cmpeq_ps(m_col2, other.m_col2)) == 0xf &&
225 _mm_movemask_ps(_mm_cmpeq_ps(m_col3, other.m_col3)) == 0xf &&
226 _mm_movemask_ps(_mm_cmpeq_ps(m_col4, other.m_col4)) == 0xf);
227 }
228
229 Q_ALWAYS_INLINE bool operator!=(const Matrix4x4_SSE &other) const
230 {
231 return !(*this == other);
232 }
233
234 Q_ALWAYS_INLINE float m11() const { return _mm_cvtss_f32(m_col1); }
235 Q_ALWAYS_INLINE float m12() const { return _mm_cvtss_f32(m_col2); }
236 Q_ALWAYS_INLINE float m13() const { return _mm_cvtss_f32(m_col3); }
237 Q_ALWAYS_INLINE float m14() const { return _mm_cvtss_f32(m_col4); }
238
239 Q_ALWAYS_INLINE float m21() const
240 {
241 // 0b01010101 = 0x55
242 return _mm_cvtss_f32(_mm_shuffle_ps(m_col1, m_col1, 0x55));
243 }
244 Q_ALWAYS_INLINE float m22() const
245 {
246 // 0b01010101 = 0x55
247 return _mm_cvtss_f32(_mm_shuffle_ps(m_col2, m_col2, 0x55));
248 }
249 Q_ALWAYS_INLINE float m23() const
250 {
251 // 0b01010101 = 0x55
252 return _mm_cvtss_f32(_mm_shuffle_ps(m_col3, m_col3, 0x55));
253 }
254 Q_ALWAYS_INLINE float m24() const
255 {
256 // 0b01010101 = 0x55
257 return _mm_cvtss_f32(_mm_shuffle_ps(m_col4, m_col4, 0x55));
258 }
259
260 Q_ALWAYS_INLINE float m31() const
261 {
262 // 0b10101010 = 0xaa
263 return _mm_cvtss_f32(_mm_shuffle_ps(m_col1, m_col1, 0xaa));
264 }
265 Q_ALWAYS_INLINE float m32() const
266 {
267 // 0b10101010 = 0xaa
268 return _mm_cvtss_f32(_mm_shuffle_ps(m_col2, m_col2, 0xaa));
269 }
270 Q_ALWAYS_INLINE float m33() const
271 {
272 // 0b10101010 = 0xaa
273 return _mm_cvtss_f32(_mm_shuffle_ps(m_col3, m_col3, 0xaa));
274 }
275 Q_ALWAYS_INLINE float m34() const
276 {
277 // 0b10101010 = 0xaa
278 return _mm_cvtss_f32(_mm_shuffle_ps(m_col4, m_col4, 0xaa));
279 }
280
281 Q_ALWAYS_INLINE float m41() const
282 {
283 // 0b11111111 = 0xff
284 return _mm_cvtss_f32(_mm_shuffle_ps(m_col1, m_col1, 0xff));
285 }
286 Q_ALWAYS_INLINE float m42() const
287 {
288 // 0b11111111 = 0xff
289 return _mm_cvtss_f32(_mm_shuffle_ps(m_col2, m_col2, 0xff));
290 }
291 Q_ALWAYS_INLINE float m43() const
292 {
293 // 0b11111111 = 0xff
294 return _mm_cvtss_f32(_mm_shuffle_ps(m_col3, m_col3, 0xff));
295 }
296 Q_ALWAYS_INLINE float m44() const
297 {
298 // 0b11111111 = 0xff
299 return _mm_cvtss_f32(_mm_shuffle_ps(m_col4, m_col4, 0xff));
300 }
301
302 Q_ALWAYS_INLINE Vector4D row(int index) const
303 {
304 switch (index) {
305 case 0:
306 return Vector4D(m11(), m12(), m13(), m14());
307 case 1:
308 return Vector4D(m21(), m22(), m23(), m24());
309 case 2:
310 return Vector4D(m31(), m32(), m33(), m34());
311 case 3:
312 return Vector4D(m41(), m42(), m43(), m44());
313 default:
314 Q_UNREACHABLE();
315 return Vector4D();
316 }
317 }
318
319 Q_ALWAYS_INLINE Vector4D column(int index) const
320 {
321 Vector4D c(Qt::Uninitialized);
322 switch (index) {
323 case 0:
324 c.m_xyzw = m_col1;
325 break;
326 case 1:
327 c.m_xyzw = m_col2;
328 break;
329 case 2:
330 c.m_xyzw = m_col3;
331 break;
332 case 3:
333 c.m_xyzw = m_col4;
334 break;
335 default:
336 Q_UNREACHABLE();
337 return Vector4D();
338 }
339 return c;
340 }
341
342 Q_ALWAYS_INLINE QMatrix4x4 toQMatrix4x4() const { return QMatrix4x4(m11(), m12(), m13(), m14(),
343 m21(), m22(), m23(), m24(),
344 m31(), m32(), m33(), m34(),
345 m41(), m42(), m43(), m44()); }
346
347 Q_ALWAYS_INLINE Vector3D_SSE map(const Vector3D_SSE &point) const
348 {
349 return *this * point;
350 }
351
352 Q_ALWAYS_INLINE Vector4D_SSE map(const Vector4D_SSE &point) const
353 {
354 return *this * point;
355 }
356
357 Q_ALWAYS_INLINE Vector3D_SSE mapVector(const Vector3D_SSE &vector) const
358 {
359 const __m128 row1 = _mm_set_ps(0.0f, m13(), m12(), m11());
360 const __m128 row2 = _mm_set_ps(0.0f, m23(), m22(), m21());
361 const __m128 row3 = _mm_set_ps(0.0f, m33(), m32(), m31());
362
363 const __m128 tmp = _mm_add_ps(_mm_mul_ps(vector.m_xyzw, row1), _mm_mul_ps(vector.m_xyzw, row2));
364
365 Vector3D_SSE v(Qt::Uninitialized);
366 v.m_xyzw = _mm_add_ps(tmp, _mm_mul_ps(vector.m_xyzw, row3));
367 return v;
368 }
369
370 friend Q_ALWAYS_INLINE Vector4D operator*(const Vector4D &vector, const Matrix4x4_SSE &matrix);
371 friend Q_ALWAYS_INLINE Vector4D operator*(const Matrix4x4_SSE &matrix, const Vector4D &vector);
372
373 friend Q_ALWAYS_INLINE Vector3D operator*(const Vector3D &vector, const Matrix4x4_SSE &matrix);
374 friend Q_ALWAYS_INLINE Vector3D operator*(const Matrix4x4_SSE &matrix, const Vector3D &vector);
375
376 friend Q_3DCORE_PRIVATE_EXPORT QDebug operator<<(QDebug dbg, const Matrix4x4_SSE &m);
377private:
378 // Internally we will store the matrix as indicated below
379 // Q_DECL_ALIGN(16) // aligned on 16 bytes boundary for SSE (column major)
380 // struct
381 // {
382 // float m_m11, m_m21, m_m31, m_m41;
383 // float m_m12, m_m22, m_m32, m_m42;
384 // float m_m13, m_m23, m_m33, m_m43;
385 // float m_m14, m_m24, m_m34, m_m44;
386 // };
387 // struct
388 // {
389 // float m[16];
390 // };
391 __m128 m_col1;
392 __m128 m_col2;
393 __m128 m_col3;
394 __m128 m_col4;
395};
396
397Q_ALWAYS_INLINE Vector4D operator*(const Vector4D &vector, const Matrix4x4_SSE &matrix)
398{
399 const __m128 vCol1 = _mm_mul_ps(matrix.m_col1, vector.m_xyzw);
400 const __m128 vCol2 = _mm_mul_ps(matrix.m_col2, vector.m_xyzw);
401 const __m128 vCol3 = _mm_mul_ps(matrix.m_col3, vector.m_xyzw);
402 const __m128 vCol4 = _mm_mul_ps(matrix.m_col4, vector.m_xyzw);
403
404
405 // 0b01000100 == 0x44
406 // 0b11101110 == 0xee
407
408 // vCol1.x, vCol1.y, vCol2.x, vCol2.y
409 __m128 tmp1 = _mm_shuffle_ps(vCol1, vCol2, 0x44);
410 // vCol1.z, vCol1.w, vCol2.z, vCol2.w
411 __m128 tmp2 = _mm_shuffle_ps(vCol1, vCol2, 0xee);
412
413 // vCol1.x + vCol1.z, vCol1.y + vCol1.w, vCol2.x + vCol2.z, vCol2.y + vCol2.w,
414 const __m128 tmpSum01 = _mm_add_ps(tmp1, tmp2);
415
416 // vCol3.x, vCol3.y, vCol4.x, vCol4.y
417 tmp1 = _mm_shuffle_ps(vCol3, vCol4, 0x44);
418 // vCol3.z, vCol3.w, vCol4.z, vCol4.w
419 tmp2 = _mm_shuffle_ps(vCol3, vCol4, 0xee);
420
421 // vCol3.x + vCol3.z, vCol3.y + vCol3.w, vCol4.x + vCol4.z, vCol4.y + vCol4.w,
422 const __m128 tmpSum02 = _mm_add_ps(tmp1, tmp2);
423
424 // 0b10001000 == 0x88
425 // 0b11011101 == 0xdd
426
427 // vCol1.x + vCol1.z, vCol2.x + vCol2.z, vCol3.x + vCol3.z, vCol4.x + vCol4.z,
428 tmp1 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0x88);
429 // vCol1.y + vCol1.w, vCol2.y + vCol2.w, vCol3.y + vCol3.w, vCol4.y + vCol4.w,
430 tmp2 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0xdd);
431
432 Vector4D v(Qt::Uninitialized);
433 v.m_xyzw = _mm_add_ps(tmp1, tmp2);
434 return v;
435}
436
437Q_ALWAYS_INLINE Vector4D operator*(const Matrix4x4_SSE &matrix, const Vector4D &vector)
438{
439 const Matrix4x4_SSE transposed = matrix.transposed();
440 return vector * transposed;
441}
442
443Q_ALWAYS_INLINE Vector3D operator*(const Vector3D &vector, const Matrix4x4_SSE &matrix)
444{
445 const __m128 vec4 = _mm_set_ps(1.0f, vector.z(), vector.y(), vector.x());
446
447 const __m128 vCol1 = _mm_mul_ps(matrix.m_col1, vec4);
448 const __m128 vCol2 = _mm_mul_ps(matrix.m_col2, vec4);
449 const __m128 vCol3 = _mm_mul_ps(matrix.m_col3, vec4);
450 const __m128 vCol4 = _mm_mul_ps(matrix.m_col4, vec4);
451
452 // 0b01000100 == 0x44
453 // 0b11101110 == 0xee
454
455 // vCol1.x, vCol1.y, vCol2.x, vCol2.y
456 __m128 tmp1 = _mm_shuffle_ps(vCol1, vCol2, 0x44);
457 // vCol1.z, vCol1.w, vCol2.z, vCol2.w
458 __m128 tmp2 = _mm_shuffle_ps(vCol1, vCol2, 0xee);
459
460 // vCol1.x + vCol1.z, vCol1.y + vCol1.w, vCol2.x + vCol2.z, vCol2.y + vCol2.w,
461 const __m128 tmpSum01 = _mm_add_ps(tmp1, tmp2);
462
463 // vCol3.x, vCol3.y, vCol4.x, vCol4.y
464 tmp1 = _mm_shuffle_ps(vCol3, vCol4, 0x44);
465 // vCol3.z, vCol3.w, vCol4.z, vCol4.w
466 tmp2 = _mm_shuffle_ps(vCol3, vCol4, 0xee);
467
468 // vCol3.x + vCol3.z, vCol3.y + vCol3.w, vCol4.x + vCol4.z, vCol4.y + vCol4.w,
469 const __m128 tmpSum02 = _mm_add_ps(tmp1, tmp2);
470
471 // 0b10001000 == 0x88
472 // 0b11011101 == 0xdd
473
474 // vCol1.x + vCol1.z, vCol2.x + vCol2.z, vCol3.x + vCol3.z, vCol4.x + vCol4.z,
475 tmp1 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0x88);
476 // vCol1.y + vCol1.w, vCol2.y + vCol2.w, vCol3.y + vCol3.w, vCol4.y + vCol4.w,
477 tmp2 = _mm_shuffle_ps(tmpSum01, tmpSum02, 0xdd);
478
479 const __m128 result = _mm_add_ps(tmp1, tmp2);
480 // 0b11111111 = 0xff
481 const __m128 divisor = _mm_shuffle_ps(result, result, 0xff);
482 Vector3D v(Qt::Uninitialized);
483 v.m_xyzw = _mm_div_ps(result, divisor);
484 return v;
485}
486
487Q_3DCORE_PRIVATE_EXPORT Q_ALWAYS_INLINE Vector3D operator*(const Matrix4x4_SSE &matrix, const Vector3D &vector)
488{
489 const Matrix4x4_SSE transposed = matrix.transposed();
490 return vector * transposed;
491}
492
493} // Qt3DCore
494
495
// Type traits for Qt containers: the matrix is flagged as a primitive type.
// NOTE(review): the default constructor sets the identity matrix; confirm that
// containers treating the type as primitive (and possibly skipping that
// constructor) is acceptable for all users.
Q_DECLARE_TYPEINFO(Qt3DCore::Matrix4x4_SSE, Q_PRIMITIVE_TYPE);
497
498QT_END_NAMESPACE
499
// Registers the type with Qt's meta-type system; the declaration sits after
// QT_END_NAMESPACE, hence the fully qualified name.
Q_DECLARE_METATYPE(Qt3DCore::Matrix4x4_SSE)
501
502#endif // QT_COMPILER_SUPPORTS_SSE2
503
504#endif // QT3DCORE_MATRIX4X4_SSE_P_H
505