1/****************************************************************************
2**
3** Copyright (C) 2016 The Qt Company Ltd.
4** Copyright (C) 2018 Intel Corporation.
5** Contact: https://www.qt.io/licensing/
6**
7** This file is part of the QtCore module of the Qt Toolkit.
8**
9** $QT_BEGIN_LICENSE:LGPL$
10** Commercial License Usage
11** Licensees holding valid commercial Qt licenses may use this file in
12** accordance with the commercial license agreement provided with the
13** Software or, alternatively, in accordance with the terms contained in
14** a written agreement between you and The Qt Company. For licensing terms
15** and conditions see https://www.qt.io/terms-conditions. For further
16** information use the contact form at https://www.qt.io/contact-us.
17**
18** GNU Lesser General Public License Usage
19** Alternatively, this file may be used under the terms of the GNU Lesser
20** General Public License version 3 as published by the Free Software
21** Foundation and appearing in the file LICENSE.LGPL3 included in the
22** packaging of this file. Please review the following information to
23** ensure the GNU Lesser General Public License version 3 requirements
24** will be met: https://www.gnu.org/licenses/lgpl-3.0.html.
25**
26** GNU General Public License Usage
27** Alternatively, this file may be used under the terms of the GNU
28** General Public License version 2.0 or (at your option) the GNU General
29** Public license version 3 or any later version approved by the KDE Free
30** Qt Foundation. The licenses are as published by the Free Software
31** Foundation and appearing in the file LICENSE.GPL2 and LICENSE.GPL3
32** included in the packaging of this file. Please review the following
33** information to ensure the GNU General Public License requirements will
34** be met: https://www.gnu.org/licenses/gpl-2.0.html and
35** https://www.gnu.org/licenses/gpl-3.0.html.
36**
37** $QT_END_LICENSE$
38**
39****************************************************************************/
40
41#ifndef QSIMD_P_H
42#define QSIMD_P_H
43
44//
45// W A R N I N G
46// -------------
47//
48// This file is not part of the Qt API. It exists purely as an
49// implementation detail. This header file may change from version to
50// version without notice, or even be removed.
51//
52// We mean it.
53//
54
55#include <QtCore/private/qglobal_p.h>
56
57/*
58 * qt_module_config.prf defines the QT_COMPILER_SUPPORTS_XXX macros.
59 * They mean the compiler supports the necessary flags and the headers
60 * for the x86 and ARM intrinsics:
 * - GCC: the -mXXX or -march=YYY flag is necessary before #include
62 * up to 4.8; GCC >= 4.9 can include unconditionally
63 * - Intel CC: #include can happen unconditionally
64 * - MSVC: #include can happen unconditionally
65 * - RVCT: ???
66 *
67 * We will try to include all headers possible under this configuration.
68 *
69 * MSVC does not define __SSE2__ & family, so we will define them. MSVC 2013 &
70 * up do define __AVX__ if the -arch:AVX option is passed on the command-line.
71 *
72 * Supported XXX are:
73 * Flag | Arch | GCC | Intel CC | MSVC |
74 * ARM_NEON | ARM | I & C | None | ? |
75 * SSE2 | x86 | I & C | I & C | I & C |
76 * SSE3 | x86 | I & C | I & C | I only |
77 * SSSE3 | x86 | I & C | I & C | I only |
78 * SSE4_1 | x86 | I & C | I & C | I only |
79 * SSE4_2 | x86 | I & C | I & C | I only |
80 * AVX | x86 | I & C | I & C | I & C |
81 * AVX2 | x86 | I & C | I & C | I only |
82 * AVX512xx | x86 | I & C | I & C | I only |
83 * I = intrinsics; C = code generation
84 *
85 * Code can use the following constructs to determine compiler support & status:
86 * - #ifdef __XXX__ (e.g: #ifdef __AVX__ or #ifdef __ARM_NEON__)
87 * If this test passes, then the compiler is already generating code for that
88 * given sub-architecture. The intrinsics for that sub-architecture are
89 * #included and can be used without restriction or runtime check.
90 *
91 * - #if QT_COMPILER_SUPPORTS(XXX)
92 * If this test passes, then the compiler is able to generate code for that
93 * given sub-architecture in another translation unit, given the right set of
94 * flags. Use of the intrinsics is not guaranteed. This is useful with
95 * runtime detection (see below).
96 *
97 * - #if QT_COMPILER_SUPPORTS_HERE(XXX)
98 * If this test passes, then the compiler is able to generate code for that
99 * given sub-architecture in this translation unit, even if it is not doing
100 * that now (it might be). Individual functions may be tagged with
101 * QT_FUNCTION_TARGET(XXX) to cause the compiler to generate code for that
 * sub-arch. Only inside such functions is the use of the intrinsics
103 * guaranteed to work. This is useful with runtime detection (see below).
104 *
105 * Runtime detection of a CPU sub-architecture can be done with the
106 * qCpuHasFeature(XXX) function. There are two strategies for generating
107 * optimized code like that:
108 *
109 * 1) place the optimized code in a different translation unit (C or assembly
110 * sources) and pass the correct flags to the compiler to enable support. Those
111 * sources must not include qglobal.h, which means they cannot include this
112 * file either. The dispatcher function would look like this:
113 *
114 * void foo()
115 * {
116 * #if QT_COMPILER_SUPPORTS(XXX)
117 * if (qCpuHasFeature(XXX)) {
118 * foo_optimized_xxx();
119 * return;
120 * }
121 * #endif
122 * foo_plain();
123 * }
124 *
125 * 2) place the optimized code in a function tagged with QT_FUNCTION_TARGET and
126 * surrounded by #if QT_COMPILER_SUPPORTS_HERE(XXX). That code can freely use
127 * other Qt code. The dispatcher function would look like this:
128 *
129 * void foo()
130 * {
131 * #if QT_COMPILER_SUPPORTS_HERE(XXX)
132 * if (qCpuHasFeature(XXX)) {
133 * foo_optimized_xxx();
134 * return;
135 * }
136 * #endif
137 * foo_plain();
138 * }
139 */
140
141#if defined(__MINGW64_VERSION_MAJOR) || defined(Q_CC_MSVC)
142#include <intrin.h>
143#endif
144
// QT_COMPILER_SUPPORTS(XXX): for use in #if. Evaluates the build-system
// provided QT_COMPILER_SUPPORTS_XXX macro; the trailing "- 0" keeps the
// expression well-formed when that macro is undefined or defined to nothing.
#define QT_COMPILER_SUPPORTS(x)     (QT_COMPILER_SUPPORTS_ ## x - 0)

// QT_COMPILER_SUPPORTS_HERE(XXX): for use in #if. Tells whether code for XXX
// can be generated in this translation unit.
// QT_FUNCTION_TARGET(XXX): function attribute selecting the XXX target where
// the compiler supports it (expands to QT_FUNCTION_TARGET_STRING_XXX inside a
// GCC-style __target__ attribute), otherwise expands to nothing.
//
// The feature macros synthesised below are defined to 1 rather than to
// nothing: QT_COMPILER_SUPPORTS_HERE() pastes them between parentheses, and an
// empty definition would turn "#if QT_COMPILER_SUPPORTS_HERE(NEON)" into the
// ill-formed "#if ()".
#if defined(Q_PROCESSOR_ARM)
#  define QT_COMPILER_SUPPORTS_HERE(x)    (__ARM_FEATURE_ ## x)
#  if defined(Q_CC_GNU) && !defined(Q_CC_INTEL) && Q_CC_GNU >= 600
     /* GCC requires attributes for a function */
#    define QT_FUNCTION_TARGET(x)  __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x)))
#  else
#    define QT_FUNCTION_TARGET(x)
#  endif
   // Also support QT_COMPILER_SUPPORTS_HERE(NEON). Both spellings of the
   // compiler's NEON macro are accepted because __ARM_NEON__ is not defined
   // by every compiler on AArch64.
#  if !defined(__ARM_FEATURE_NEON) && (defined(__ARM_NEON__) || defined(__ARM_NEON))
#    define __ARM_FEATURE_NEON 1
#  endif
#elif defined(Q_PROCESSOR_MIPS)
#  define QT_COMPILER_SUPPORTS_HERE(x)    (__ ## x ## __)
#  define QT_FUNCTION_TARGET(x)
   // Map the compiler's lower-case DSP macros to the __XXX__ form that
   // QT_COMPILER_SUPPORTS_HERE() expects (32-bit MIPS only).
#  if !defined(__MIPS_DSP__) && defined(__mips_dsp) && defined(Q_PROCESSOR_MIPS_32)
#    define __MIPS_DSP__ 1
#  endif
#  if !defined(__MIPS_DSPR2__) && defined(__mips_dspr2) && defined(Q_PROCESSOR_MIPS_32)
#    define __MIPS_DSPR2__ 1
#  endif
#elif defined(Q_PROCESSOR_X86) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS)
   // Either the compiler is already targeting the feature (__XXX__) or it is
   // able to do so for individual functions (QT_COMPILER_SUPPORTS(XXX)).
#  define QT_COMPILER_SUPPORTS_HERE(x)    ((__ ## x ## __) || QT_COMPILER_SUPPORTS(x))
#  if defined(Q_CC_GNU) && !defined(Q_CC_INTEL)
     /* GCC requires attributes for a function */
#    define QT_FUNCTION_TARGET(x)  __attribute__((__target__(QT_FUNCTION_TARGET_STRING_ ## x)))
#  else
#    define QT_FUNCTION_TARGET(x)
#  endif
#else
   // No per-function targets: only what the compiler already targets counts.
#  define QT_COMPILER_SUPPORTS_HERE(x)    (__ ## x ## __)
#  define QT_FUNCTION_TARGET(x)
#endif
179
180#ifdef Q_PROCESSOR_X86
181/* -- x86 intrinsic support -- */
182
// Synthesise the GCC-style SSE macros for MSVC. The condition holds when
// _M_X64 is defined or when _M_IX86_FP is at least 2.
# if defined(Q_CC_MSVC) && (defined(_M_X64) || _M_IX86_FP >= 2)
// MSVC doesn't define __SSE2__, so do it ourselves
# define __SSE__ 1
# define __SSE2__ 1
# endif
188
189# ifdef __SSE2__
190// #include the intrinsics
191# include <immintrin.h>
192# endif
193
194# if defined(Q_CC_GNU) && !defined(Q_CC_INTEL)
195// GCC 4.4 and Clang 2.8 added a few more intrinsics there
196# include <x86intrin.h>
197# endif
198
// When MSVC reports AVX (via _M_AVX or __AVX__), also define the macros of the
// older SSE3..SSE4.2 levels, and define __AVX__ itself if only _M_AVX was set.
# if defined(Q_CC_MSVC) && (defined(_M_AVX) || defined(__AVX__))
// Visual Studio defines __AVX__ when /arch:AVX is passed, but not the earlier macros
// See: https://msdn.microsoft.com/en-us/library/b0084kay.aspx
# define __SSE3__ 1
# define __SSSE3__ 1
// no Intel CPU supports SSE4a, so don't define it
# define __SSE4_1__ 1
# define __SSE4_2__ 1
# ifndef __AVX__
# define __AVX__ 1
# endif
# endif
211
// The blocks below derive feature macros that the Intel compiler and MSVC do
// not define themselves from a wider feature macro that is already defined.
// The first three are only active when QT_COMPILER_SUPPORTS_SIMD_ALWAYS is set.
# if defined(__SSE4_2__) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS) && (defined(Q_CC_INTEL) || defined(Q_CC_MSVC))
// POPCNT instructions:
// All processors that support SSE4.2 support POPCNT
// (but neither MSVC nor the Intel compiler define this macro)
# define __POPCNT__ 1
# endif

// AVX intrinsics
# if defined(__AVX__) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS) && (defined(Q_CC_INTEL) || defined(Q_CC_MSVC))
// AES, PCLMULQDQ instructions:
// All processors that support AVX support PCLMULQDQ
// (but neither MSVC nor the Intel compiler define this macro)
// NOTE(review): the heading mentions AES, but only __PCLMUL__ is defined here.
# define __PCLMUL__ 1
# endif

# if defined(__AVX2__) && defined(QT_COMPILER_SUPPORTS_SIMD_ALWAYS) && (defined(Q_CC_INTEL) || defined(Q_CC_MSVC))
// F16C & RDRAND instructions:
// All processors that support AVX2 support F16C & RDRAND:
// (but neither MSVC nor the Intel compiler define these macros)
# define __F16C__ 1
# define __RDRND__ 1
# endif

// Unlike the blocks above, this one does not depend on
// QT_COMPILER_SUPPORTS_SIMD_ALWAYS and only applies to the Intel compiler.
# if defined(__BMI__) && !defined(__BMI2__) && defined(Q_CC_INTEL)
// BMI2 instructions:
// All processors that support BMI support BMI2 (and AVX2)
// (but neither MSVC nor the Intel compiler define this macro)
// NOTE(review): the condition tests Q_CC_INTEL only, although the comment
// also names MSVC -- confirm whether MSVC should be covered.
# define __BMI2__ 1
# endif
241
242# include "qsimd_x86_p.h"
243
// Haswell sub-architecture
//
// The Intel Core 4th generation was codenamed "Haswell" and introduced AVX2,
// BMI1, BMI2, FMA, LZCNT, MOVBE, which makes it a good divider for a
// sub-target for us. The first AMD processor with AVX2 support (Zen) has the
// same features.
//
// macOS's fat binaries support the "x86_64h" sub-architecture and the GNU libc
// ELF loader also supports a "haswell/" subdir (e.g., /usr/lib/haswell).
//
// QT_FUNCTION_TARGET_STRING_ARCH_HASWELL is the string that
// QT_FUNCTION_TARGET(ARCH_HASWELL) places in the target attribute.
// __haswell__ is defined only when the compiler already defines every feature
// macro tested below. Note that the test includes F16C and RDRND but does not
// test a MOVBE macro, even though MOVBE is listed above.
# define QT_FUNCTION_TARGET_STRING_ARCH_HASWELL "arch=haswell"
# if defined(__AVX2__) && defined(__BMI__) && defined(__BMI2__) && defined(__F16C__) && \
 defined(__FMA__) && defined(__LZCNT__) && defined(__RDRND__)
# define __haswell__ 1
# endif
258
259// This constant does not include all CPU features found in a Haswell, only
260// those that we'd have optimized code for.
261// Note: must use Q_CONSTEXPR here, as this file may be compiled in C mode.
262QT_BEGIN_NAMESPACE
263static const quint64 CpuFeatureArchHaswell = 0
264 | CpuFeatureSSE2
265 | CpuFeatureSSE3
266 | CpuFeatureSSSE3
267 | CpuFeatureSSE4_1
268 | CpuFeatureSSE4_2
269 | CpuFeatureFMA
270 | CpuFeaturePOPCNT
271 | CpuFeatureAVX
272 | CpuFeatureF16C
273 | CpuFeatureAVX2
274 | CpuFeatureBMI
275 | CpuFeatureBMI2;
276QT_END_NAMESPACE
277
278#endif /* Q_PROCESSOR_X86 */
279
// Clang compiler fix, see http://lists.llvm.org/pipermail/cfe-commits/Week-of-Mon-20160222/151168.html
// This should be tweaked with an "upper version" of clang once we know which release fixes the
// issue. At that point we can rely on __ARM_FEATURE_CRC32 again.
// The #undef must stay ahead of the CRC32 checks further down in this header,
// which test whether __ARM_FEATURE_CRC32 is defined.
#if defined(Q_CC_CLANG) && defined(Q_OS_DARWIN) && defined (__ARM_FEATURE_CRC32)
# undef __ARM_FEATURE_CRC32
#endif

// NEON intrinsics
// note: as of GCC 4.9, does not support function targets for ARM
// Either spelling of the compiler's NEON macro pulls in <arm_neon.h>; after
// this block __ARM_NEON__ is defined whenever NEON was detected.
#if defined(__ARM_NEON) || defined(__ARM_NEON__)
#include <arm_neon.h>
#define QT_FUNCTION_TARGET_STRING_NEON "+neon" // unused: gcc doesn't support function targets on non-aarch64, and on Aarch64 NEON is always available.
#ifndef __ARM_NEON__
// __ARM_NEON__ is not defined on AArch64, but we need it in our NEON detection.
#define __ARM_NEON__
#endif
#endif
// AArch64/ARM64
// <arm_acle.h> is included for any ARMv8 target with CRC32 enabled, but the
// function-target string is only provided on 64-bit ARM.
#if defined(Q_PROCESSOR_ARM_V8) && defined(__ARM_FEATURE_CRC32)
#if defined(Q_PROCESSOR_ARM_64)
// only available on aarch64
#define QT_FUNCTION_TARGET_STRING_CRC32 "+crc"
#endif
# include <arm_acle.h>
#endif
305
306#ifdef __cplusplus
307#include <qatomic.h>
308
309QT_BEGIN_NAMESPACE
310
311#ifndef Q_PROCESSOR_X86
312enum CPUFeatures {
313#if defined(Q_PROCESSOR_ARM)
314 CpuFeatureNEON = 2,
315 CpuFeatureARM_NEON = CpuFeatureNEON,
316 CpuFeatureCRC32 = 4,
317#elif defined(Q_PROCESSOR_MIPS)
318 CpuFeatureDSP = 2,
319 CpuFeatureDSPR2 = 4,
320#endif
321
322 // used only to indicate that the CPU detection was initialised
323 QSimdInitialized = 1
324};
325
326static const quint64 qCompilerCpuFeatures = 0
327#if defined __ARM_NEON__
328 | CpuFeatureNEON
329#endif
330#if defined __ARM_FEATURE_CRC32
331 | CpuFeatureCRC32
332#endif
333#if defined __mips_dsp
334 | CpuFeatureDSP
335#endif
336#if defined __mips_dspr2
337 | CpuFeatureDSPR2
338#endif
339 ;
340#endif
341
// Storage for the detected CPU feature bits, defined outside this header.
// With 64-bit atomics it is one quint64 element; otherwise it is split into
// two unsigned elements, and qCpuFeatures() reads element 1 as bits 32-63.
#ifdef Q_ATOMIC_INT64_IS_SUPPORTED
extern Q_CORE_EXPORT QBasicAtomicInteger<quint64> qt_cpu_features[1];
#else
extern Q_CORE_EXPORT QBasicAtomicInteger<unsigned> qt_cpu_features[2];
#endif
// Called by qCpuFeatures() when the value read from qt_cpu_features is zero;
// its return value is then used as the feature word.
// NOTE(review): presumably it also stores the result in qt_cpu_features --
// confirm against the definition.
Q_CORE_EXPORT quint64 qDetectCpuFeatures();
348
349static inline quint64 qCpuFeatures()
350{
351 quint64 features = qt_cpu_features[0].loadRelaxed();
352#ifndef Q_ATOMIC_INT64_IS_SUPPORTED
353 features |= quint64(qt_cpu_features[1].loadRelaxed()) << 32;
354#endif
355 if (Q_UNLIKELY(features == 0)) {
356 features = qDetectCpuFeatures();
357 Q_ASSUME(features != 0);
358 }
359 return features;
360}
361
// qCpuHasFeature(XXX): true when every bit of CpuFeatureXXX is set either in
// qCompilerCpuFeatures or in the value returned by qCpuFeatures(). The
// compile-time mask is tested first, so because of the || short-circuit
// qCpuFeatures() is not called when that test already succeeds.
#define qCpuHasFeature(feature) (((qCompilerCpuFeatures & CpuFeature ## feature) == CpuFeature ## feature) \
 || ((qCpuFeatures() & CpuFeature ## feature) == CpuFeature ## feature))
364
// Loop headers for the unaligned head of a buffer. The address of ptr is
// shifted right by 2, i.e. measured in 4-byte units, and the loop bound is the
// number of such units up to the next 16-byte boundary (0 if already aligned),
// capped at length. The bound is compared against i directly, not against an
// offset from i's initial value, so i is expected to start at 0.
// NOTE(review): presumably meant for arrays of 32-bit elements -- confirm.
#define ALIGNMENT_PROLOGUE_16BYTES(ptr, i, length) \
 for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((4 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x3)) & 0x3))); ++i)

// Same as above, for a 32-byte boundary (up to 7 units of 4 bytes).
#define ALIGNMENT_PROLOGUE_32BYTES(ptr, i, length) \
 for (; i < static_cast<int>(qMin(static_cast<quintptr>(length), ((8 - ((reinterpret_cast<quintptr>(ptr) >> 2) & 0x7)) & 0x7))); ++i)
370
371QT_END_NAMESPACE
372
373#endif // __cplusplus
374
// Loop header that continues from the current value of i and runs the loop
// body at most `max` times, stopping earlier once i reaches `length`.
// Presumably used for the leftover elements after a vectorised main loop.
// Every argument is parenthesised so that expressions containing operators of
// lower precedence than '<' (for example "count & 3") expand as intended.
// Note: introduces a loop-local counter named _i.
#define SIMD_EPILOGUE(i, length, max) \
    for (int _i = 0; _i < (max) && (i) < (length); ++(i), ++_i)
377
378#endif // QSIMD_P_H
379