Grok  9.5.0
detect_targets.h
Go to the documentation of this file.
1 // Copyright 2021 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef HIGHWAY_HWY_DETECT_TARGETS_H_
16 #define HIGHWAY_HWY_DETECT_TARGETS_H_
17 
18 // Defines targets and chooses which to enable.
19 
21 
22 //------------------------------------------------------------------------------
23 // Optional configuration
24 
25 // See ../quick_reference.md for documentation of these macros.
26 
27 // Uncomment to override the default baseline determined from predefined macros:
28 // #define HWY_BASELINE_TARGETS (HWY_SSE4 | HWY_SCALAR)
29 
30 // Uncomment to override the default blocklist:
31 // #define HWY_BROKEN_TARGETS HWY_AVX3
32 
33 // Uncomment to definitely avoid generating those target(s):
34 // #define HWY_DISABLED_TARGETS HWY_SSE4
35 
36 // Uncomment to avoid emitting BMI/BMI2/FMA instructions (allows generating
37 // AVX2 target for VMs which support AVX2 but not the other instruction sets)
38 // #define HWY_DISABLE_BMI2_FMA
39 
40 //------------------------------------------------------------------------------
41 // Targets
42 
43 // Unique bit value for each target. A lower value is "better" (e.g. more lanes)
44 // than a higher value within the same group/platform - see HWY_STATIC_TARGET.
45 //
46 // All values are unconditionally defined so we can test HWY_TARGETS without
47 // first checking the HWY_ARCH_*.
48 //
49 // The C99 preprocessor evaluates #if expressions using intmax_t types, so we
50 // can use 32-bit literals.
51 
52 // 1,2: reserved
53 
54 // Currently satisfiable by Ice Lake (VNNI, VPCLMULQDQ, VBMI2, VAES). Later to
55 // be added: BF16 (Cooper Lake). VP2INTERSECT is only in Tiger Lake? We do not
56 // yet have uses for VBMI, VPOPCNTDQ, BITALG, GFNI.
57 #define HWY_AVX3_DL 4 // see HWY_WANT_AVX3_DL below
58 #define HWY_AVX3 8
59 #define HWY_AVX2 16
60 // 32: reserved for AVX
61 #define HWY_SSE4 64
62 #define HWY_SSSE3 128
63 // 0x100, 0x200: reserved for SSE3, SSE2
64 
65 // The highest bit in the HWY_TARGETS mask that an x86 target can have. Used for
66 // dynamic dispatch. All x86 target bits must be less than or equal to
67 // (1 << HWY_HIGHEST_TARGET_BIT_X86) and they can only use
68 // HWY_MAX_DYNAMIC_TARGETS in total.
69 #define HWY_HIGHEST_TARGET_BIT_X86 9
70 
71 #define HWY_SVE2 0x400
72 #define HWY_SVE 0x800
73 // 0x1000 reserved for Helium
74 #define HWY_NEON 0x2000
75 
76 #define HWY_HIGHEST_TARGET_BIT_ARM 13
77 
78 // 0x4000, 0x8000 reserved
79 #define HWY_PPC8 0x10000 // v2.07 or 3
80 // 0x20000, 0x40000 reserved for prior VSX/AltiVec
81 
82 #define HWY_HIGHEST_TARGET_BIT_PPC 18
83 
84 // 0x80000 reserved
85 #define HWY_WASM 0x100000
86 
87 #define HWY_HIGHEST_TARGET_BIT_WASM 20
88 
89 // 0x200000, 0x400000, 0x800000 reserved
90 
91 #define HWY_RVV 0x1000000
92 
93 #define HWY_HIGHEST_TARGET_BIT_RVV 24
94 
95 // 0x2000000, 0x4000000, 0x8000000, 0x10000000 reserved
96 
97 #define HWY_SCALAR 0x20000000
98 
99 #define HWY_HIGHEST_TARGET_BIT_SCALAR 29
100 
101 // Cannot use higher values, otherwise HWY_TARGETS computation might overflow.
102 
103 //------------------------------------------------------------------------------
104 // Set default blocklists
105 
106 // Disabled means excluded from enabled at user's request. A separate config
107 // macro allows disabling without deactivating the blocklist below.
108 #ifndef HWY_DISABLED_TARGETS
109 #define HWY_DISABLED_TARGETS 0
110 #endif
111 
112 // Broken means excluded from enabled due to known compiler issues. Allow the
113 // user to override this blocklist without any guarantee of success.
114 #ifndef HWY_BROKEN_TARGETS
115 
116 // x86 clang-6: we saw multiple AVX2/3 compile errors and in one case invalid
117 // SSE4 codegen (possibly only for msan), so disable all those targets.
118 #if HWY_ARCH_X86 && (HWY_COMPILER_CLANG != 0 && HWY_COMPILER_CLANG < 700)
119 #define HWY_BROKEN_TARGETS (HWY_SSE4 | HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL)
120 // This entails a major speed reduction, so warn unless the user explicitly
121 // opts in to scalar-only.
122 #if !defined(HWY_COMPILE_ONLY_SCALAR)
123 #pragma message("x86 Clang <= 6: define HWY_COMPILE_ONLY_SCALAR or upgrade.")
124 #endif
125 
126 // 32-bit may fail to compile AVX2/3.
127 #elif HWY_ARCH_X86_32
128 #define HWY_BROKEN_TARGETS (HWY_AVX2 | HWY_AVX3 | HWY_AVX3_DL)
129 
130 // MSVC AVX3 support is buggy: https://github.com/Mysticial/Flops/issues/16
131 #elif HWY_COMPILER_MSVC != 0
132 #define HWY_BROKEN_TARGETS (HWY_AVX3 | HWY_AVX3_DL)
133 
134 // armv7be has not been tested and is not yet supported.
135 #elif HWY_ARCH_ARM_V7 && \
136  (defined(__ARM_BIG_ENDIAN) || \
137  (defined(__BYTE_ORDER) && __BYTE_ORDER == __BIG_ENDIAN))
138 #define HWY_BROKEN_TARGETS (HWY_NEON)
139 
140 // SVE[2] require recent clang or gcc versions.
141 #elif (HWY_COMPILER_CLANG && HWY_COMPILER_CLANG < 1100) ||\
142 (!HWY_COMPILER_CLANG && HWY_COMPILER_GCC && HWY_COMPILER_GCC < 1000)
143 #define HWY_BROKEN_TARGETS (HWY_SVE | HWY_SVE2)
144 
145 #else
146 #define HWY_BROKEN_TARGETS 0
147 #endif
148 
149 #endif // HWY_BROKEN_TARGETS
150 
151 // Enabled means neither disabled nor blocklisted: clears those bits of `targets`.
152 #define HWY_ENABLED(targets) \
153  ((targets) & ~((HWY_DISABLED_TARGETS) | (HWY_BROKEN_TARGETS)))
154 
155 //------------------------------------------------------------------------------
156 // Detect baseline targets using predefined macros
157 
158 // Baseline means the targets for which the compiler is allowed to generate
159 // instructions, implying the target CPU would have to support them. Do not use
160 // this directly because it does not take the blocklist into account. Allow the
161 // user to override this without any guarantee of success.
162 #ifndef HWY_BASELINE_TARGETS
163 
164 #if defined(HWY_EMULATE_SVE)
165 #define HWY_BASELINE_TARGETS HWY_SVE // does not support SVE2
166 #define HWY_BASELINE_AVX3_DL 0
167 #else
168 
169 // Also check HWY_ARCH to ensure that simulating unknown platforms ends up with
170 // HWY_TARGET == HWY_SCALAR.
171 
172 #if HWY_ARCH_WASM && defined(__wasm_simd128__)
173 #define HWY_BASELINE_WASM HWY_WASM
174 #else
175 #define HWY_BASELINE_WASM 0
176 #endif
177 
178 // Avoid choosing the PPC target until we have an implementation.
179 #if HWY_ARCH_PPC && defined(__VSX__) && 0
180 #define HWY_BASELINE_PPC8 HWY_PPC8
181 #else
182 #define HWY_BASELINE_PPC8 0
183 #endif
184 
185 // SVE compiles, but is not yet tested.
186 #if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE2)
187 #define HWY_BASELINE_SVE2 HWY_SVE2
188 #else
189 #define HWY_BASELINE_SVE2 0
190 #endif
191 
192 #if HWY_ARCH_ARM && defined(__ARM_FEATURE_SVE)
193 #define HWY_BASELINE_SVE HWY_SVE
194 #else
195 #define HWY_BASELINE_SVE 0
196 #endif
197 
198 // GCC 4.5.4 only defines __ARM_NEON__; 5.4 defines both.
199 #if HWY_ARCH_ARM && (defined(__ARM_NEON__) || defined(__ARM_NEON))
200 #define HWY_BASELINE_NEON HWY_NEON
201 #else
202 #define HWY_BASELINE_NEON 0
203 #endif
204 
205 // Special handling for MSVC because it has fewer predefined macros
206 #if HWY_COMPILER_MSVC && !HWY_COMPILER_CLANG
207 
208 // We can only be sure SSSE3/SSE4 are enabled if AVX is
209 // (https://stackoverflow.com/questions/18563978/)
210 #if defined(__AVX__)
211 #define HWY_CHECK_SSSE3 1
212 #define HWY_CHECK_SSE4 1
213 #else
214 #define HWY_CHECK_SSSE3 0
215 #define HWY_CHECK_SSE4 0
216 #endif
217 
218 // Cannot check for PCLMUL/AES and BMI2/FMA/F16C individually; we assume
219 // PCLMUL/AES are available if SSE4 is, and BMI2/FMA/F16C if AVX2 is.
220 #define HWY_CHECK_PCLMUL_AES 1
221 #define HWY_CHECK_BMI2_FMA 1
222 #define HWY_CHECK_F16C 1
223 
224 #else // non-MSVC
225 
226 #if defined(__SSSE3__)
227 #define HWY_CHECK_SSSE3 1
228 #else
229 #define HWY_CHECK_SSSE3 0
230 #endif
231 
232 #if defined(__SSE4_1__) && defined(__SSE4_2__)
233 #define HWY_CHECK_SSE4 1
234 #else
235 #define HWY_CHECK_SSE4 0
236 #endif
237 
238 // If these are disabled, they should not gate the availability of SSE4/AVX2.
239 #if defined(HWY_DISABLE_PCLMUL_AES) || (defined(__PCLMUL__) && defined(__AES__))
240 #define HWY_CHECK_PCLMUL_AES 1
241 #else
242 #define HWY_CHECK_PCLMUL_AES 0
243 #endif
244 
245 #if defined(HWY_DISABLE_BMI2_FMA) || (defined(__BMI2__) && defined(__FMA__))
246 #define HWY_CHECK_BMI2_FMA 1
247 #else
248 #define HWY_CHECK_BMI2_FMA 0
249 #endif
250 
251 #if defined(HWY_DISABLE_F16C) || defined(__F16C__)
252 #define HWY_CHECK_F16C 1
253 #else
254 #define HWY_CHECK_F16C 0
255 #endif
256 
257 #endif // non-MSVC
258 
259 #if HWY_ARCH_X86 && HWY_CHECK_SSSE3
260 #define HWY_BASELINE_SSSE3 HWY_SSSE3
261 #else
262 #define HWY_BASELINE_SSSE3 0
263 #endif
264 
265 #if HWY_ARCH_X86 && HWY_CHECK_SSE4 && HWY_CHECK_PCLMUL_AES
266 #define HWY_BASELINE_SSE4 HWY_SSE4
267 #else
268 #define HWY_BASELINE_SSE4 0
269 #endif
270 
271 #if HWY_BASELINE_SSE4 != 0 && HWY_CHECK_BMI2_FMA && HWY_CHECK_F16C && \
272  defined(__AVX2__)
273 #define HWY_BASELINE_AVX2 HWY_AVX2
274 #else
275 #define HWY_BASELINE_AVX2 0
276 #endif
277 
278 // Require everything in AVX2 plus AVX-512 flags (also set by MSVC)
279 #if HWY_BASELINE_AVX2 != 0 && defined(__AVX512F__) && defined(__AVX512BW__) && \
280  defined(__AVX512DQ__) && defined(__AVX512VL__)
281 #define HWY_BASELINE_AVX3 HWY_AVX3
282 #else
283 #define HWY_BASELINE_AVX3 0
284 #endif
285 
286 // Needs AVX-512 VNNI (__AVX512VNNI__, as on Ice Lake), not the VEX-only __AVXVNNI__. TODO(janwas): not yet known whether these will be set by MSVC
287 #if HWY_BASELINE_AVX3 != 0 && defined(__AVX512VNNI__) && defined(__VAES__) && \
288  defined(__VPCLMULQDQ__)
289 #define HWY_BASELINE_AVX3_DL HWY_AVX3_DL
290 #else
291 #define HWY_BASELINE_AVX3_DL 0
292 #endif
293 
294 #if HWY_ARCH_RVV && defined(__riscv_vector)
295 #define HWY_BASELINE_RVV HWY_RVV
296 #else
297 #define HWY_BASELINE_RVV 0
298 #endif
299 
300 #define HWY_BASELINE_TARGETS \
301  (HWY_SCALAR | HWY_BASELINE_WASM | HWY_BASELINE_PPC8 | HWY_BASELINE_SVE2 | \
302  HWY_BASELINE_SVE | HWY_BASELINE_NEON | HWY_BASELINE_SSSE3 | \
303  HWY_BASELINE_SSE4 | HWY_BASELINE_AVX2 | HWY_BASELINE_AVX3 | \
304  HWY_BASELINE_AVX3_DL | HWY_BASELINE_RVV)
305 
306 #endif // HWY_EMULATE_SVE
307 
308 #else
309 // User already defined HWY_BASELINE_TARGETS, but we still need to define
310 // HWY_BASELINE_AVX3_DL (matching user's definition) for HWY_CHECK_AVX3_DL.
311 #define HWY_BASELINE_AVX3_DL (HWY_BASELINE_TARGETS & HWY_AVX3_DL)
312 #endif // HWY_BASELINE_TARGETS
313 
314 //------------------------------------------------------------------------------
315 // Choose target for static dispatch
316 
317 #define HWY_ENABLED_BASELINE HWY_ENABLED(HWY_BASELINE_TARGETS)
318 #if HWY_ENABLED_BASELINE == 0
319 #error "At least one baseline target must be defined and enabled"
320 #endif
321 
322 // Best baseline, used for static dispatch. This is the least-significant 1-bit
323 // within HWY_ENABLED_BASELINE and lower bit values imply "better".
324 #define HWY_STATIC_TARGET (HWY_ENABLED_BASELINE & -HWY_ENABLED_BASELINE)
325 
326 // Start by assuming static dispatch. If we later use dynamic dispatch, this
327 // will be defined to other targets during the multiple-inclusion, and finally
328 // return to the initial value. Defining this outside begin/end_target ensures
329 // inl headers successfully compile by themselves (required by Bazel).
330 #define HWY_TARGET HWY_STATIC_TARGET
331 
332 //------------------------------------------------------------------------------
333 // Choose targets for dynamic dispatch according to one of four policies
334 
335 #if (defined(HWY_COMPILE_ONLY_SCALAR) + defined(HWY_COMPILE_ONLY_STATIC) + \
336  defined(HWY_COMPILE_ALL_ATTAINABLE)) > 1
337 #error "Invalid config: can only define a single policy for targets"
338 #endif
339 
340 // Further to checking for disabled/broken targets, we only use AVX3_DL after
341 // explicit opt-in (via this macro OR baseline compiler flags) to avoid
342 // generating a codepath which is only helpful if the app uses AVX3_DL features.
343 #if defined(HWY_WANT_AVX3_DL)
344 #define HWY_CHECK_AVX3_DL HWY_AVX3_DL
345 #else
346 #define HWY_CHECK_AVX3_DL HWY_BASELINE_AVX3_DL
347 #endif
348 
349 // Attainable means enabled and the compiler allows intrinsics (even when not
350 // allowed to autovectorize). Used by policies 3 and 4 below.
351 #if HWY_ARCH_X86
352 #define HWY_ATTAINABLE_TARGETS \
353  HWY_ENABLED(HWY_SCALAR | HWY_SSSE3 | HWY_SSE4 | HWY_AVX2 | HWY_AVX3 | \
354  HWY_CHECK_AVX3_DL)
355 #else
356 #define HWY_ATTAINABLE_TARGETS HWY_ENABLED_BASELINE
357 #endif
358 
359 // 1) For older compilers: disable all SIMD (could also set HWY_DISABLED_TARGETS
360 // to ~HWY_SCALAR, but this is more explicit).
361 #if defined(HWY_COMPILE_ONLY_SCALAR)
362 #undef HWY_STATIC_TARGET
363 #define HWY_STATIC_TARGET HWY_SCALAR // override baseline
364 #define HWY_TARGETS HWY_SCALAR
365 
366 // 2) For forcing static dispatch without code changes (removing HWY_EXPORT)
367 #elif defined(HWY_COMPILE_ONLY_STATIC)
368 #define HWY_TARGETS HWY_STATIC_TARGET
369 
370 // 3) For tests: include all attainable targets (in particular: scalar)
371 #elif defined(HWY_COMPILE_ALL_ATTAINABLE) || defined(HWY_IS_TEST)
372 #define HWY_TARGETS HWY_ATTAINABLE_TARGETS
373 
374 // 4) Default: attainable WITHOUT non-best baseline. This reduces code size by
375 // excluding superseded targets, in particular scalar.
376 #else
377 #define HWY_TARGETS (HWY_ATTAINABLE_TARGETS & (2 * HWY_STATIC_TARGET - 1))
378 
379 #endif // target policy
380 
381 // HWY_ONCE and the multiple-inclusion mechanism rely on HWY_STATIC_TARGET being
382 // one of the dynamic targets. This also implies HWY_TARGETS != 0 and
383 // (HWY_TARGETS & HWY_ENABLED_BASELINE) != 0.
384 #if (HWY_TARGETS & HWY_STATIC_TARGET) == 0
385 #error "Logic error: best baseline should be included in dynamic targets"
386 #endif
387 
388 #endif // HIGHWAY_HWY_DETECT_TARGETS_H_