Grok  9.5.0
cache_control.h
Go to the documentation of this file.
1 // Copyright 2020 Google LLC
2 //
3 // Licensed under the Apache License, Version 2.0 (the "License");
4 // you may not use this file except in compliance with the License.
5 // You may obtain a copy of the License at
6 //
7 // http://www.apache.org/licenses/LICENSE-2.0
8 //
9 // Unless required by applicable law or agreed to in writing, software
10 // distributed under the License is distributed on an "AS IS" BASIS,
11 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 // See the License for the specific language governing permissions and
13 // limitations under the License.
14 
15 #ifndef HIGHWAY_HWY_CACHE_CONTROL_H_
16 #define HIGHWAY_HWY_CACHE_CONTROL_H_
17 
18 #include <stddef.h>
19 #include <stdint.h>
20 
21 #include "hwy/base.h"
22 
23 // Requires SSE2; fails to compile on 32-bit Clang 7 (see
24 // https://github.com/gperftools/gperftools/issues/946).
25 #if !defined(__SSE2__) || (HWY_COMPILER_CLANG && HWY_ARCH_X86_32)
26 #undef HWY_DISABLE_CACHE_CONTROL
27 #define HWY_DISABLE_CACHE_CONTROL
28 #endif
29 
30 // intrin.h is sufficient on MSVC and already included by base.h.
31 #if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC
32 #include <emmintrin.h> // SSE2
33 #endif
34 
35 // Windows.h #defines these, which causes infinite recursion. Temporarily
36 // undefine them in this header; these functions are anyway deprecated.
37 // TODO(janwas): remove when these functions are removed.
38 #pragma push_macro("LoadFence")
39 #pragma push_macro("StoreFence")
40 #undef LoadFence
41 #undef StoreFence
42 
43 namespace hwy {
44 
45 // Even if N*sizeof(T) is smaller, Stream may write a multiple of this size.
46 #define HWY_STREAM_MULTIPLE 16
47 
48 // The following functions may also require an attribute.
49 #if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL) && !HWY_COMPILER_MSVC
50 #define HWY_ATTR_CACHE __attribute__((target("sse2")))
51 #else
52 #define HWY_ATTR_CACHE
53 #endif
54 
55 // Delays subsequent loads until prior loads are visible. On Intel CPUs, also
56 // serves as a full fence (waits for all prior instructions to complete).
57 // No effect on non-x86.
58 // DEPRECATED due to differing behavior across architectures AND vendors.
60 #if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
61  _mm_lfence();
62 #endif
63 }
64 
65 // Ensures values written by previous `Stream` calls are visible on the current
66 // core. This is NOT sufficient for synchronizing across cores; when `Stream`
67 // outputs are to be consumed by other core(s), the producer must publish
68 // availability (e.g. via mutex or atomic_flag) after `FlushStream`.
70 #if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
71  _mm_sfence();
72 #endif
73 }
74 
75 // DEPRECATED, replace with `FlushStream`.
77 
78 // Optionally begins loading the cache line containing "p" to reduce latency of
79 // subsequent actual loads.
80 template <typename T>
82 #if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
83  _mm_prefetch(reinterpret_cast<const char*>(p), _MM_HINT_T0);
84 #elif HWY_COMPILER_GCC || HWY_COMPILER_CLANG
85  // Hint=0 (NTA) behavior differs, but skipping outer caches is probably not
86  // desirable, so use the default 3 (keep in caches).
87  __builtin_prefetch(p, /*write=*/0, /*hint=*/3);
88 #else
89  (void)p;
90 #endif
91 }
92 
93 // Invalidates and flushes the cache line containing "p", if possible.
95 #if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
96  _mm_clflush(p);
97 #else
98  (void)p;
99 #endif
100 }
101 
102 // When called inside a spin-loop, may reduce power consumption.
104 #if HWY_ARCH_X86 && !defined(HWY_DISABLE_CACHE_CONTROL)
105  _mm_pause();
106 #endif
107 }
108 
109 } // namespace hwy
110 
111 // TODO(janwas): remove when these functions are removed. (See above.)
112 #pragma pop_macro("StoreFence")
113 #pragma pop_macro("LoadFence")
114 
115 #endif // HIGHWAY_HWY_CACHE_CONTROL_H_
#define HWY_INLINE
Definition: base.h:59
#define HWY_ATTR_CACHE
Definition: cache_control.h:52
Definition: aligned_allocator.h:23
HWY_INLINE HWY_ATTR_CACHE void FlushStream()
Definition: cache_control.h:69
HWY_INLINE HWY_ATTR_CACHE void Prefetch(const T *p)
Definition: cache_control.h:81
HWY_INLINE HWY_ATTR_CACHE void Pause()
Definition: cache_control.h:103
HWY_INLINE HWY_ATTR_CACHE void StoreFence()
Definition: cache_control.h:76
HWY_INLINE HWY_ATTR_CACHE void LoadFence()
Definition: cache_control.h:59
HWY_INLINE HWY_ATTR_CACHE void FlushCacheline(const void *p)
Definition: cache_control.h:94