-
Notifications
You must be signed in to change notification settings - Fork 1k
/
Copy pathplatform.cpp
293 lines (268 loc) · 8.43 KB
/
platform.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
/*******************************************************************************
* Copyright 2020-2025 Intel Corporation
* Copyright 2020-2024 FUJITSU LIMITED
* Copyright 2022-2024 Arm Ltd. and affiliates
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#include <thread>
#include "cpu/platform.hpp"
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
#include <algorithm>
#if defined(_WIN32)
#include <windows.h>
#elif defined(__GLIBC__)
#include <sched.h>
#endif
#endif
#if DNNL_X64
#include "cpu/x64/cpu_isa_traits.hpp"
#elif DNNL_AARCH64
#include "cpu/aarch64/cpu_isa_traits.hpp"
#if DNNL_AARCH64_USE_ACL
// For checking if fp16 isa is supported on the platform
#include "arm_compute/core/CPP/CPPTypes.h"
#endif
#endif
// For DNNL_X64 build we compute the timestamp using rdtsc. Use std::chrono for
// other builds.
#if !DNNL_X64
#include <chrono>
#endif
namespace dnnl {
namespace impl {
namespace cpu {
namespace platform {
// Returns the ISA information string for the architecture this library was
// built for. x64 and AArch64 builds forward to the backend's own
// get_isa_info(); every other build reports the literal "Generic".
const char *get_isa_info() {
#if DNNL_X64
    const char *isa_description = x64::get_isa_info();
#elif DNNL_AARCH64
    const char *isa_description = aarch64::get_isa_info();
#else
    const char *isa_description = "Generic";
#endif
    return isa_description;
}
// Returns the effective CPU ISA as reported by the architecture backend.
// Builds that are neither x64 nor AArch64 have no backend to ask and report
// dnnl_cpu_isa_default.
dnnl_cpu_isa_t get_effective_cpu_isa() {
#if DNNL_X64
    const dnnl_cpu_isa_t effective_isa = x64::get_effective_cpu_isa();
#elif DNNL_AARCH64
    const dnnl_cpu_isa_t effective_isa = aarch64::get_effective_cpu_isa();
#else
    const dnnl_cpu_isa_t effective_isa = dnnl_cpu_isa_default;
#endif
    return effective_isa;
}
// Requests `isa` as the maximum CPU ISA.
// Only x64 builds implement this: the request is forwarded to the x64 backend
// and its status is returned. All other builds return status::unimplemented
// without looking at `isa`.
status_t set_max_cpu_isa(dnnl_cpu_isa_t isa) {
#if DNNL_X64
    const status_t result = x64::set_max_cpu_isa(isa);
#else
    const status_t result = status::unimplemented;
#endif
    return result;
}
// Installs CPU ISA hints.
// - x64: forwarded to the x64 backend, whose status is returned.
// - AArch64: the hints are accepted but not used; status::success is returned.
// - anything else: status::unimplemented.
status_t set_cpu_isa_hints(dnnl_cpu_isa_hints_t isa_hints) {
#if DNNL_X64
    const status_t result = x64::set_cpu_isa_hints(isa_hints);
#elif DNNL_AARCH64
    const status_t result = status::success;
#else
    const status_t result = status::unimplemented;
#endif
    return result;
}
// Returns the CPU ISA hints currently in effect. Only the x64 backend keeps
// track of hints; all other builds report dnnl_cpu_isa_no_hints.
dnnl_cpu_isa_hints_t get_cpu_isa_hints() {
#if DNNL_X64
    const dnnl_cpu_isa_hints_t active_hints = x64::get_cpu_isa_hints();
#else
    const dnnl_cpu_isa_hints_t active_hints = dnnl_cpu_isa_no_hints;
#endif
    return active_hints;
}
// Returns true only on x64 builds whose current ISA hints equal
// dnnl_cpu_isa_prefer_ymm; always false elsewhere.
bool prefer_ymm_requested() {
#if DNNL_X64
    return x64::get_cpu_isa_hints() == dnnl_cpu_isa_prefer_ymm;
#else
    return false;
#endif
}
// Reports whether `data_type` is supported on the current build target / CPU.
//
// - bf16: x64 requires avx512_core or avx2_vnni_2; PPC64 requires that
//   USE_CBLAS, BLAS_HAS_SBGEMM and __MMA__ are all defined; AArch64 defers to
//   aarch64::mayiuse_bf16(); other targets do not support it.
// - f16: x64 requires avx512_core_fp16 or avx2_vnni_2; ACL-enabled AArch64
//   builds ask ACL's CPUInfo; other targets do not support it.
// - f8_e5m2 / f8_e4m3: x64 with avx512_core_fp16 only.
// - every other data type is reported as supported.
bool has_data_type_support(data_type_t data_type) {
    // Notice: see notes in header
    switch (data_type) {
        case data_type::bf16:
#if DNNL_X64
            return x64::mayiuse(x64::avx512_core)
                    || x64::mayiuse(x64::avx2_vnni_2);
#elif DNNL_PPC64
#if defined(USE_CBLAS) && defined(BLAS_HAS_SBGEMM) && defined(__MMA__)
            return true;
#else
            // Return explicitly instead of silently falling through into the
            // f16 case: the answer for bf16 must not depend on what the next
            // case label happens to return on this target.
            return false;
#endif
#elif DNNL_AARCH64
            return aarch64::mayiuse_bf16();
#else
            return false;
#endif
        case data_type::f16:
#if DNNL_X64
            return x64::mayiuse(x64::avx512_core_fp16)
                    || x64::mayiuse(x64::avx2_vnni_2);
#elif DNNL_AARCH64_USE_ACL
            return arm_compute::CPUInfo::get().has_fp16();
#else
            return false;
#endif
        case data_type::f8_e5m2:
        case data_type::f8_e4m3:
#if DNNL_X64
            return x64::mayiuse(x64::avx512_core_fp16);
#else
            return false;
#endif
        default: return true;
    }
}
// Reports whether training is supported for `data_type` on the current build
// target / CPU.
//
// - bf16: x64 requires avx512_core; PPC64 requires that USE_CBLAS,
//   BLAS_HAS_SBGEMM and __MMA__ are all defined; ACL-enabled AArch64 builds
//   ask ACL's CPUInfo; other targets do not support it.
// - f16: x64 requires avx512_core_fp16; ACL-enabled AArch64 builds ask ACL's
//   CPUInfo; other targets do not support it.
// - every other data type is reported as supported.
bool has_training_support(data_type_t data_type) {
    // TODO: maybe return false for int8, but some primitives like prelu
    // have training support
    switch (data_type) {
        case data_type::bf16:
#if DNNL_X64
            return x64::mayiuse(x64::avx512_core);
#elif DNNL_PPC64
#if defined(USE_CBLAS) && defined(BLAS_HAS_SBGEMM) && defined(__MMA__)
            return true;
#else
            // Return explicitly instead of silently falling through into the
            // f16 case: the answer for bf16 must not depend on what the next
            // case label happens to return on this target.
            return false;
#endif
#elif DNNL_AARCH64_USE_ACL
            return arm_compute::CPUInfo::get().has_bf16();
#else
            return false;
#endif
        case data_type::f16:
#if DNNL_X64
            return x64::mayiuse(x64::avx512_core_fp16);
#elif DNNL_AARCH64_USE_ACL
            return arm_compute::CPUInfo::get().has_fp16();
#else
            return false;
#endif
        default: return true;
    }
}
// Returns the scale factor applied to s8s8 weights.
// On x64 the factor is 1.0 when either avx512_core_vnni or avx2_vnni is
// usable and 0.5 otherwise; on every other target it is 1.0.
float s8s8_weights_scale_factor() {
#if DNNL_X64
    const bool vnni_available = x64::mayiuse(x64::avx512_core_vnni)
            || x64::mayiuse(x64::avx2_vnni);
    if (vnni_available) return 1.0f;
    return 0.5f;
#else
    return 1.0f;
#endif
}
// Returns the per-core data cache size, in bytes, for cache `level`
// (1-based).
//
// On x64 the size reported for the level is divided by the number of cores
// sharing that cache; a level outside [1, number of detected levels] yields 0.
// When no cache levels are detected, or on non-x64 builds, a fixed estimate
// is used instead: 32 KiB for L1, 512 KiB for L2, 1 MiB for L3, 0 otherwise.
unsigned get_per_core_cache_size(int level) {
    // Fixed estimate used whenever the hardware cannot be queried.
    const auto fallback_size = [](int cache_level) -> unsigned {
        if (cache_level == 1) return 32U * 1024;
        if (cache_level == 2) return 512U * 1024;
        if (cache_level == 3) return 1024U * 1024;
        return 0U;
    };

#if DNNL_X64
    const auto detected_levels = x64::cpu().getDataCacheLevels();
    if (detected_levels == 0) return fallback_size(level);

    const bool level_in_range
            = level > 0 && static_cast<unsigned>(level) <= detected_levels;
    if (!level_in_range) return 0;

    // The cache queries are indexed from zero.
    const unsigned cache_idx = static_cast<unsigned>(level - 1);
    return x64::cpu().getDataCacheSize(cache_idx)
            / x64::cpu().getCoresSharingDataCache(cache_idx);
#else
    return fallback_size(level);
#endif
}
// Returns the number of cores reported at Xbyak's CoreLevel.
// The query is available on x64 builds and on AArch64 builds with ACL
// enabled; every other build (including AArch64 without ACL) reports 1.
unsigned get_num_cores() {
#if DNNL_X64
    const unsigned core_count
            = x64::cpu().getNumCores(Xbyak::util::CoreLevel);
#elif DNNL_AARCH64_USE_ACL
    const unsigned core_count
            = aarch64::cpu().getNumCores(Xbyak_aarch64::util::CoreLevel);
#else
    const unsigned core_count = 1;
#endif
    return core_count;
}
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
// Returns the potential maximum number of threads in the user's threadpool.
//
// It is assumed that the number of threads in an actual threadpool will not
// exceed the number of cores in a socket reported by the OS, which may or may
// not be equal to the number of total physical cores in a socket depending on
// the OS configuration (read -- VM environment). In order to simulate the
// number of cores available in such an environment, this function honors the
// process affinity: the result is the smaller of the core count and the
// number of CPUs the process is allowed to run on.
//
// The returned value is always at least 1.
unsigned get_max_threads_to_use() {
    // TODO: the logic below should involve number of sockets to provide exact
    // number of cores on 2+ socket systems.
    int num_cores_per_socket = (int)dnnl::impl::cpu::platform::get_num_cores();
    // It may happen that XByak doesn't get num of threads identified, e.g. for
    // AMD. In order to make threadpool working, we supply an additional
    // condition to have some reasonable number of threads available at
    // primitive descriptor creation time.
    if (num_cores_per_socket == 0)
        num_cores_per_socket
                = static_cast<int>(std::thread::hardware_concurrency());
    // hardware_concurrency() is allowed to return 0 when the value is not
    // computable. Never report an empty threadpool.
    if (num_cores_per_socket <= 0) num_cores_per_socket = 1;

#if defined(_WIN32)
    DWORD_PTR proc_affinity_mask;
    DWORD_PTR sys_affinity_mask;
    if (GetProcessAffinityMask(
                GetCurrentProcess(), &proc_affinity_mask, &sys_affinity_mask)) {
        // Count the set bits of the process mask; stop once none remain.
        int masked_nthr = 0;
        for (; proc_affinity_mask != 0; proc_affinity_mask >>= 1)
            masked_nthr += static_cast<int>(proc_affinity_mask & 1);
        // The call can succeed with a zero mask when the threads of the
        // process span several processor groups. Treat that as "no affinity
        // restriction" rather than as zero usable threads.
        if (masked_nthr > 0)
            return std::min(masked_nthr, num_cores_per_socket);
    }
#elif defined(__GLIBC__)
    cpu_set_t cpu_set;
    // Check if the affinity of the process has been set using, e.g.,
    // numactl.
    if (::sched_getaffinity(0, sizeof(cpu_set_t), &cpu_set) == 0)
        return std::min(CPU_COUNT(&cpu_set), num_cores_per_socket);
#endif
    return num_cores_per_socket;
}
#endif
// Returns the `vlen` trait (vector register size) of the widest vector ISA
// that may be used on this CPU, or 0 when no known vector ISA is available
// (including builds that are neither x64 nor AArch64).
//
// Both branches probe from the widest ISA to the narrowest so that the first
// match is the largest register size.
int get_vector_register_size() {
#if DNNL_X64
    using namespace x64;
    if (mayiuse(avx512_core)) return cpu_isa_traits_t<avx512_core>::vlen;
    if (mayiuse(avx)) return cpu_isa_traits_t<avx>::vlen;
    if (mayiuse(sse41)) return cpu_isa_traits_t<sse41>::vlen;
#elif DNNL_AARCH64
    using namespace aarch64;
    // SVE is probed before ASIMD: with ASIMD checked first the SVE branches
    // could never be taken on CPUs that support both.
    // NOTE(review): assumes mayiuse(asimd) holds on every SVE-capable CPU --
    // confirm against the aarch64 ISA traits.
    if (mayiuse(sve_512)) return cpu_isa_traits<sve_512>::vlen;
    if (mayiuse(sve_256)) return cpu_isa_traits<sve_256>::vlen;
    if (mayiuse(asimd)) return cpu_isa_traits<asimd>::vlen;
#endif
    return 0;
}
/* Returns a very cheap timestamp (used primarily for primitive cache).
 *
 * For DNNL_X64 the value comes from *rdtsc*, since it provides a timestamp
 * that (i) is independent for each core, and (ii) is synchronized across cores
 * in multiple sockets.
 *
 * TODO: For now, other builds use the tick count of
 * std::chrono::steady_clock; a more optimized function may be called here
 * instead.
 */
size_t get_timestamp() {
#if DNNL_X64
    const auto tsc_ticks = Xbyak::util::Clock::getRdtsc();
    return static_cast<size_t>(tsc_ticks);
#else
    const auto since_epoch
            = std::chrono::steady_clock::now().time_since_epoch();
    return static_cast<size_t>(since_epoch.count());
#endif
}
} // namespace platform
} // namespace cpu
} // namespace impl
} // namespace dnnl