-
Notifications
You must be signed in to change notification settings - Fork 1k
/
Copy pathkernel.hpp
272 lines (227 loc) · 9.1 KB
/
kernel.hpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
/*******************************************************************************
* Copyright 2019-2025 Intel Corporation
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*******************************************************************************/
#ifndef GPU_INTEL_COMPUTE_KERNEL_HPP
#define GPU_INTEL_COMPUTE_KERNEL_HPP
#if defined(__linux__) && (defined(DNNL_DEV_MODE) || !defined(NDEBUG))
#include <unistd.h>
#endif
#include <functional>
#include <memory>
#include <utility>
#include "common/utils.hpp"
#include "common/verbose.hpp"
#include "gpu/intel/compute/kernel_arg_list.hpp"
#include "gpu/intel/compute/utils.hpp"
#include "gpu/intel/utils.hpp"
#include "xpu/context.hpp"
#include "xpu/utils.hpp"
namespace dnnl {
namespace impl {
namespace gpu {
namespace intel {
namespace compute {
#if defined(__linux__) && (defined(DNNL_DEV_MODE) || !defined(NDEBUG))
struct program_src_t {
program_src_t() = default;
program_src_t(const std::string &src_str) {
// Only enable if gdb-oneapi debugging is active
if (getenv_int("ZET_ENABLE_PROGRAM_DEBUGGING", 0) == 0) return;
const int name_size = 29;
char name[name_size] = "/tmp/dnnl_ocl_jit_src.XXXXXX";
// Ensure /tmp is a valid target for writing a temporary file
bool is_symlink = false;
status_t status = check_for_symlinks("/tmp", &is_symlink);
if (status != status::success || is_symlink) return;
// Guaranteed to have permissions 600 per the mkstemp specification,
// which is the minimum required for writing and then subsequently
// reading when debugging.
int fd = mkstemp(name);
if (fd == -1) return;
auto delete_fd = [&](int fd, char *name) {
// Unlink is called before close to ensure the file always exists
// and cannot be replaced with another file
unlink(name);
close(fd);
};
if (write(fd, src_str.c_str(), src_str.length()) == -1) {
delete_fd(fd, name);
return;
}
if (fsync(fd) == -1) {
delete_fd(fd, name);
return;
}
auto deleter = [&](char *name) {
delete_fd(fd, name);
delete[] name;
};
name_ = std::shared_ptr<char>(new char[name_size], deleter);
std::memcpy(name_.get(), name, name_size);
}
operator bool() const { return name_ != nullptr; };
const char *name() const { return name_.get(); }
private:
std::shared_ptr<char> name_;
};
#else
/// Inert variant of program_src_t used when the temporary-file implementation
/// is compiled out: it never creates a file and always reports itself as
/// empty.
struct program_src_t {
    program_src_t() = default;

    /// Accepts the kernel source and discards it.
    program_src_t(const std::string & /*src_str*/) {}

    /// Always false: there is never a source file behind this object.
    operator bool() const { return false; }

    /// Always nullptr: there is no file path to report.
    const char *name() const { return nullptr; }
};
#endif
/// Polymorphic base for kernel implementations.
///
/// Every virtual entry point has a default body that either reports
/// "unimplemented" (asserts and returns status::runtime_error / "unknown") or
/// does nothing, so a derived class overrides only what it supports.
/// Instances are neither copyable nor copy-assignable.
class kernel_impl_t {
public:
    kernel_impl_t() = default;
    kernel_impl_t(const kernel_impl_t &) = delete;
    kernel_impl_t &operator=(const kernel_impl_t &) = delete;
    virtual ~kernel_impl_t() = default;

    /// Launches the kernel over @p range with @p arg_list; @p deps and
    /// @p out_dep carry the input and output events.
    /// Default: asserts and returns status::runtime_error.
    virtual status_t parallel_for(impl::stream_t &stream,
            const nd_range_t &range, const kernel_arg_list_t &arg_list,
            const xpu::event_t &deps, xpu::event_t &out_dep) {
        gpu_assert(false) << "unimplemented function parallel_for() called";
        return status::runtime_error;
    }

    /// Launches work described by the callable @p cgf on @p stream.
    /// Default: asserts and returns status::runtime_error.
    virtual status_t parallel_for(
            impl::stream_t &stream, const std::function<void(void *)> &cgf) {
        gpu_assert(false) << "unimplemented function parallel_for() called";
        return status::runtime_error;
    }

    /// Reports the size of the kernel binary through @p binary_size.
    /// Default: asserts and returns status::runtime_error.
    virtual status_t get_binary_size(
            const impl::engine_t *engine, size_t *binary_size) const {
        gpu_assert(false) << "unimplemented function get_binary_size() called";
        return status::runtime_error;
    }

    /// Retrieves the kernel binary into @p binary.
    /// Default: asserts and returns status::runtime_error.
    virtual status_t get_binary(
            const impl::engine_t *engine, xpu::binary_t &binary) const {
        gpu_assert(false) << "unimplemented function get_binary() called";
        return status::runtime_error;
    }

    /// Expected scalar types of the kernel arguments, indexed by argument
    /// position. Default: an empty list, which disables argument validation
    /// in check_scalar_arguments().
    virtual const std::vector<scalar_type_t> &arg_types() const {
        static const std::vector<scalar_type_t> dummy;
        return dummy;
    }

    /// Default: does nothing.
    virtual void save_output_events() {}

    /// Default: asserts and returns status::runtime_error.
    virtual status_t dump() const {
        gpu_assert(false) << "unimplemented function dump() called";
        return status::runtime_error;
    }

    /// Kernel name. Default: asserts and returns "unknown".
    virtual std::string name() const {
        gpu_assert(false) << "unimplemented function name() called";
        return "unknown";
    }

    /// Verifies that every scalar argument in @p arg_list has the type the
    /// kernel expects at that position.
    ///
    /// Global, local and SVM-pointer arguments are not checked. Arguments
    /// whose expected type is `undef`, and arguments at positions for which
    /// no expected type is recorded, are not checked either.
    ///
    /// @returns status::success when no mismatch was found,
    ///     status::invalid_arguments on the first mismatch (also logged).
    status_t check_scalar_arguments(const kernel_arg_list_t &arg_list) const {
        // Query the (virtual) type list once instead of on every iteration.
        const std::vector<scalar_type_t> &expected_types = arg_types();

        // Some kernels may not support argument validation.
        if (expected_types.empty()) return status::success;

        const int nargs = arg_list.nargs();
        for (int i = 0; i < nargs; i++) {
            // No type information is recorded past the end of the list;
            // stop here rather than reading out of bounds.
            if (static_cast<size_t>(i) >= expected_types.size()) break;

            const auto &arg = arg_list.get(i);
            if (arg.is_global() || arg.is_local() || arg.is_svm_pointer())
                continue;

            const auto req_arg_type = expected_types[i];
            // Types of kernel arguments may not be available when zebin
            // is used.
            if (req_arg_type == gpu::intel::compute::scalar_type_t::undef)
                continue;

            if (req_arg_type != arg.scalar_type()) {
                VERROR(primitive, gpu,
                        "%s: scalar kernel argument #%d (%s) is "
                        "different from the type of the given scalar (%s)",
                        name().c_str(), i, to_string(req_arg_type).c_str(),
                        to_string(arg.scalar_type()).c_str());
                return status::invalid_arguments;
            }
        }
        return status::success;
    }
};
/// Copyable handle that shares ownership of a kernel_impl_t and forwards
/// calls to it.
///
/// A default-constructed or nullptr-constructed handle is empty; test it with
/// `operator bool()` first. Every forwarding member below dereferences the
/// implementation without a null check, so calling one on an empty handle is
/// undefined behaviour.
class kernel_t {
public:
    kernel_t(std::nullptr_t) : impl_(nullptr) {}
    // Taken by const reference so that const shared pointers can be used as
    // well; the implementation is shared, not stolen.
    kernel_t(const std::shared_ptr<kernel_impl_t> &impl) : impl_(impl) {}
    kernel_t(std::shared_ptr<kernel_impl_t> &&impl) : impl_(std::move(impl)) {}
    kernel_t() = default;
    kernel_t(kernel_t &&other) = default;
    kernel_t(const kernel_t &other) = default;
    kernel_t &operator=(const kernel_t &other) = default;
    kernel_t &operator=(kernel_t &&other) = default;
    virtual ~kernel_t() = default;

    /// True when the handle refers to an implementation.
    operator bool() const { return bool(impl_); }

    /// Non-owning pointer to the implementation (nullptr for an empty handle).
    kernel_impl_t *impl() const { return impl_.get(); }

    /// Forwards to kernel_impl_t::parallel_for (nd-range overload).
    status_t parallel_for(impl::stream_t &stream, const nd_range_t &range,
            const kernel_arg_list_t &arg_list, const xpu::event_t &deps,
            xpu::event_t &out_dep) const {
        return impl_->parallel_for(stream, range, arg_list, deps, out_dep);
    }

    /// Forwards to kernel_impl_t::parallel_for (callable overload).
    status_t parallel_for(impl::stream_t &stream,
            const std::function<void(void *)> &cgf) const {
        return impl_->parallel_for(stream, cgf);
    }

    /// Forwards to kernel_impl_t::get_binary_size.
    status_t get_binary_size(
            const impl::engine_t *engine, size_t *binary_size) const {
        return impl_->get_binary_size(engine, binary_size);
    }

    /// Forwards to kernel_impl_t::get_binary.
    status_t get_binary(
            const impl::engine_t *engine, xpu::binary_t &binary) const {
        return impl_->get_binary(engine, binary);
    }

    /// Forwards to kernel_impl_t::arg_types.
    const std::vector<scalar_type_t> &arg_types() const {
        return impl_->arg_types();
    }

    /// Forwards to kernel_impl_t::save_output_events.
    void save_output_events() { impl_->save_output_events(); }

    /// Dumps the kernel through the implementation, but only when JIT dumping
    /// is enabled; otherwise returns status::success without touching the
    /// implementation.
    status_t dump() const {
        if (!gpu_utils::is_jit_dump_enabled()) return status::success;
        return impl_->dump();
    }

    /// Forwards to kernel_impl_t::name.
    std::string name() const { return impl_->name(); }

private:
    std::shared_ptr<kernel_impl_t> impl_;
};
class kernel_bundle_t {
public:
kernel_bundle_t() = default;
kernel_bundle_t(std::vector<kernel_t> &&kernels,
const std::vector<const char *> &kernel_names) {
for (size_t i = 0; i < kernels.size(); i++) {
bundle[kernel_names[i]] = std::move(kernels[i]);
}
}
// Copies may be expensive, require explicit clone
kernel_bundle_t(const kernel_bundle_t &other) = delete;
kernel_bundle_t &operator=(const kernel_bundle_t &other) = delete;
kernel_bundle_t(kernel_bundle_t &&other) = default;
kernel_bundle_t &operator=(kernel_bundle_t &&other) = default;
~kernel_bundle_t() = default;
status_t get_kernels(std::vector<kernel_t> &kernels,
const std::vector<const char *> &kernel_names) const {
kernels = std::vector<kernel_t>(kernel_names.size());
for (size_t i = 0; i < kernel_names.size(); i++) {
if (!kernel_names[i]) continue;
auto kernel_entry = bundle.find(kernel_names[i]);
if (kernel_entry == bundle.end()) return status::runtime_error;
kernels[i] = kernel_entry->second;
}
return status::success;
}
std::unordered_map<std::string, kernel_t> bundle;
};
} // namespace compute
} // namespace intel
} // namespace gpu
} // namespace impl
} // namespace dnnl
#endif // GPU_INTEL_COMPUTE_KERNEL_HPP