Skip to content

Commit fd25ea4

Browse files
author
Vladimir Paramuzov
authored
[GPU] Sycl op example, base class cleanup, docs update (#26304)
### Details: - Added naive FC sycl impl as an example and it will also be used to verify base class compilation in CI util we have any production sycl op. - Moved sycl readme to common gpu docs folder with some minor updates
1 parent 50896e9 commit fd25ea4

File tree

4 files changed

+287
-50
lines changed

4 files changed

+287
-50
lines changed
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,21 @@
1+
# How to build with DPC++ support
2+
3+
1. Install OneAPI base toolkit. Guide: https://www.intel.com/content/www/us/en/docs/oneapi/installation-guide-linux/2024-0/installation.html
4+
2. Export environment:
5+
$ source /opt/intel/oneapi/setvars.sh
6+
7+
3. Configure cmake with the following additional options:
8+
- [Linux] `-DCMAKE_C_COMPILER:FILEPATH=icx -DCMAKE_CXX_COMPILER:FILEPATH=icpx`
9+
[Windows] `-DCMAKE_C_COMPILER:FILEPATH=icx -DCMAKE_CXX_COMPILER:FILEPATH=icx`
10+
- For now find_package(IntelSYCL) doesn't work if compiler is not icpx, so we need to update compilers globally for the whole project
11+
- `-DENABLE_INTEL_CPU=OFF`
12+
- OneAPI toolkit with OneDNN installed may cause CPU plugin build issue due to weird include files resolver which prefer system onednn intead of
13+
CPU fork which causes build issue. Alternatively, OneDNN can be removed from OneAPI toolkit installation.
14+
- [Linux] `-DCMAKE_CXX_FLAGS:STRING=--gcc-install-dir=/lib/gcc/x86_64-linux-gnu/12/ -DCMAKE_C_FLAGS:STRING=--gcc-install-dir=/lib/gcc/x86_64-linux-gnu/12/`
15+
- This WA is needed if multiple GCC version available in the system
16+
- `-DENABLE_SYSTEM_OPENCL=OFF`
17+
- May help to avoid opencl icd/header conflicts as sycl package may have no clhpp headers
18+
- `-DCMAKE_CXX_COMPILER_LAUNCHER=ccache`
19+
- For some reason with latest OneAPI package versions each `make` call causes full project recompilation, so the recommendation is to enable caching
20+
21+
4. cmake --build . --config Release --parallel

src/plugins/intel_gpu/src/graph/impls/sycl/README.md

-16
This file was deleted.
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,262 @@
1+
// Copyright (C) 2024 Intel Corporation
2+
// SPDX-License-Identifier: Apache-2.0
3+
//
4+
5+
#include "fully_connected_inst.h"
6+
#include "intel_gpu/primitives/reorder.hpp"
7+
#include "ocl/ocl_event.hpp"
8+
#include "ocl/sycl_engine.hpp"
9+
#include "ocl/sycl_stream.hpp"
10+
#include "openvino/core/type/element_type.hpp"
11+
#include "primitive_sycl_base.h"
12+
#include "impls/registry/implementation_map.hpp"
13+
14+
#include "impls/ocl/kernel_selector_helper.h"
15+
16+
#include "sycl/sycl.hpp"
17+
#include "sycl/ext/oneapi/experimental/builtins.hpp"
18+
19+
#include <memory>
20+
21+
#ifdef __SYCL_DEVICE_ONLY__
22+
#define CONSTANT __attribute__((opencl_constant))
23+
#else
24+
#define CONSTANT
25+
#endif
26+
27+
namespace cldnn {
28+
namespace sycl {
29+
30+
template <typename A, typename B>
31+
struct AccumulatorType {
32+
using type = float;
33+
};
34+
35+
template<> struct AccumulatorType<::sycl::half, ::sycl::half> {
36+
using type = ::sycl::half;
37+
};
38+
39+
template<> struct AccumulatorType<::sycl::half, uint8_t> {
40+
using type = ::sycl::half;
41+
};
42+
43+
44+
template<> struct AccumulatorType<::sycl::half, int8_t> {
45+
using type = ::sycl::half;
46+
};
47+
48+
template<typename AType, typename WType, typename ZPType, typename ScaleType, typename DType>
49+
::sycl::event run_fc_int4_woq(::sycl::queue& queue, bool enqueue_barrier, const AType* a, const WType* w, const ZPType* zp, const ScaleType* s, DType* dst,
50+
size_t M, size_t N, size_t K, size_t group_size, size_t groups_num, const ov::Shape& out_shape, optional_value<float> dzp_s) {
51+
if (enqueue_barrier) {
52+
queue.submit([=](::sycl::handler& cgh) {
53+
cgh.ext_oneapi_barrier();
54+
});
55+
}
56+
57+
bool has_value = dzp_s.has_value();
58+
float dzp_value = dzp_s.value_or(0.0f);
59+
return queue.submit([=](::sycl::handler& cgh) {
60+
cgh.parallel_for(::sycl::range<3>(out_shape[0], out_shape[1], out_shape[2]), [=](::sycl::id<3> index) {
61+
const uint b = index[0];
62+
const uint m = index[1];
63+
const uint n = index[2];
64+
using accum_t = typename AccumulatorType<AType, WType>::type;
65+
accum_t accumulator = 0.0f;
66+
67+
const uint dst_index = n + m*N + b*N*M;
68+
for (uint y = 0; y < K; ++y) {
69+
const uint input0_offset = y + m*K + b*M*K;
70+
const uint decomp_offset = (y / group_size % groups_num)*N + n % N;
71+
const uint filter_offset = y + n*K;
72+
const uint zp_offset = 0;
73+
74+
75+
accum_t zp_val = has_value ? static_cast<accum_t>(dzp_value) : static_cast<accum_t>(zp[zp_offset]);
76+
accum_t scale = s[decomp_offset];
77+
const WType packed = w[filter_offset / 2];
78+
79+
const WType v0 = packed & 0x0F;
80+
const WType v1 = (packed & 0xF0) >> 4;
81+
accum_t unpacked = filter_offset % 2 == 0 ? v0 : v1;
82+
83+
accum_t filter_val = (unpacked - zp_val) * scale;
84+
accumulator += a[input0_offset] * filter_val;
85+
}
86+
dst[dst_index] = accumulator;
87+
});
88+
});
89+
}
90+
91+
template<typename AType, typename WType, typename ZPType, typename ScaleType, typename DType>
92+
::sycl::event run_fc_int8_woq(::sycl::queue& queue, bool enqueue_barrier, const AType* a, const WType* w, const ZPType* zp, const ScaleType* s, DType* dst,
93+
size_t M, size_t N, size_t K, size_t group_size, size_t groups_num, const ov::Shape& out_shape, optional_value<float> dzp_s) {
94+
if (enqueue_barrier) {
95+
queue.submit([=](::sycl::handler& cgh) {
96+
cgh.ext_oneapi_barrier();
97+
});
98+
}
99+
100+
bool has_value = dzp_s.has_value();
101+
float dzp_value = dzp_s.value_or(0.0f);
102+
103+
return queue.submit([=](::sycl::handler& cgh) {
104+
cgh.parallel_for(::sycl::range<3>(out_shape[0], out_shape[1], out_shape[2]), [=](::sycl::id<3> index) {
105+
const uint b = index[0];
106+
const uint m = index[1];
107+
const uint n = index[2];
108+
using accum_t = typename AccumulatorType<AType, WType>::type;
109+
accum_t accumulator = 0.0f;
110+
111+
for (uint y = 0; y < K; ++y) {
112+
const uint input0_offset = y + m*K + b*M*K;
113+
const uint zp_offset = (y / group_size % groups_num)*N + n % N;
114+
const uint decomp_offset = (y / group_size % groups_num)*N + n % N;
115+
const uint filter_offset = y + n*K;
116+
117+
accum_t zp_val = has_value ? static_cast<accum_t>(dzp_value) : static_cast<accum_t>(zp[zp_offset]);
118+
accum_t scale = s[decomp_offset];
119+
accum_t filter_compressed = static_cast<accum_t>(w[filter_offset]);
120+
accum_t filter_val = (filter_compressed - zp_val) * scale;
121+
accumulator += a[input0_offset] * filter_val;
122+
}
123+
const uint dst_index = n + m*N + b*N*M;
124+
dst[dst_index] = accumulator;
125+
});
126+
});
127+
}
128+
129+
struct fully_connected_sycl_example : typed_primitive_sycl_impl<fully_connected> {
130+
using parent = typed_primitive_sycl_impl<fully_connected>;
131+
using parent::parent;
132+
133+
DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::sycl::fully_connected_sycl_example)
134+
135+
std::unique_ptr<primitive_impl> clone() const override {
136+
return make_unique<fully_connected_sycl_example>(*this);
137+
}
138+
139+
event::ptr execute_impl(const std::vector<event::ptr>& /* events */, typed_primitive_inst<fully_connected>& instance) override {
140+
auto& network = instance.get_network();
141+
const auto& desc = instance.get_typed_desc<fully_connected>();
142+
143+
auto& stream = downcast<ocl::sycl_stream>(network.get_stream());
144+
auto& engine = downcast<ocl::sycl_engine>(network.get_engine());
145+
::sycl::context sycl_context = engine.get_sycl_context();
146+
::sycl::queue& sycl_queue = stream.get_sycl_queue();
147+
148+
const auto& params = instance.get_impl_params();
149+
auto out_shape = params->output_layouts[0].get_shape();
150+
151+
auto output = instance.output_memory_ptr(0);
152+
auto weights = instance.weights_memory();
153+
auto bias = instance.bias_term() ? instance.bias_memory() : nullptr;
154+
155+
std::vector<memory::ptr> inputs = { instance.input_memory_ptr(0) };
156+
size_t in_id = instance.bias_term() ? 3 : 2;
157+
if (!desc->decompression_scale.empty())
158+
inputs.push_back(instance.dep_memory_ptr(in_id++));
159+
160+
if (!desc->decompression_zero_point.empty())
161+
inputs.push_back(instance.dep_memory_ptr(in_id));
162+
163+
OPENVINO_ASSERT(!instance.bias_term() && !instance.get_node().has_fused_primitives());
164+
165+
ov::element::Type_t in_t = params->input_layouts[0].data_type;
166+
ov::element::Type_t wei_t = params->weights_layout.value().data_type;
167+
ov::element::Type_t out_t = params->output_layouts[0].data_type;
168+
ov::element::Type_t ds_t = params->input_layouts[2].data_type;
169+
ov::element::Type_t dzp_t = inputs.size() == 3 ? params->input_layouts[3].data_type : ov::element::Type_t::undefined;
170+
171+
OPENVINO_ASSERT(out_shape.size() == 3);
172+
size_t M = out_shape[1];
173+
size_t N = out_shape[2];
174+
size_t K = params->weights_layout.value().get_partial_shape()[1].get_length();
175+
size_t groups_num = params->input_layouts[2].get_shape()[1];
176+
size_t group_size = K / groups_num;
177+
178+
OPENVINO_ASSERT(inputs.size() >= 2);
179+
180+
auto dzp_scalar = desc->decompression_zero_point_scalar;
181+
182+
bool barrier = stream.get_queue_type() == QueueTypes::out_of_order;
183+
184+
#define CASE(InputType, WeightsType, ZPType, ScaleType, DstType) \
185+
in_t == ov::element::InputType && \
186+
wei_t == ov::element::WeightsType && \
187+
out_t == ov::element::DstType && \
188+
ds_t == ov::element::ScaleType && \
189+
dzp_t == ov::element::ZPType
190+
191+
if ((CASE(f32, u4, f32, f32, f32)) || (CASE(f32, u4, undefined, f32, f32))) {
192+
const float* in = static_cast<const float*>(inputs[0]->buffer_ptr());
193+
const uint8_t* wei = static_cast<const uint8_t*>(weights->buffer_ptr());
194+
float* out = static_cast<float*>(output->buffer_ptr());
195+
const float* ds = static_cast<const float*>(inputs[1]->buffer_ptr());
196+
const float* dzp = inputs.size() == 3 ? static_cast<const float*>(inputs[2]->buffer_ptr()) : nullptr;
197+
198+
return to_ocl_event(stream, run_fc_int4_woq(sycl_queue, barrier, in, wei, dzp, ds, out, M, N, K, group_size, groups_num, out_shape, dzp_scalar));
199+
} else if ((CASE(f16, u4, f16, f16, f16)) || (CASE(f16, u4, undefined, f16, f16))) {
200+
const ::sycl::half* in = static_cast<const ::sycl::half*>(inputs[0]->buffer_ptr());
201+
const uint8_t* wei = static_cast<const uint8_t*>(weights->buffer_ptr());
202+
::sycl::half* out = static_cast<::sycl::half*>(output->buffer_ptr());
203+
const ::sycl::half* ds = static_cast<const ::sycl::half*>(inputs[1]->buffer_ptr());
204+
const ::sycl::half* dzp = inputs.size() == 3 ? static_cast<const ::sycl::half*>(inputs[2]->buffer_ptr()) : nullptr;
205+
206+
207+
return to_ocl_event(stream, run_fc_int4_woq(sycl_queue, barrier, in, wei, dzp, ds, out, M, N, K, group_size, groups_num, out_shape, dzp_scalar));
208+
} else if ((CASE(f16, u4, f16, f16, f32)) || (CASE(f16, u4, undefined, f16, f32))) {
209+
const ::sycl::half* in = static_cast<const ::sycl::half*>(inputs[0]->buffer_ptr());
210+
const uint8_t* wei = static_cast<const uint8_t*>(weights->buffer_ptr());
211+
float* out = static_cast<float*>(output->buffer_ptr());
212+
const ::sycl::half* ds = static_cast<const ::sycl::half*>(inputs[1]->buffer_ptr());
213+
const ::sycl::half* dzp = inputs.size() == 3 ? static_cast<const ::sycl::half*>(inputs[2]->buffer_ptr()) : nullptr;
214+
215+
216+
return to_ocl_event(stream, run_fc_int4_woq(sycl_queue, barrier, in, wei, dzp, ds, out, M, N, K, group_size, groups_num, out_shape, dzp_scalar));
217+
} else if ((CASE(f32, u8, f32, f32, f32)) || (CASE(f32, u8, undefined, f32, f32))) {
218+
const float* in = static_cast<const float*>(inputs[0]->buffer_ptr());
219+
const uint8_t* wei = static_cast<const uint8_t*>(weights->buffer_ptr());
220+
float* out = static_cast<float*>(output->buffer_ptr());
221+
const float* ds = static_cast<const float*>(inputs[1]->buffer_ptr());
222+
const float* dzp = inputs.size() == 3 ? static_cast<const float*>(inputs[2]->buffer_ptr()) : nullptr;
223+
224+
return to_ocl_event(stream, run_fc_int8_woq(sycl_queue, barrier, in, wei, dzp, ds, out, M, N, K, group_size, groups_num, out_shape, dzp_scalar));
225+
} else if ((CASE(f16, u8, f16, f16, f16)) || (CASE(f16, u8, undefined, f16, f16))) {
226+
const ::sycl::half* in = static_cast<const ::sycl::half*>(inputs[0]->buffer_ptr());
227+
const uint8_t* wei = static_cast<const uint8_t*>(weights->buffer_ptr());
228+
::sycl::half* out = static_cast<::sycl::half*>(output->buffer_ptr());
229+
const ::sycl::half* ds = static_cast<const ::sycl::half*>(inputs[1]->buffer_ptr());
230+
const ::sycl::half* dzp = inputs.size() == 3 ? static_cast<const ::sycl::half*>(inputs[2]->buffer_ptr()) : nullptr;
231+
232+
return to_ocl_event(stream, run_fc_int8_woq(sycl_queue, barrier, in, wei, dzp, ds, out, M, N, K, group_size, groups_num, out_shape, dzp_scalar));
233+
} else if ((CASE(f16, u8, f16, f16, f32)) || (CASE(f16, u8, undefined, f16, f32))) {
234+
const ::sycl::half* in = static_cast<const ::sycl::half*>(inputs[0]->buffer_ptr());
235+
const uint8_t* wei = static_cast<const uint8_t*>(weights->buffer_ptr());
236+
float* out = static_cast<float*>(output->buffer_ptr());
237+
const ::sycl::half* ds = static_cast<const ::sycl::half*>(inputs[1]->buffer_ptr());
238+
const ::sycl::half* dzp = inputs.size() == 3 ? static_cast<const ::sycl::half*>(inputs[2]->buffer_ptr()) : nullptr;
239+
240+
return to_ocl_event(stream, run_fc_int8_woq(sycl_queue, barrier, in, wei, dzp, ds, out, M, N, K, group_size, groups_num, out_shape, dzp_scalar));
241+
} else {
242+
OPENVINO_THROW("No instance for given types found: ", in_t, " ", wei_t, " ", out_t, " ", ds_t, " ", dzp_t);
243+
}
244+
}
245+
246+
static std::shared_ptr<WeightsReorderParams> get_weights_reorder(const kernel_impl_params& impl_params) {
247+
auto source_weights_layout = impl_params.get_input_layout(1);
248+
auto target_weights_layout = source_weights_layout;
249+
target_weights_layout.format = format::oiyx;
250+
251+
return std::make_shared<WeightsReorderParams>(source_weights_layout, target_weights_layout);
252+
}
253+
254+
static std::unique_ptr<primitive_impl> create(const fully_connected_node& arg, const kernel_impl_params& impl_params) {
255+
auto& engine = impl_params.prog->get_engine();
256+
auto& config = impl_params.prog->get_config();
257+
return cldnn::make_unique<fully_connected_sycl_example>(engine, config, get_weights_reorder(impl_params));
258+
}
259+
};
260+
261+
} // namespace sycl
262+
} // namespace cldnn

src/plugins/intel_gpu/src/graph/impls/sycl/primitive_sycl_base.h

+4-34
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,6 @@
77
#include "primitive_inst.h"
88
#include "intel_gpu/runtime/memory.hpp"
99
#include "register.hpp"
10-
#include "utils.hpp"
1110
#include "runtime/ocl/ocl_event.hpp"
1211

1312
#include <vector>
@@ -27,50 +26,21 @@ struct typed_primitive_sycl_impl : public typed_primitive_impl<PType> {
2726
: typed_primitive_impl<PType>(weights_reorder, "sycl_kernel"),
2827
_engine(&engine) { }
2928

30-
typed_primitive_sycl_impl()
31-
: typed_primitive_impl<PType>({}, "undef"),
32-
_engine(nullptr) {
29+
typed_primitive_sycl_impl() : typed_primitive_impl<PType>({}, "undef"), _engine(nullptr) {
3330
}
3431

3532
bool is_cpu() const override { return false; }
3633
bool is_onednn() const override { return false; }
3734

38-
void save(BinaryOutputBuffer& ob) const override {
39-
}
40-
41-
void load(BinaryInputBuffer& ib) override {
42-
}
43-
4435
protected:
4536
void init_kernels(const kernels_cache&, const kernel_impl_params&) override { }
4637

47-
void set_arguments_impl(typed_primitive_inst<PType>& instance) override {
48-
if (instance.can_be_optimized())
49-
return;
50-
}
51-
52-
void update_dispatch_data(const kernel_impl_params& impl_params) override {}
53-
54-
void set_arguments_impl(typed_primitive_inst<PType>& instance, kernel_arguments_data& args) override {
55-
if (instance.can_be_optimized()) {
56-
return;
57-
}
58-
}
59-
60-
event::ptr execute_impl(const std::vector<event::ptr>& /* events */,
61-
typed_primitive_inst<PType>& instance) override {
62-
auto& network = instance.get_network();
63-
auto& stream = network.get_stream();
64-
auto net_id = network.get_id();
65-
event::ptr event;
66-
67-
68-
return event;
69-
}
38+
void set_arguments_impl(typed_primitive_inst<PType>& instance) override { }
39+
void set_arguments_impl(typed_primitive_inst<PType>& instance, kernel_arguments_data& args) override { }
7040

7141
static event::ptr to_ocl_event(stream& stream, ::sycl::event e) {
7242
if (stream.get_queue_type() == QueueTypes::out_of_order) {
73-
auto native_events = get_native<::sycl::backend::opencl, ::sycl::event>(e);
43+
auto native_events = ::sycl::get_native<::sycl::backend::opencl, ::sycl::event>(e);
7444
std::vector<event::ptr> events;
7545
for (auto& e : native_events) {
7646
events.push_back(std::make_shared<ocl::ocl_event>(cl::Event(e, true)));

0 commit comments

Comments
 (0)