
Commit f2eb2bd

t4c1 authored and mgouicem committed
gpu: nvidia: convolution: bugfix sum post op with int8
1 parent 9db41d4 commit f2eb2bd
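
This commit fixes the sum post-op path for int8 convolutions on the NVIDIA (cuDNN) backend: instead of reordering the original dst into a second scratch buffer and summing there, the s8 dst is now accumulated directly into the f32 temp dst via cudnnAddTensor. A minimal sketch of the affected configuration, using the oneDNN C++ API (engine index, shapes, format tags, and the 0.5f sum scale are illustrative, not taken from the commit):

// sketch: int8 convolution with a sum post-op, the case exercised by this fix
#include <unordered_map>
#include "oneapi/dnnl/dnnl.hpp"
using namespace dnnl;

int main() {
    engine eng(engine::kind::gpu, 0); // NVIDIA backend engine (SYCL/CUDA build)
    stream strm(eng);

    // illustrative shapes: 1x32x14x14 s8 src, 64 3x3 s8 filters, s8 dst
    memory::desc src_md({1, 32, 14, 14}, memory::data_type::s8, memory::format_tag::nhwc);
    memory::desc wei_md({64, 32, 3, 3}, memory::data_type::s8, memory::format_tag::ohwi);
    memory::desc dst_md({1, 64, 12, 12}, memory::data_type::s8, memory::format_tag::nhwc);

    // sum post-op: dst = conv(src, wei) + 0.5f * dst_original
    post_ops po;
    po.append_sum(0.5f);
    primitive_attr attr;
    attr.set_post_ops(po);

    auto pd = convolution_forward::primitive_desc(eng,
            prop_kind::forward_inference, algorithm::convolution_direct,
            src_md, wei_md, dst_md,
            /*strides=*/{1, 1}, /*padding_l=*/{0, 0}, /*padding_r=*/{0, 0}, attr);

    memory src_m(src_md, eng), wei_m(wei_md, eng), dst_m(dst_md, eng);
    convolution_forward(pd).execute(strm,
            {{DNNL_ARG_SRC, src_m}, {DNNL_ARG_WEIGHTS, wei_m},
                    {DNNL_ARG_DST, dst_m}});
    strm.wait();
    return 0;
}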

3 files changed: +20 -21 lines changed

src/gpu/nvidia/cudnn_convolution.cpp (-4)
@@ -54,11 +54,8 @@ status_t cudnn_convolution_fwd_t::execute_convolution(

         if (pd()->use_temp_dst()) {
             memory_storage_t *temp_dst_mem = scratch_storage.get();
-            memory_storage_t *temp_reorder_mem = scratch_storage_2.get();
             temp_dst = xpu::sycl::interop_memory_arg_t<
                     ::sycl::access::mode::read_write>(temp_dst_mem, cgh);
-            temp_reorder = xpu::sycl::interop_memory_arg_t<
-                    ::sycl::access::mode::read_write>(temp_reorder_mem, cgh);
         }

         xpu::sycl::interop_memory_arg_t<::sycl::access::mode::read_write>
@@ -85,7 +82,6 @@ status_t cudnn_convolution_fwd_t::execute_convolution(
             args.push_back(arg_scratch.get_native_pointer(ih));
             args.push_back(arg_filter_scratch.get_native_pointer(ih));
             args.push_back(temp_dst.get_native_pointer(ih));
-            args.push_back(temp_reorder.get_native_pointer(ih));
             args.push_back(arg_src_scale.get_native_pointer(ih));
             args.push_back(arg_wei_scale.get_native_pointer(ih));
             args.push_back(arg_dst_scale.get_native_pointer(ih));

src/gpu/nvidia/cudnn_convolution.hpp (-4)
@@ -176,10 +176,6 @@ struct cudnn_convolution_fwd_t : public gpu::primitive_t {
             CHECK(sycl_engine->create_memory_storage(
                     &scratch_ptr, memory_flags_t::alloc, wrap.size(), nullptr));
             scratch_storage.reset(scratch_ptr);
-
-            CHECK(sycl_engine->create_memory_storage(
-                    &scratch_ptr, memory_flags_t::alloc, wrap.size(), nullptr));
-            scratch_storage_2.reset(scratch_ptr);
         }
         if (impl && impl->use_scales_dst()) {
             CHECK(sycl_engine->create_memory_storage(

src/gpu/nvidia/cudnn_convolution_impl.hpp (+20 -13)
@@ -511,14 +511,14 @@ struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t {
         const float beta = 0.0f;
         if (flip_formats) {
             CUDNN_EXECUTE_FUNC_V(cudnnTransformTensor, handle, &alpha,
-                    reorder_dst_desc, src, &beta, descs[y], dst);
+                    reorder_dst_desc, src, &beta, y_fp32_desc, dst);
         } else {
-            CUDNN_EXECUTE_FUNC_V(cudnnTransformTensor, handle, &alpha, descs[y],
-                    src, &beta, reorder_dst_desc, dst);
+            CUDNN_EXECUTE_FUNC_V(cudnnTransformTensor, handle, &alpha,
+                    y_fp32_desc, src, &beta, reorder_dst_desc, dst);
         }
     }

-    void execute_f32_sum(cudnnHandle_t handle, void *y, void *y_fp32_data,
+    void execute_f32_dst_sum(cudnnHandle_t handle, void *y, void *y_fp32_data,
             float alpha_, float beta_) const {
         float alpha1 = 0.0f;
         float alpha2 = alpha_;
@@ -528,6 +528,14 @@ struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t {
                 y_fp32_data);
     }

+    void execute_f32_src_sum(cudnnHandle_t handle, void *x, void *y,
+            float alpha_, float beta_) const {
+        float alpha = alpha_;
+        float beta = beta_;
+        CUDNN_EXECUTE_FUNC_V(cudnnAddTensor, handle, &alpha, descs[io::y], x,
+                &beta, y_fp32_desc, y);
+    }
+
     void execute_eltwise(cudnnHandle_t handle, void *src, void *dst) const {
         float alpha = 1.0f;
         float beta = 0.0f;
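
For reference, cudnnAddTensor computes C = alpha * A + beta * C, so the new helper above yields post_op_scratch(f32) = sum_scale * dst(s8) + conv_result(f32). A standalone sketch of that call pattern, outside oneDNN (shapes, the NHWC format, and the 0.5f scale are assumptions; error handling omitted):

#include <cudnn.h>
#include <cuda_runtime.h>
#include <cstdint>

int main() {
    cudnnHandle_t handle;
    cudnnCreate(&handle);

    const int n = 1, c = 64, h = 12, w = 12;

    // A: the primitive's s8 dst; C: the f32 temp dst scratch
    cudnnTensorDescriptor_t s8_desc, f32_desc;
    cudnnCreateTensorDescriptor(&s8_desc);
    cudnnCreateTensorDescriptor(&f32_desc);
    cudnnSetTensor4dDescriptor(s8_desc, CUDNN_TENSOR_NHWC, CUDNN_DATA_INT8, n, c, h, w);
    cudnnSetTensor4dDescriptor(f32_desc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, n, c, h, w);

    void *dst_s8 = nullptr, *scratch_f32 = nullptr;
    cudaMalloc(&dst_s8, size_t(n) * c * h * w * sizeof(int8_t));
    cudaMalloc(&scratch_f32, size_t(n) * c * h * w * sizeof(float));

    // scratch_f32 = sum_scale * dst_s8 + 1.0f * scratch_f32
    float sum_scale = 0.5f, one = 1.0f;
    cudnnAddTensor(handle, &sum_scale, s8_desc, dst_s8, &one, f32_desc, scratch_f32);

    cudnnDestroyTensorDescriptor(s8_desc);
    cudnnDestroyTensorDescriptor(f32_desc);
    cudaFree(dst_s8);
    cudaFree(scratch_f32);
    cudnnDestroy(handle);
    return 0;
}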
@@ -551,8 +559,7 @@ struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t {
             const std::vector<void *> &args) const override {
         auto x = args[0], weights = args[1], y = args[2], bias = args[3],
              scratchpad = args[4], post_op_scratch = args[6],
-             post_op_reorder = args[7], src_scale = args[8],
-             wei_scale = args[9], dst_scale = args[10];
+             src_scale = args[7], wei_scale = args[8], dst_scale = args[9];
         void *output = use_temp_dst_ ? post_op_scratch : y;
         if (using_transformed_filter()) {
             auto w_scratch = args[5];
@@ -561,7 +568,7 @@ struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t {
         }

         float *y_fp32_data = nullptr;
-        if (y_f32_is_required()) { y_fp32_data = (float *)args[11]; }
+        if (y_f32_is_required()) { y_fp32_data = (float *)args[10]; }

         bool fused = conv_bias || conv_bias_eltwise;

@@ -581,7 +588,8 @@ struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t {
             }
         }

-        auto &y_desc = y_f32_is_required() ? y_fp32_desc : descs[io::y];
+        auto &y_desc = (y_f32_is_required() || use_temp_dst_) ? y_fp32_desc
+                                                              : descs[io::y];
         void *y_data = y_f32_is_required() ? y_fp32_data : output;

         if (fused) {
@@ -619,12 +627,11 @@ struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t {
             switch (post_ops[i]) {
                 case dnnl_sum:
                     if (need_reorder) {
-                        execute_reorder(handle, y, post_op_reorder, true);
-                        execute_sum(handle, post_op_reorder, post_op_scratch,
-                                sum_scale, 1.0f);
+                        execute_f32_src_sum(
+                                handle, y, post_op_scratch, sum_scale, 1.0f);
                     } else if (last_op) {
                         if (y_f32_is_required()) {
-                            execute_f32_sum(
+                            execute_f32_dst_sum(
                                     handle, y, y_fp32_data, 1.0f, sum_scale);
                         } else {
                             execute_sum(handle, post_op_scratch, y, 1.0f,
@@ -687,7 +694,7 @@ struct cudnn_convolution_impl_fwd_t : public cudnn_convolution_impl_base_t {
                 // The scratchpad size will need to be modified in
                 // cases where the dst_scaling is used and the output
                 // uses s8 values.
-                if (use_scales_dst_) {
+                if (use_scales_dst_ || use_temp_dst_) {
                     CHECK(create_and_set_tensor_descriptor(&y_fp32_desc,
                             CUDNN_DATA_FLOAT, ndims[y], dims[y], strides[y]));
                     CHECK(CUDNN_EXECUTE_FUNC_S(cudnnGetConvolutionForwardWorkspaceSize,
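
With use_temp_dst_ now also taking this branch, the forward workspace is queried against a CUDNN_DATA_FLOAT output descriptor even when the primitive's dst is s8. A standalone sketch of that query (descriptor setup, shapes, and the fixed algo choice are assumptions; error handling omitted):

#include <cudnn.h>
#include <cstdio>

int main() {
    cudnnHandle_t handle;
    cudnnCreate(&handle);

    cudnnTensorDescriptor_t x_desc, y_f32_desc;
    cudnnFilterDescriptor_t w_desc;
    cudnnConvolutionDescriptor_t conv_desc;
    cudnnCreateTensorDescriptor(&x_desc);
    cudnnCreateTensorDescriptor(&y_f32_desc);
    cudnnCreateFilterDescriptor(&w_desc);
    cudnnCreateConvolutionDescriptor(&conv_desc);

    cudnnSetTensor4dDescriptor(x_desc, CUDNN_TENSOR_NHWC, CUDNN_DATA_INT8, 1, 32, 14, 14);
    cudnnSetFilter4dDescriptor(w_desc, CUDNN_DATA_INT8, CUDNN_TENSOR_NHWC, 64, 32, 3, 3);
    cudnnSetConvolution2dDescriptor(conv_desc, 0, 0, 1, 1, 1, 1,
            CUDNN_CROSS_CORRELATION, CUDNN_DATA_INT32);
    // output descriptor is f32 even though the primitive's dst is s8
    cudnnSetTensor4dDescriptor(y_f32_desc, CUDNN_TENSOR_NHWC, CUDNN_DATA_FLOAT, 1, 64, 12, 12);

    size_t ws_bytes = 0;
    cudnnGetConvolutionForwardWorkspaceSize(handle, x_desc, w_desc, conv_desc,
            y_f32_desc, CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM, &ws_bytes);
    printf("forward workspace: %zu bytes\n", ws_bytes);

    cudnnDestroyConvolutionDescriptor(conv_desc);
    cudnnDestroyFilterDescriptor(w_desc);
    cudnnDestroyTensorDescriptor(x_desc);
    cudnnDestroyTensorDescriptor(y_f32_desc);
    cudnnDestroy(handle);
    return 0;
}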
