Skip to content

Commit 41d3ce5

Browse files
committed
cpu: pooling: fix crashes of large tensor processing
1 parent 191fc46 commit 41d3ce5

File tree

4 files changed

+50
-38
lines changed

src/cpu/nchw_pooling.cpp

+6-4
Original file line numberDiff line numberDiff line change
@@ -66,7 +66,7 @@ status_t nchw_pooling_fwd_t<data_type::f32>::execute_forward(
6666
const dim_t padT = pd()->padT();
6767
const dim_t padL = pd()->padL();
6868

69-
const auto apply_offset = [](int index, int offset) {
69+
const auto apply_offset = [](dim_t index, dim_t offset) {
7070
return (index > offset) ? index - offset : 0;
7171
};
7272

@@ -270,7 +270,7 @@ status_t nchw_pooling_fwd_t<d_type>::execute_forward(
270270
const size_t blocked_size = src_size / simd_w;
271271
const size_t tail_size = src_size % simd_w;
272272

273-
auto apply_offset = [=](int index, int offset) {
273+
auto apply_offset = [=](dim_t index, dim_t offset) {
274274
return (index > offset) ? index - offset : 0;
275275
};
276276

@@ -469,7 +469,7 @@ status_t nchw_pooling_bwd_t<data_type::f32>::execute_backward(
469469
const dim_t padT = pd()->padT();
470470
const dim_t padL = pd()->padL();
471471

472-
auto apply_offset = [=](int index, int offset) {
472+
auto apply_offset = [=](dim_t index, dim_t offset) {
473473
return (index > offset) ? index - offset : 0;
474474
};
475475

@@ -622,7 +622,7 @@ status_t nchw_pooling_bwd_t<d_type>::execute_backward(
622622
const size_t dst_sp_size = pd()->OD() * pd()->OH() * pd()->OW();
623623
const size_t src_sp_size = pd()->ID() * pd()->IH() * pd()->IW();
624624

625-
auto apply_offset = [=](int index, int offset) {
625+
auto apply_offset = [=](dim_t index, dim_t offset) {
626626
return (index > offset) ? index - offset : 0;
627627
};
628628

@@ -704,6 +704,7 @@ status_t nchw_pooling_bwd_t<d_type>::execute_backward(
704704
if (alg == alg_kind::pooling_max) {
705705
parallel_nd_ext(nthr, MB, utils::div_up(C, c_blk),
706706
[&](int ithr, int, dim_t mb, dim_t cb) {
707+
assert(ithr < pd()->nbuf_);
707708
bool is_last_c_block
708709
= c_blk_tail > 0 && (cb + 1) * c_blk > C;
709710
dim_t curr_c_block = is_last_c_block ? c_blk_tail : c_blk;
@@ -740,6 +741,7 @@ status_t nchw_pooling_bwd_t<d_type>::execute_backward(
740741
} else {
741742
parallel_nd_ext(nthr, MB, utils::div_up(C, c_blk),
742743
[&](int ithr, int, dim_t mb, dim_t cb) {
744+
assert(ithr < pd()->nbuf_);
743745
bool is_last_c_block
744746
= c_blk_tail > 0 && (cb + 1) * c_blk > C;
745747
dim_t curr_c_block = is_last_c_block ? c_blk_tail : c_blk;

src/cpu/nchw_pooling.hpp

+24-15
Original file line numberDiff line numberDiff line change
@@ -174,8 +174,9 @@ struct nchw_pooling_bwd_t : public primitive_t {
174174
return status::success;
175175
}
176176

177-
dim_t channel_block_size_;
177+
dim_t channel_block_size_ {1};
178178
int nthr_; // To not exceed the limit in execute used for set up.
179+
int nbuf_ {0};
179180

180181
private:
181182
void init_scratchpad() {
@@ -185,26 +186,34 @@ struct nchw_pooling_bwd_t : public primitive_t {
185186
size_t src_sz_ = ID() * IH() * IW();
186187
auto scratchpad = scratchpad_registry().registrar();
187188

189+
// nbuf_ must stay consistent with the arguments passed to the
190+
// parallel_nd_ext calls in execute_backward for data_type != f32
191+
nbuf_ = nstl::min(static_cast<dim_t>(nthr_),
192+
MB() * utils::div_up(IC(), channel_block_size_));
193+
188194
scratchpad.template book<float>(key_pool_src_bf16cvt,
189-
src_sz_ * nthr_ * channel_block_size_);
195+
src_sz_ * nbuf_ * channel_block_size_);
190196
scratchpad.template book<float>(key_pool_dst_bf16cvt,
191-
dst_sz_ * nthr_ * channel_block_size_);
197+
dst_sz_ * nbuf_ * channel_block_size_);
192198
}
193199
}
194200

195201
void calculate_channel_block_size() {
196-
// calculate channels block size at which the data fits into half
197-
// of L1, it allows to improve performance for problems with small
198-
// spatial
199-
dim_t dst_sz_ = OD() * OH() * OW();
200-
dim_t src_sz_ = ID() * IH() * IW();
201-
dim_t C_per_thr = nstl::min(MB() * IC() / nthr_, IC());
202-
const dim_t max_block_size
203-
= platform::get_per_core_cache_size(1) / 2;
204-
dim_t data_size_per_ch = (dst_sz_ + src_sz_) * 6; // f32 + bf16
205-
channel_block_size_ = nstl::max(
206-
nstl::min(C_per_thr, max_block_size / data_size_per_ch),
207-
(dim_t)1);
202+
using namespace memory_tracking::names;
203+
if (diff_dst_md()->data_type != data_type::f32) {
204+
// calculate channels block size at which the data fits into half
205+
// of L1, it allows to improve performance for problems with small
206+
// spatial
207+
dim_t dst_sz_ = OD() * OH() * OW();
208+
dim_t src_sz_ = ID() * IH() * IW();
209+
dim_t C_per_thr = nstl::min(MB() * IC() / nthr_, IC());
210+
const dim_t max_block_size
211+
= platform::get_per_core_cache_size(1) / 2;
212+
dim_t data_size_per_ch = (dst_sz_ + src_sz_) * 6; // f32 + bf16
213+
channel_block_size_ = nstl::max(
214+
nstl::min(C_per_thr, max_block_size / data_size_per_ch),
215+
(dim_t)1);
216+
}
208217
}
209218
};
210219

src/cpu/nhwc_pooling.cpp

+12-12
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*******************************************************************************
2-
* Copyright 2019-2024 Intel Corporation
2+
* Copyright 2019-2025 Intel Corporation
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -55,39 +55,39 @@ namespace cpu {
5555
= MEM_D(name).blocking_desc().strides[ndims - 1];
5656

5757
namespace nhwc_pooling {
58-
size_t strided_offset(const int _n, const size_t _sn, const int _d,
59-
const size_t _sd, const int _h, const size_t _sh, const int _w,
58+
size_t strided_offset(const dim_t _n, const size_t _sn, const dim_t _d,
59+
const size_t _sd, const dim_t _h, const size_t _sh, const dim_t _w,
6060
const size_t _sw) {
6161
return _n * _sn + _d * _sd + _h * _sh + _w * _sw;
6262
}
6363
} // namespace nhwc_pooling
6464

6565
template <data_type_t d_type>
66-
void nhwc_pooling_fwd_t<d_type>::array_div_by_const(const int n,
66+
void nhwc_pooling_fwd_t<d_type>::array_div_by_const(const dim_t n,
6767
const ker_data_t *src, const size_t num, ker_data_t *dst) const {
68-
for (int i = 0; i < n; ++i) {
68+
for (dim_t i = 0; i < n; ++i) {
6969
const float ftmp = ((float)src[i]) / num;
7070
dst[i] = q10n::out_round<ker_data_t>(ftmp);
7171
}
7272
}
7373

7474
template <data_type_t d_type>
7575
void nhwc_pooling_fwd_t<d_type>::array_add(
76-
const int n, const ker_data_t *src, ker_data_t *dst) const {
77-
for (int i = 0; i < n; ++i) {
76+
const dim_t n, const ker_data_t *src, ker_data_t *dst) const {
77+
for (dim_t i = 0; i < n; ++i) {
7878
dst[i] += src[i];
7979
}
8080
}
8181

8282
template <data_type_t d_type>
83-
void nhwc_pooling_fwd_t<d_type>::array_nhwc_max(const int n, ker_data_t *dst,
83+
void nhwc_pooling_fwd_t<d_type>::array_nhwc_max(const dim_t n, ker_data_t *dst,
8484
const ker_data_t *src, unsigned char *ws, const size_t ws_offset,
8585
const data_type_t ws_dt, const int index) const {
8686
assert(ws);
8787
#if SAFE_TO_USE_OMP_SIMD
8888
PRAGMA_OMP_SIMD()
8989
#endif
90-
for (int oc = 0; oc < n; ++oc) {
90+
for (dim_t oc = 0; oc < n; ++oc) {
9191
const auto s = src[oc];
9292
ker_data_t mv = dst[oc];
9393

@@ -130,14 +130,14 @@ void nhwc_pooling_fwd_t<d_type>::array_nhwc_max(const int n, ker_data_t *dst,
130130
}
131131

132132
template <data_type_t d_type>
133-
void nhwc_pooling_fwd_t<d_type>::array_nhwc_initialize(const int n,
133+
void nhwc_pooling_fwd_t<d_type>::array_nhwc_initialize(const dim_t n,
134134
ker_data_t *dst, unsigned char *ws, const size_t ws_offset,
135135
const data_type_t ws_dt) const {
136136
assert(ws && (ws_dt == data_type::u8 || ws_dt == data_type::s32));
137137
#if SAFE_TO_USE_OMP_SIMD
138138
PRAGMA_OMP_SIMD()
139139
#endif
140-
for (int oc = 0; oc < n; ++oc) {
140+
for (dim_t oc = 0; oc < n; ++oc) {
141141
if (ws_dt == data_type::u8)
142142
ws[ws_offset + oc] = 0;
143143
else
@@ -189,7 +189,7 @@ status_t nhwc_pooling_fwd_t<data_type::f32>::execute_forward(
189189
DECLARE_READ_STRIDES(src);
190190
DECLARE_READ_STRIDES(dst);
191191

192-
const auto apply_offset = [](int index, int offset) {
192+
const auto apply_offset = [](dim_t index, dim_t offset) {
193193
return (index > offset) ? index - offset : 0;
194194
};
195195

src/cpu/nhwc_pooling.hpp

+8-7
Original file line numberDiff line numberDiff line change
@@ -35,8 +35,8 @@ namespace impl {
3535
namespace cpu {
3636

3737
namespace nhwc_pooling {
38-
size_t strided_offset(const int _n, const size_t _sn, const int _d,
39-
const size_t _sd, const int _h, const size_t _sh, const int _w,
38+
size_t strided_offset(const dim_t _n, const size_t _sn, const dim_t _d,
39+
const size_t _sd, const dim_t _h, const size_t _sh, const dim_t _w,
4040
const size_t _sw);
4141
}
4242

@@ -130,14 +130,15 @@ struct nhwc_pooling_fwd_t : public primitive_t {
130130

131131
private:
132132
status_t execute_forward(const exec_ctx_t &ctx) const;
133-
void array_div_by_const(const int n, const ker_data_t *src,
133+
void array_div_by_const(const dim_t n, const ker_data_t *src,
134134
const size_t num, ker_data_t *dst) const;
135-
void array_add(const int n, const ker_data_t *src, ker_data_t *dst) const;
136-
void array_nhwc_max(const int n, ker_data_t *dst, const ker_data_t *src,
135+
void array_add(const dim_t n, const ker_data_t *src, ker_data_t *dst) const;
136+
void array_nhwc_max(const dim_t n, ker_data_t *dst, const ker_data_t *src,
137137
unsigned char *ws, const size_t ws_offset, const data_type_t ws_dt,
138138
const int index) const;
139-
void array_nhwc_initialize(const int n, ker_data_t *dst, unsigned char *ws,
140-
const size_t ws_offset, const data_type_t ws_dt) const;
139+
void array_nhwc_initialize(const dim_t n, ker_data_t *dst,
140+
unsigned char *ws, const size_t ws_offset,
141+
const data_type_t ws_dt) const;
141142

142143
const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
143144
std::unique_ptr<ref_post_ops_t> ref_post_ops_;

0 commit comments

Comments
 (0)