cpu: pooling: fix crashes of large tensor processing

asimonov1 · asimonov1 · commit 41d3ce571ed4 · 2025-03-18T10:17:36.000+01:00
diff --git a/src/cpu/nchw_pooling.cpp b/src/cpu/nchw_pooling.cpp
@@ -66,7 +66,7 @@ status_t nchw_pooling_fwd_t<data_type::f32>::execute_forward(
     const dim_t padT = pd()->padT();
     const dim_t padL = pd()->padL();
 
-    const auto apply_offset = [](int index, int offset) {
+    const auto apply_offset = [](dim_t index, dim_t offset) {
         return (index > offset) ? index - offset : 0;
     };
 
@@ -270,7 +270,7 @@ status_t nchw_pooling_fwd_t<d_type>::execute_forward(
     const size_t blocked_size = src_size / simd_w;
     const size_t tail_size = src_size % simd_w;
 
-    auto apply_offset = [=](int index, int offset) {
+    auto apply_offset = [=](dim_t index, dim_t offset) {
         return (index > offset) ? index - offset : 0;
     };
 
@@ -469,7 +469,7 @@ status_t nchw_pooling_bwd_t<data_type::f32>::execute_backward(
     const dim_t padT = pd()->padT();
     const dim_t padL = pd()->padL();
 
-    auto apply_offset = [=](int index, int offset) {
+    auto apply_offset = [=](dim_t index, dim_t offset) {
         return (index > offset) ? index - offset : 0;
     };
 
@@ -622,7 +622,7 @@ status_t nchw_pooling_bwd_t<d_type>::execute_backward(
     const size_t dst_sp_size = pd()->OD() * pd()->OH() * pd()->OW();
     const size_t src_sp_size = pd()->ID() * pd()->IH() * pd()->IW();
 
-    auto apply_offset = [=](int index, int offset) {
+    auto apply_offset = [=](dim_t index, dim_t offset) {
         return (index > offset) ? index - offset : 0;
     };
 
@@ -704,6 +704,7 @@ status_t nchw_pooling_bwd_t<d_type>::execute_backward(
     if (alg == alg_kind::pooling_max) {
         parallel_nd_ext(nthr, MB, utils::div_up(C, c_blk),
                 [&](int ithr, int, dim_t mb, dim_t cb) {
+                    assert(ithr < pd()->nbuf_);
                     bool is_last_c_block
                             = c_blk_tail > 0 && (cb + 1) * c_blk > C;
                     dim_t curr_c_block = is_last_c_block ? c_blk_tail : c_blk;
@@ -740,6 +741,7 @@ status_t nchw_pooling_bwd_t<d_type>::execute_backward(
     } else {
         parallel_nd_ext(nthr, MB, utils::div_up(C, c_blk),
                 [&](int ithr, int, dim_t mb, dim_t cb) {
+                    assert(ithr < pd()->nbuf_);
                     bool is_last_c_block
                             = c_blk_tail > 0 && (cb + 1) * c_blk > C;
                     dim_t curr_c_block = is_last_c_block ? c_blk_tail : c_blk;
diff --git a/src/cpu/nchw_pooling.hpp b/src/cpu/nchw_pooling.hpp
@@ -174,8 +174,9 @@ struct nchw_pooling_bwd_t : public primitive_t {
             return status::success;
         }
 
-        dim_t channel_block_size_;
+        dim_t channel_block_size_ {1};
         int nthr_; // To not exceed the limit in execute used for set up.
+        int nbuf_ {0};
 
     private:
         void init_scratchpad() {
@@ -185,26 +186,34 @@ struct nchw_pooling_bwd_t : public primitive_t {
                 size_t src_sz_ = ID() * IH() * IW();
                 auto scratchpad = scratchpad_registry().registrar();
 
+                // nbuf_ must agree with the arguments that execute_backward
+                // passes to parallel_nd_ext when data_type != f32.
+                nbuf_ = nstl::min(static_cast<dim_t>(nthr_),
+                        MB() * utils::div_up(IC(), channel_block_size_));
+
                 scratchpad.template book<float>(key_pool_src_bf16cvt,
-                        src_sz_ * nthr_ * channel_block_size_);
+                        src_sz_ * nbuf_ * channel_block_size_);
                 scratchpad.template book<float>(key_pool_dst_bf16cvt,
-                        dst_sz_ * nthr_ * channel_block_size_);
+                        dst_sz_ * nbuf_ * channel_block_size_);
             }
         }
 
         void calculate_channel_block_size() {
-            // calculate channels block size at which the data fits into half
-            // of L1, it allows to improve performance for problems with small
-            // spatial
-            dim_t dst_sz_ = OD() * OH() * OW();
-            dim_t src_sz_ = ID() * IH() * IW();
-            dim_t C_per_thr = nstl::min(MB() * IC() / nthr_, IC());
-            const dim_t max_block_size
-                    = platform::get_per_core_cache_size(1) / 2;
-            dim_t data_size_per_ch = (dst_sz_ + src_sz_) * 6; // f32 + bf16
-            channel_block_size_ = nstl::max(
-                    nstl::min(C_per_thr, max_block_size / data_size_per_ch),
-                    (dim_t)1);
+            using namespace memory_tracking::names;
+            if (diff_dst_md()->data_type != data_type::f32) {
+                // Calculate the channel block size at which the data fits
+                // into half of L1; this improves performance for problems
+                // with a small spatial size.
+                dim_t dst_sz_ = OD() * OH() * OW();
+                dim_t src_sz_ = ID() * IH() * IW();
+                dim_t C_per_thr = nstl::min(MB() * IC() / nthr_, IC());
+                const dim_t max_block_size
+                        = platform::get_per_core_cache_size(1) / 2;
+                dim_t data_size_per_ch = (dst_sz_ + src_sz_) * 6; // f32 + bf16
+                channel_block_size_ = nstl::max(
+                        nstl::min(C_per_thr, max_block_size / data_size_per_ch),
+                        (dim_t)1);
+            }
         }
     };
 
diff --git a/src/cpu/nhwc_pooling.cpp b/src/cpu/nhwc_pooling.cpp
@@ -1,5 +1,5 @@
 /*******************************************************************************
-* Copyright 2019-2024 Intel Corporation
+* Copyright 2019-2025 Intel Corporation
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -55,39 +55,39 @@ namespace cpu {
             = MEM_D(name).blocking_desc().strides[ndims - 1];
 
 namespace nhwc_pooling {
-size_t strided_offset(const int _n, const size_t _sn, const int _d,
-        const size_t _sd, const int _h, const size_t _sh, const int _w,
+size_t strided_offset(const dim_t _n, const size_t _sn, const dim_t _d,
+        const size_t _sd, const dim_t _h, const size_t _sh, const dim_t _w,
         const size_t _sw) {
     return _n * _sn + _d * _sd + _h * _sh + _w * _sw;
 }
 } // namespace nhwc_pooling
 
 template <data_type_t d_type>
-void nhwc_pooling_fwd_t<d_type>::array_div_by_const(const int n,
+void nhwc_pooling_fwd_t<d_type>::array_div_by_const(const dim_t n,
         const ker_data_t *src, const size_t num, ker_data_t *dst) const {
-    for (int i = 0; i < n; ++i) {
+    for (dim_t i = 0; i < n; ++i) {
         const float ftmp = ((float)src[i]) / num;
         dst[i] = q10n::out_round<ker_data_t>(ftmp);
     }
 }
 
 template <data_type_t d_type>
 void nhwc_pooling_fwd_t<d_type>::array_add(
-        const int n, const ker_data_t *src, ker_data_t *dst) const {
-    for (int i = 0; i < n; ++i) {
+        const dim_t n, const ker_data_t *src, ker_data_t *dst) const {
+    for (dim_t i = 0; i < n; ++i) {
         dst[i] += src[i];
     }
 }
 
 template <data_type_t d_type>
-void nhwc_pooling_fwd_t<d_type>::array_nhwc_max(const int n, ker_data_t *dst,
+void nhwc_pooling_fwd_t<d_type>::array_nhwc_max(const dim_t n, ker_data_t *dst,
         const ker_data_t *src, unsigned char *ws, const size_t ws_offset,
         const data_type_t ws_dt, const int index) const {
     assert(ws);
 #if SAFE_TO_USE_OMP_SIMD
     PRAGMA_OMP_SIMD()
 #endif
-    for (int oc = 0; oc < n; ++oc) {
+    for (dim_t oc = 0; oc < n; ++oc) {
         const auto s = src[oc];
         ker_data_t mv = dst[oc];
 
@@ -130,14 +130,14 @@ void nhwc_pooling_fwd_t<d_type>::array_nhwc_max(const int n, ker_data_t *dst,
 }
 
 template <data_type_t d_type>
-void nhwc_pooling_fwd_t<d_type>::array_nhwc_initialize(const int n,
+void nhwc_pooling_fwd_t<d_type>::array_nhwc_initialize(const dim_t n,
         ker_data_t *dst, unsigned char *ws, const size_t ws_offset,
         const data_type_t ws_dt) const {
     assert(ws && (ws_dt == data_type::u8 || ws_dt == data_type::s32));
 #if SAFE_TO_USE_OMP_SIMD
     PRAGMA_OMP_SIMD()
 #endif
-    for (int oc = 0; oc < n; ++oc) {
+    for (dim_t oc = 0; oc < n; ++oc) {
         if (ws_dt == data_type::u8)
             ws[ws_offset + oc] = 0;
         else
@@ -189,7 +189,7 @@ status_t nhwc_pooling_fwd_t<data_type::f32>::execute_forward(
     DECLARE_READ_STRIDES(src);
     DECLARE_READ_STRIDES(dst);
 
-    const auto apply_offset = [](int index, int offset) {
+    const auto apply_offset = [](dim_t index, dim_t offset) {
         return (index > offset) ? index - offset : 0;
     };
 
diff --git a/src/cpu/nhwc_pooling.hpp b/src/cpu/nhwc_pooling.hpp
@@ -35,8 +35,8 @@ namespace impl {
 namespace cpu {
 
 namespace nhwc_pooling {
-size_t strided_offset(const int _n, const size_t _sn, const int _d,
-        const size_t _sd, const int _h, const size_t _sh, const int _w,
+size_t strided_offset(const dim_t _n, const size_t _sn, const dim_t _d,
+        const size_t _sd, const dim_t _h, const size_t _sh, const dim_t _w,
         const size_t _sw);
 }
 
@@ -130,14 +130,15 @@ struct nhwc_pooling_fwd_t : public primitive_t {
 
 private:
     status_t execute_forward(const exec_ctx_t &ctx) const;
-    void array_div_by_const(const int n, const ker_data_t *src,
+    void array_div_by_const(const dim_t n, const ker_data_t *src,
             const size_t num, ker_data_t *dst) const;
-    void array_add(const int n, const ker_data_t *src, ker_data_t *dst) const;
-    void array_nhwc_max(const int n, ker_data_t *dst, const ker_data_t *src,
+    void array_add(const dim_t n, const ker_data_t *src, ker_data_t *dst) const;
+    void array_nhwc_max(const dim_t n, ker_data_t *dst, const ker_data_t *src,
             unsigned char *ws, const size_t ws_offset, const data_type_t ws_dt,
             const int index) const;
-    void array_nhwc_initialize(const int n, ker_data_t *dst, unsigned char *ws,
-            const size_t ws_offset, const data_type_t ws_dt) const;
+    void array_nhwc_initialize(const dim_t n, ker_data_t *dst,
+            unsigned char *ws, const size_t ws_offset,
+            const data_type_t ws_dt) const;
 
     const pd_t *pd() const { return (const pd_t *)primitive_t::pd().get(); }
     std::unique_ptr<ref_post_ops_t> ref_post_ops_;