Skip to content

Commit 10d508a

Browse files
antonvorxczhai
authored and committed
[FIX] added some legacy parallel methods to fix perf issues
- gemm conv im2col() - simple concat
1 parent ee7f37c commit 10d508a

4 files changed

+187
-8
lines changed

src/common/dnnl_thread.hpp

+169
Original file line numberDiff line numberDiff line change
@@ -207,6 +207,10 @@ inline int dnnl_get_current_num_threads() {
207207
#define simdlen(x)
208208
#endif // long simdlen if
209209

210+
#if defined(DNNL_ENABLE_ITT_TASKS)
211+
#include "common/ittnotify.hpp"
212+
#endif
213+
210214
namespace dnnl {
211215
namespace impl {
212216

@@ -655,6 +659,171 @@ static inline void parallel_nd(dim_t D0, dim_t D1, dim_t D2, dim_t D3, dim_t D4,
655659
});
656660
}
657661

662+
// Legacy parallel driver restored from older oneDNN: invokes f(ithr, nthr)
// on up to `nthr` workers using whichever CPU threading runtime the library
// was built with (SEQ, OMP, TBB, TBB_AUTO, or THREADPOOL).
// @param nthr requested number of threads; clamped by adjust_num_threads().
// @param f    callable taking (int ithr, int nthr); must be thread-safe.
template <typename F>
void parallel_legacy(int nthr, F f) {
    // INT64_MAX work amount => cap only by what the runtime can provide.
    nthr = adjust_num_threads(nthr, INT64_MAX);
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_SEQ
    assert(nthr == 1);
    f(0, 1);
#else
#if defined(DNNL_ENABLE_ITT_TASKS)
    // Capture the current primitive kind so worker threads can report the
    // same ITT task as the master.
    auto task_primitive_kind = itt::primitive_task_get_current_kind();
    bool itt_enable = itt::get_itt(itt::__itt_task_level_high);
#endif
    // Single-thread fast path: no parallel region, no ITT bracketing needed.
    if (nthr == 1) {
        f(0, 1);
        return;
    }
#if DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_OMP
#pragma omp parallel num_threads(nthr)
    {
        int nthr_ = omp_get_num_threads();
        int ithr_ = omp_get_thread_num();
        assert(nthr_ == nthr);
#if defined(DNNL_ENABLE_ITT_TASKS)
        // Only non-master threads (ithr_ != 0) open an ITT task; the master
        // already runs inside one.
        if (ithr_ && itt_enable) itt::primitive_task_start(task_primitive_kind);
#endif
        f(ithr_, nthr_);
#if defined(DNNL_ENABLE_ITT_TASKS)
        if (ithr_ && itt_enable) itt::primitive_task_end();
#endif
    }
#elif DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_TBB
    tbb::parallel_for(
            0, nthr,
            [&](int ithr) {
#if defined(DNNL_ENABLE_ITT_TASKS)
                // TBB may run this lambda on the master thread too; only mark
                // a task where none is currently active.
                bool mark_task = itt::primitive_task_get_current_kind()
                        == primitive_kind::undefined;
                if (mark_task && itt_enable)
                    itt::primitive_task_start(task_primitive_kind);
#endif
                f(ithr, nthr);
#if defined(DNNL_ENABLE_ITT_TASKS)
                if (mark_task && itt_enable) itt::primitive_task_end();
#endif
            },
            // static partitioner => one contiguous chunk per worker.
            tbb::static_partitioner());
#elif DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_TBB_AUTO
    tbb::parallel_for(
            0, nthr, [&](int ithr) { f(ithr, nthr); });
#elif DNNL_CPU_THREADING_RUNTIME == DNNL_RUNTIME_THREADPOOL
    using namespace dnnl::impl::threadpool_utils;
    dnnl::threadpool_interop::threadpool_iface *tp = get_active_threadpool();
    // No attached threadpool, or we are already inside a parallel section:
    // run all iterations serially on this thread (avoids nested parallelism).
    if (!tp || dnnl_in_parallel()) {
        threadpool_utils::deactivate_threadpool();
        for (int ithr = 0; ithr < nthr; ithr++) {
            f(ithr, nthr);
        }
        // NOTE(review): tp may be nullptr here; presumably
        // activate_threadpool(nullptr) is a no-op — confirm.
        threadpool_utils::activate_threadpool(tp);
    } else {
        bool async = tp->get_flags()
                & dnnl::threadpool_interop::threadpool_iface::ASYNCHRONOUS;
        counting_barrier_t b;
        if (async) b.init(nthr);
        tp->parallel_for(nthr, [&, tp](int ithr, int nthr) {
            // Worker threads have no active threadpool set; the master does.
            bool is_master = threadpool_utils::get_active_threadpool() == tp;
            if (!is_master) {
                threadpool_utils::activate_threadpool(tp);
#if defined(DNNL_ENABLE_ITT_TASKS)
                if (itt_enable) itt::primitive_task_start(task_primitive_kind);
#endif
            }
            f(ithr, nthr);
            if (!is_master) {
#if defined(DNNL_ENABLE_ITT_TASKS)
                if (itt_enable) itt::primitive_task_end();
#endif
                threadpool_utils::deactivate_threadpool();
            }
            if (async) b.notify();
        });
        // An asynchronous threadpool returns before the work completes;
        // wait on the counting barrier for all nthr notifications.
        if (async) b.wait();
    }
#endif
#endif
}
746+
747+
// Runs f(d0) over this thread's contiguous share of [0, D0), as assigned by
// the static balance211 partitioning for (ithr, nthr).
template <typename T0, typename F>
void for_nd_legacy(const int ithr, const int nthr, const T0 &D0, F f) {
    T0 first {0};
    T0 last {0};
    balance211(D0, nthr, ithr, first, last);
    for (T0 idx = first; idx < last; ++idx)
        f(idx);
}
754+
755+
template <typename T0, typename T1, typename T2, typename T3, typename F>
756+
void for_nd_legacy(const int ithr, const int nthr, const T0 &D0, const T1 &D1,
757+
const T2 &D2, const T3 &D3, F f) {
758+
const size_t work_amount = (size_t)D0 * D1 * D2 * D3;
759+
if (work_amount == 0) return;
760+
size_t start {0}, end {0};
761+
balance211(work_amount, nthr, ithr, start, end);
762+
763+
T0 d0 {0};
764+
T1 d1 {0};
765+
T2 d2 {0};
766+
T3 d3 {0};
767+
utils::nd_iterator_init(start, d0, D0, d1, D1, d2, D2, d3, D3);
768+
for (size_t iwork = start; iwork < end; ++iwork) {
769+
f(d0, d1, d2, d3);
770+
utils::nd_iterator_step(d0, D0, d1, D1, d2, D2, d3, D3);
771+
}
772+
}
773+
774+
template <typename T0, typename T1, typename T2, typename T3, typename T4,
775+
typename T5, typename F>
776+
void for_nd_legacy(const int ithr, const int nthr, const T0 &D0, const T1 &D1,
777+
const T2 &D2, const T3 &D3, const T4 &D4, const T5 &D5, F f) {
778+
const size_t work_amount = (size_t)D0 * D1 * D2 * D3 * D4 * D5;
779+
if (work_amount == 0) return;
780+
size_t start {0}, end {0};
781+
balance211(work_amount, nthr, ithr, start, end);
782+
783+
T0 d0 {0};
784+
T1 d1 {0};
785+
T2 d2 {0};
786+
T3 d3 {0};
787+
T4 d4 {0};
788+
T5 d5 {0};
789+
utils::nd_iterator_init(
790+
start, d0, D0, d1, D1, d2, D2, d3, D3, d4, D4, d5, D5);
791+
for (size_t iwork = start; iwork < end; ++iwork) {
792+
f(d0, d1, d2, d3, d4, d5);
793+
utils::nd_iterator_step(d0, D0, d1, D1, d2, D2, d3, D3, d4, D4, d5, D5);
794+
}
795+
}
796+
797+
template <typename T0, typename F>
798+
void parallel_nd_legacy(const T0 &D0, F f) {
799+
const size_t work_amount = (size_t)D0;
800+
int nthr = adjust_num_threads(dnnl_get_current_num_threads(), work_amount);
801+
if (nthr)
802+
parallel_legacy(nthr, [&](int ithr, int nthr) { for_nd_legacy(ithr, nthr, D0, f); });
803+
}
804+
805+
template <typename T0, typename T1, typename T2, typename T3, typename F>
806+
void parallel_nd_legacy(const T0 &D0, const T1 &D1, const T2 &D2, const T3 &D3, F f) {
807+
const size_t work_amount = (size_t)D0 * D1 * D2 * D3;
808+
int nthr = adjust_num_threads(dnnl_get_current_num_threads(), work_amount);
809+
if (nthr)
810+
parallel_legacy(nthr, [&](int ithr, int nthr) {
811+
for_nd_legacy(ithr, nthr, D0, D1, D2, D3, f);
812+
});
813+
}
814+
815+
template <typename T0, typename T1, typename T2, typename T3, typename T4,
816+
typename T5, typename F>
817+
void parallel_nd_legacy(const T0 &D0, const T1 &D1, const T2 &D2, const T3 &D3,
818+
const T4 &D4, const T5 &D5, F f) {
819+
const size_t work_amount = (size_t)D0 * D1 * D2 * D3 * D4 * D5;
820+
int nthr = adjust_num_threads(dnnl_get_current_num_threads(), work_amount);
821+
if (nthr)
822+
parallel_legacy(nthr, [&](int ithr, int nthr) {
823+
for_nd_legacy(ithr, nthr, D0, D1, D2, D3, D4, D5, f);
824+
});
825+
}
826+
658827
} // namespace impl
659828
} // namespace dnnl
660829

src/cpu/gemm_convolution_utils.cpp

+6-6
Original file line numberDiff line numberDiff line change
@@ -454,7 +454,7 @@ void im2col_dt_3d(const conv_gemm_conf_t &jcp, const void *__restrict _imtr,
454454
bool with_input_zp = input_zp != nullptr;
455455

456456
if (sd == 1 && sh == 1 && sw == 1 && dd == 1 && dh == 1 && dw == 1)
457-
parallel_nd(jcp.kd, jcp.kh, jcp.kw, jcp.ic,
457+
parallel_nd_legacy(jcp.kd, jcp.kh, jcp.kw, jcp.ic,
458458
[&](dim_t kd, dim_t kh, dim_t kw, dim_t ic) {
459459
col_dt *__restrict col_loc = col + kd * col_kd_s
460460
+ kh * col_kh_s + kw * col_kw_s + ic * col_ic_s;
@@ -484,7 +484,7 @@ void im2col_dt_3d(const conv_gemm_conf_t &jcp, const void *__restrict _imtr,
484484
}
485485
});
486486
else if (sd == 2 && sh == 2 && sw == 2 && dd == 1 && dh == 1 && dw == 1)
487-
parallel_nd(jcp.kd, jcp.kh, jcp.kw, jcp.ic,
487+
parallel_nd_legacy(jcp.kd, jcp.kh, jcp.kw, jcp.ic,
488488
[&](dim_t kd, dim_t kh, dim_t kw, dim_t ic) {
489489
col_dt *__restrict col_loc = col + kd * col_kd_s
490490
+ kh * col_kh_s + kw * col_kw_s + ic * col_ic_s;
@@ -516,7 +516,7 @@ void im2col_dt_3d(const conv_gemm_conf_t &jcp, const void *__restrict _imtr,
516516
}
517517
});
518518
else
519-
parallel_nd(jcp.kd, jcp.kh, jcp.kw, jcp.ic,
519+
parallel_nd_legacy(jcp.kd, jcp.kh, jcp.kw, jcp.ic,
520520
[&](dim_t kd, dim_t kh, dim_t kw, dim_t ic) {
521521
col_dt *__restrict col_loc = col + kd * col_kd_s
522522
+ kh * col_kh_s + kw * col_kw_s + ic * col_ic_s;
@@ -660,7 +660,7 @@ void im2col(const conv_gemm_conf_t &jcp, const data_type_t *__restrict im,
660660
// Generated code is more optimized for stride_w == 1
661661
// because innermost loop is by width
662662
if (sw == 1)
663-
parallel_nd(cb, jcp.kh, jcp.kw, oh_range,
663+
parallel_nd_legacy(cb, jcp.kh, jcp.kw, oh_range,
664664
[&](dim_t ic, dim_t kh, dim_t kw, dim_t ohr) {
665665
const dim_t oh = ohr + oh_begin;
666666
const dim_t ih = oh * sh - tp + kh * dh;
@@ -685,7 +685,7 @@ void im2col(const conv_gemm_conf_t &jcp, const data_type_t *__restrict im,
685685
}
686686
});
687687
else
688-
parallel_nd(cb, jcp.kh, jcp.kw, oh_range,
688+
parallel_nd_legacy(cb, jcp.kh, jcp.kw, oh_range,
689689
[&](dim_t ic, dim_t kh, dim_t kw, dim_t ohr) {
690690
const dim_t oh = ohr + oh_begin;
691691
const dim_t ih = oh * sh - tp + kh * dh;
@@ -840,7 +840,7 @@ void im2col_dt(const conv_gemm_conf_t &jcp, const void *__restrict _im,
840840
}
841841
}
842842
} else {
843-
parallel_nd(jcp.kh, jcp.kw, jcp.ic, hb,
843+
parallel_nd_legacy(jcp.kh, jcp.kw, jcp.ic, hb,
844844
[&](dim_t kh, dim_t kw, dim_t ic, dim_t oh) {
845845
const dim_t hp = tp - kh * dh;
846846
const dim_t ih = (oh + hs) * sh - hp;

src/cpu/gemm_x8s8s32x_convolution.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -220,7 +220,7 @@ status_t gemm_x8s8s32x_convolution_fwd_t::execute_forward_thr(const int ithr,
220220
balance211(work_amount, nthr, ithr, start, end);
221221
nd_iterator_init(start, n, jcp.mb, g, jcp.ngroups, ohb, nb_oh, owb, nb_ow);
222222
const uint8_t shift = jcp.signed_input ? 128 : 0;
223-
parallel_nd(jcp.im2col_sz, [&](ptrdiff_t i) { col[i] = shift; });
223+
parallel_nd_legacy(jcp.im2col_sz, [&](ptrdiff_t i) { col[i] = shift; });
224224

225225
status_t st = status::success;
226226

src/cpu/simple_concat.cpp

+11-1
Original file line numberDiff line numberDiff line change
@@ -74,6 +74,16 @@ status_t simple_concat_t<data_type>::execute(const exec_ctx_t &ctx) const {
7474
// Applies when concat axis is the outermost dimension, e.g. concat_axis = 0
7575
// or concat_axis = 1, and dims[0] = 1;
7676
if (!has_outer_loop) {
77+
// @todo CPU_PLUGIN:
78+
// the following implementation was used to fix some performance issues
79+
// Now that upstream oneDNN has re-designed this piece, it seems to be not applicable
80+
// anymore
81+
// for (int a = 0; a < num_arrs; ++a) {
82+
// const data_t *i = &iptrs[a][0];
83+
// data_t *o = &optrs[a][0];
84+
// parallel_nd_legacy(nelems_to_copy[a], [&](dim_t e) { o[e] = i[e]; });
85+
// }
86+
7787
int nthr = dnnl_get_max_threads();
7888
parallel(nthr, [&](int ithr, int nthr) {
7989
for (int a = 0; a < num_arrs; ++a) {
@@ -104,7 +114,7 @@ status_t simple_concat_t<data_type>::execute(const exec_ctx_t &ctx) const {
104114
const auto L1_size = platform::get_per_core_cache_size(1);
105115
UNUSED(L1_size); // for Windows
106116

107-
parallel_nd(phys_dims[0], phys_dims[1], phys_dims[2], phys_dims[3],
117+
parallel_nd_legacy(phys_dims[0], phys_dims[1], phys_dims[2], phys_dims[3],
108118
phys_dims[4], num_arrs,
109119
[&](dim_t n0, dim_t n1, dim_t n2, dim_t n3, dim_t n4, dim_t a) {
110120
// check if zero memory

0 commit comments

Comments
 (0)