@@ -467,6 +467,8 @@ status_t gemm_convolution_bwd_data_t::execute_backward_data_thr_nspc(
     // threads share work across mini-batch and groups
     const dim_t work_amount = jcp.ngroups * jcp.mb;
 
+    const auto &p = pd()->attr()->post_ops_;
+
     data_t *__restrict col = scratchpad.get<data_t>(key_conv_gemm_col)
             + (ptrdiff_t)ithr * jcp.im2col_sz;
     const bool acc_needed = jcp.ngroups > 1;
@@ -515,6 +517,25 @@ status_t gemm_convolution_bwd_data_t::execute_backward_data_thr_nspc(
                 }
             });
         }
+        if (p.len() > 0) {
+            int depthwise_inj_idx = 0;
+            for (int i = 0; i < p.len(); i++) {
+                auto &post_op = p.entry_[i];
+                if (post_op.is_depthwise()) {
+                    auto depthwise_weights = post_op.depthwise.weights_data;
+                    auto depthwise_bias = post_op.depthwise.biases_data;
+                    parallel_nd(static_cast<size_t>(jcp.is) * jcp.id, [&](size_t is) {
+                        data_t *__restrict diff_src_arr
+                                = diff_src + is * diff_src_os_stride;
+                        for (int ic = 0; ic < jcp.ic; ic++) {
+                            diff_src_arr[ic] = depthwise_injectors[depthwise_inj_idx]->compute_scalar(diff_src_arr[ic],
+                                    depthwise_weights + g * jcp.ic + ic, depthwise_bias + g * jcp.ic + ic);
+                        }
+                    });
+                    depthwise_inj_idx++;
+                }
+            }
+        }
         nd_iterator_step(n, jcp.mb, g, jcp.ngroups);
     }
     return status::success;
@@ -547,6 +568,8 @@ status_t gemm_convolution_bwd_data_t::execute_backward_data_ncsp(
     const dim_t work_amount = (size_t)jcp.ngroups * jcp.mb;
     const bool is_problem_3d = pd()->ndims() == 5;
 
+    const auto &p = pd()->attr()->post_ops_;
+
     std::atomic<status_t> st(status::success);
     parallel(jcp.nthr, [&](const int ithr, const int nthr) {
         data_t *_col = col + (ptrdiff_t)ithr * jcp.im2col_sz;
@@ -594,6 +617,26 @@ status_t gemm_convolution_bwd_data_t::execute_backward_data_ncsp(
                     }
                 }
             }
+            if (p.len() > 0) {
+                int depthwise_inj_idx = 0;
+                for (int i = 0; i < p.len(); i++) {
+                    auto &post_op = p.entry_[i];
+                    if (post_op.is_depthwise()) {
+                        auto depthwise_weights = post_op.depthwise.weights_data;
+                        auto depthwise_bias = post_op.depthwise.biases_data;
+                        parallel_nd(jcp.ic, [&](const int ic) {
+                            for (int id = 0; id < jcp.id; ++id) {
+                                data_t *d_ = _diff_src + ic * jcp.id * jcp.is + id * jcp.is;
+                                for (int iS = 0; iS < jcp.is; ++iS) {
+                                    d_[iS] = depthwise_injectors[depthwise_inj_idx]->compute_scalar(d_[iS],
+                                            depthwise_weights + g * jcp.ic + ic, depthwise_bias + g * jcp.ic + ic);
+                                }
+                            }
+                        });
+                        depthwise_inj_idx++;
+                    }
+                }
+            }
             nd_iterator_step(g, jcp.ngroups, n, jcp.mb);
         }
     });
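
For context, a minimal standalone sketch of what the injected depthwise post-op amounts to per element, assuming compute_scalar() for a depthwise scale-shift post-op reduces to x * weights[c] + bias[c]; the helper name and signature below are illustrative, not part of the library:

#include <cstddef>

using data_t = float;

// Hypothetical helper: applies the assumed per-channel scale/shift that the
// depthwise injector performs on each diff_src element of group g, mirroring
// the channel-strided loop added in the ncsp path above.
static void apply_depthwise_scale_shift(data_t *diff_src, int ic_count,
        std::ptrdiff_t channel_stride, std::ptrdiff_t spatial,
        const data_t *weights, const data_t *bias, int g) {
    for (int ic = 0; ic < ic_count; ++ic) {
        const data_t w = weights[g * ic_count + ic];
        const data_t b = bias[g * ic_count + ic];
        data_t *d = diff_src + ic * channel_stride;
        for (std::ptrdiff_t s = 0; s < spatial; ++s)
            d[s] = d[s] * w + b; // assumed semantics of compute_scalar()
    }
}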