generic: sycl: implement prelu post-op (#2131)

t4c1 · web-flow · commit 19f0862b88dd · 2024-10-04T09:31:52.000-07:00
diff --git a/src/gpu/generic/sycl/binary_kernels.hpp b/src/gpu/generic/sycl/binary_kernels.hpp
@@ -48,7 +48,7 @@ struct binary_kernel_vec_t {
                                                         | DNNL_ARG_SRC_0)
                                                      .data_type()
                                            : data_type_t::dnnl_f32)
-        , po_args_(cgh, ctx) {}
+        , po_args_(cgh, ctx, conf_.post_ops) {}
 
     void operator()(::sycl::nd_item<1> item) const {
         memory_tensor_t src0_mem(src0_, conf_.src0_md);
diff --git a/src/gpu/generic/sycl/eltwise_kernels.hpp b/src/gpu/generic/sycl/eltwise_kernels.hpp
@@ -36,7 +36,7 @@ struct eltwise_fwd_kernel_vec_t {
             ::sycl::handler &cgh, const exec_ctx_t &ctx)
         : conf_(conf)
         , src_(CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_SRC))
-        , po_args_(cgh, ctx)
+        , po_args_(cgh, ctx, conf_.post_ops)
         , dst_(CTX_OUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST)) {}
 
     void operator()(::sycl::nd_item<1> item) const {
diff --git a/src/gpu/generic/sycl/matmul_kernels.hpp b/src/gpu/generic/sycl/matmul_kernels.hpp
@@ -323,6 +323,36 @@ struct matmul_kernel_fwd_t {
                 }
             }
         }
+
+        void apply_post_ops_edge(sycl_post_ops_t post_ops, // Applies post_ops to only the leading rows x cols elements of this block's data.
+                register_block<Rows, Cols> prev_dst, dims_t off_po, int dim1, // off_po[dim1] and off_po[dim1 + 1] receive the per-element row and column offsets.
+                const matmul_kernel_fwd_t *kernel, int rows, int cols) { // kernel supplies po_args_ to post_ops.apply; rows/cols bound the region that is processed.
+            for (int row = 0; row < rows; row++) {
+                int col; // Declared outside the loop: its final value indexes the partial vector handled below.
+                for (col = 0; col < cols / vec_len; col++) { // Vectors that lie entirely within the first cols columns.
+                    for (int v_el = 0; v_el < vec_len; v_el++) {
+                        off_po[dim1] += row; // Temporarily shift off_po to this element's position.
+                        off_po[dim1 + 1] += col * vec_len + v_el; // Column offset = vector index * vec_len + lane.
+                        data[row][col][v_el]
+                                = post_ops.apply(data[row][col][v_el],
+                                        prev_dst.data[row][col][v_el],
+                                        kernel->po_args_, off_po); // Result overwrites the element in place.
+                        off_po[dim1] -= row; // Undo the shift so off_po is back at its entry value.
+                        off_po[dim1 + 1] -= col * vec_len + v_el;
+                    }
+                }
+                int n_remaining = cols - col * vec_len; // Columns left after the full vectors; col == cols / vec_len here.
+                for (int v_el = 0; v_el < n_remaining; v_el++) { // Partial trailing vector: only its first n_remaining lanes.
+                    off_po[dim1] += row; // Same shift / apply / restore sequence as in the full-vector loop.
+                    off_po[dim1 + 1] += col * vec_len + v_el;
+                    data[row][col][v_el] = post_ops.apply(data[row][col][v_el],
+                            prev_dst.data[row][col][v_el], kernel->po_args_,
+                            off_po);
+                    off_po[dim1] -= row;
+                    off_po[dim1 + 1] -= col * vec_len + v_el;
+                }
+            }
+        }
     };
 
     matmul_kernel_fwd_t(const sycl_matmul_conf_t &conf, ::sycl::handler &cgh,
@@ -377,7 +407,7 @@ struct matmul_kernel_fwd_t {
         , dropout_seed_(CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_ATTR_DROPOUT_SEED))
         , dropout_probability_(
                   CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_ATTR_DROPOUT_PROBABILITY))
-        , po_args_(cgh, ctx) {}
+        , po_args_(cgh, ctx, conf_.post_ops) {}
 
     void operator()(::sycl::nd_item<1> item) const {
         using data_block_t = register_block<register_block_M, register_block_K>;
@@ -597,8 +627,13 @@ struct matmul_kernel_fwd_t {
             if (conf_.transpose_dst) {
                 std::swap(off_po[matmul_dim_1], off_po[matmul_dim_2]);
             }
-            dst_block.apply_post_ops(
-                    conf_.post_ops, prev_dst, off_po, matmul_dim_1, this);
+            if (is_dst_edge_block) {
+                dst_block.apply_post_ops_edge(conf_.post_ops, prev_dst, off_po,
+                        matmul_dim_1, this, remaining_m, remaining_n);
+            } else {
+                dst_block.apply_post_ops(
+                        conf_.post_ops, prev_dst, off_po, matmul_dim_1, this);
+            }
 
             if (conf_.do_scale_dst) {
                 dst_block.eltwise([=](float &el) { el /= dst_scale; });
diff --git a/src/gpu/generic/sycl/pooling_kernels.hpp b/src/gpu/generic/sycl/pooling_kernels.hpp
@@ -42,7 +42,7 @@ struct pooling_fwd_kernel_vec_t {
         , src_(CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_SRC))
         , dst_(CTX_OUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST))
         , ws_(CTX_OUT_SYCL_KERNEL_MEMORY(DNNL_ARG_WORKSPACE))
-        , po_args_(cgh, ctx) {}
+        , po_args_(cgh, ctx, conf_.post_ops) {}
 
     void operator()(::sycl::nd_item<1> item) const {
         memory_tensor_t src_mem(src_, conf_.src_md);
diff --git a/src/gpu/generic/sycl/ref_binary.cpp b/src/gpu/generic/sycl/ref_binary.cpp
@@ -49,7 +49,7 @@ status_t ref_binary_t::pd_t::init_conf() {
                 = conf_.src0_md.dims()[i] != 1 && conf_.src1_md.dims()[i] == 1;
     }
 
-    conf_.post_ops = sycl_post_ops_t(attr(), dst_md()->data_type);
+    conf_.post_ops = sycl_post_ops_t(attr(), dst_md());
 
     return status::success;
 }
diff --git a/src/gpu/generic/sycl/ref_convolution.cpp b/src/gpu/generic/sycl/ref_convolution.cpp
@@ -51,7 +51,7 @@ status_t ref_convolution_fwd_t::pd_t::init_conf() {
     conf_.single_data_zeropoint = attr()->zero_points_.common(DNNL_ARG_SRC_0);
     conf_.single_dst_zeropoint = attr()->zero_points_.common(DNNL_ARG_DST);
 
-    conf_.post_ops = sycl_post_ops_t(attr(), dst_md()->data_type);
+    conf_.post_ops = sycl_post_ops_t(attr(), dst_md());
 
     conf_.padding[0] = static_cast<int>(desc()->padding[0][0]);
     conf_.padding[1] = static_cast<int>(desc()->padding[0][1]);
@@ -111,7 +111,7 @@ status_t ref_convolution_bwd_data_t::pd_t::init_conf() {
     conf_.single_data_zeropoint = attr()->zero_points_.common(DNNL_ARG_SRC_0);
     conf_.single_dst_zeropoint = attr()->zero_points_.common(DNNL_ARG_DST);
 
-    conf_.post_ops = sycl_post_ops_t(attr(), dst_md()->data_type);
+    conf_.post_ops = sycl_post_ops_t(attr(), dst_md());
 
     conf_.padding[0] = static_cast<int>(desc()->padding[0][0]);
     conf_.padding[1] = static_cast<int>(desc()->padding[0][1]);
@@ -173,7 +173,7 @@ status_t ref_convolution_bwd_weights_t::pd_t::init_conf() {
     conf_.single_data_zeropoint = attr()->zero_points_.common(DNNL_ARG_SRC_0);
     conf_.single_dst_zeropoint = attr()->zero_points_.common(DNNL_ARG_DST);
 
-    conf_.post_ops = sycl_post_ops_t(attr());
+    conf_.post_ops = sycl_post_ops_t(attr(), dst_md());
 
     conf_.padding[0] = static_cast<int>(desc()->padding[0][0]);
     conf_.padding[1] = static_cast<int>(desc()->padding[0][1]);
diff --git a/src/gpu/generic/sycl/ref_eltwise.cpp b/src/gpu/generic/sycl/ref_eltwise.cpp
@@ -38,11 +38,7 @@ status_t ref_sycl_eltwise_fwd_t::pd_t::init_conf() {
     conf_.h = H();
     conf_.w = W();
 
-    if (attr()->post_ops_.len() > sycl_post_ops_t::max_post_ops) {
-        return status::unimplemented;
-    }
-    conf_.post_po_len = attr()->post_ops_.len();
-    conf_.post_ops = sycl_post_ops_t(attr(), dst_md()->data_type);
+    conf_.post_ops = sycl_post_ops_t(attr(), dst_md());
 
     return status::success;
 }
diff --git a/src/gpu/generic/sycl/ref_matmul.cpp b/src/gpu/generic/sycl/ref_matmul.cpp
@@ -43,8 +43,7 @@ void ref_matmul_t::pd_t::init_conf() {
             = !attr()->zero_points_.has_default_values(DNNL_ARG_DST);
 
     conf_.use_dropout = !attr()->dropout_.has_default_values();
-
-    conf_.post_ops = sycl_post_ops_t(attr(), dst_md()->data_type);
+    conf_.post_ops = sycl_post_ops_t(attr(), dst_md());
 
     memory_desc_wrapper src_d = src_md();
     memory_desc_wrapper weights_d = weights_md();
diff --git a/src/gpu/generic/sycl/ref_matmul.hpp b/src/gpu/generic/sycl/ref_matmul.hpp
@@ -59,7 +59,8 @@ struct ref_matmul_t : public gpu::generic::sycl::primitive_t {
                             | sm::zero_points_runtime_data_type)
                     && IMPLICATION(
                             !attr()->scales_.has_default_values(), scales_ok())
-                    && post_ops_ok() && md_dims_in_range(src_md())
+                    && sycl_post_ops_t::post_ops_ok(attr())
+                    && md_dims_in_range(src_md())
                     && md_dims_in_range(weights_md());
             if (!ok) return status::unimplemented;
 
@@ -121,14 +122,6 @@ struct ref_matmul_t : public gpu::generic::sycl::primitive_t {
             return dt_ok && attr_scales_ok(supported_args);
         }
 
-        bool post_ops_ok() const {
-            // Dw conv post-ops are not supported.
-            return attr()->post_ops_.len() <= sycl_post_ops_t::max_post_ops
-                    && attr()->post_ops_.has_default_values(
-                            {primitive_kind::eltwise, primitive_kind::binary,
-                                    primitive_kind::sum});
-        }
-
         static bool check_data_types(const memory_desc_wrapper &src,
                 const memory_desc_wrapper &weights,
                 const memory_desc_wrapper &dst) {
diff --git a/src/gpu/generic/sycl/ref_pooling.cpp b/src/gpu/generic/sycl/ref_pooling.cpp
@@ -64,13 +64,7 @@ status_t ref_pooling_fwd_t::pd_t::init_conf() {
     conf_.DH = KDH(); //K:kernel D:Dilation H:Height
     conf_.DW = KDW(); //K:kernel D:Dilation W:Weight
 
-    const auto *att = attr();
-    const auto &attr_po = att->post_ops_;
-    if (attr_po.len() > sycl_post_ops_t::max_post_ops) {
-        return dnnl_unimplemented;
-    }
-    conf_.po_len = attr_po.len();
-    conf_.post_ops = sycl_post_ops_t(attr(), dst_md()->data_type);
+    conf_.post_ops = sycl_post_ops_t(attr(), dst_md());
     return status::success;
 }
 
diff --git a/src/gpu/generic/sycl/ref_reorder.cpp b/src/gpu/generic/sycl/ref_reorder.cpp
@@ -38,7 +38,7 @@ status_t ref_reorder_t::pd_t::init_conf() {
     conf_.do_scale_dst
             = !attr()->scales_.get(DNNL_ARG_DST).has_default_values();
     conf_.scale_dst_mask = attr()->scales_.get(DNNL_ARG_DST).mask_;
-    conf_.post_ops = sycl_post_ops_t(attr(), dst_md()->data_type);
+    conf_.post_ops = sycl_post_ops_t(attr(), dst_md());
 
     return status::success;
 }
diff --git a/src/gpu/generic/sycl/ref_resampling.cpp b/src/gpu/generic/sycl/ref_resampling.cpp
@@ -43,14 +43,8 @@ status_t ref_resampling_fwd_t::pd_t::init_conf() {
     conf_.dst_md = xpu::sycl::md_t(dst_md());
 
     conf_.alg = desc()->alg_kind;
-    const auto *att = attr();
-    const auto &attr_po = att->post_ops_;
-    if (attr_po.len() > sycl_post_ops_t::max_post_ops) {
-        return dnnl_unimplemented;
-    }
-    conf_.po_len = attr_po.len();
 
-    conf_.post_ops = sycl_post_ops_t(attr(), dst_md()->data_type);
+    conf_.post_ops = sycl_post_ops_t(attr(), dst_md());
     return status::success;
 }
 
diff --git a/src/gpu/generic/sycl/ref_softmax.cpp b/src/gpu/generic/sycl/ref_softmax.cpp
@@ -41,8 +41,7 @@ status_t ref_sycl_softmax_fwd_t::pd_t::init_conf() {
     conf_.do_scale_dst
             = !attr()->scales_.get(DNNL_ARG_DST).has_default_values();
 
-    conf_.post_ops = sycl_post_ops_t(attr(), dst_md()->data_type);
-    conf_.po_len = attr()->post_ops_.len();
+    conf_.post_ops = sycl_post_ops_t(attr(), dst_md());
 
     return status::success;
 }
diff --git a/src/gpu/generic/sycl/resampling_kernels.hpp b/src/gpu/generic/sycl/resampling_kernels.hpp
@@ -39,7 +39,7 @@ struct resampling_kernel_fwd_vec_t {
         : conf_(conf)
         , src_(CTX_IN_SYCL_KERNEL_MEMORY(DNNL_ARG_SRC))
         , dst_(CTX_OUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST))
-        , po_args_(cgh, ctx) {}
+        , po_args_(cgh, ctx, conf_.post_ops) {}
 
     void operator()(::sycl::nd_item<1> item) const {
         memory_tensor_t src_mem(src_, conf_.src_md);
diff --git a/src/gpu/generic/sycl/softmax_kernels.hpp b/src/gpu/generic/sycl/softmax_kernels.hpp
@@ -42,7 +42,7 @@ struct softmax_fwd_kernel_vec_t {
         , scale_dst_(CTX_IN_SYCL_KERNEL_MEMORY(
                   DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST))
         , dst_(CTX_OUT_SYCL_KERNEL_MEMORY(DNNL_ARG_DST))
-        , po_args_(cgh, ctx) {}
+        , po_args_(cgh, ctx, conf_.post_ops) {}
 
     void operator()(::sycl::nd_item<1> item) const {
         memory_tensor_t src_mem(src_, conf_.src_md);
diff --git a/src/gpu/generic/sycl/sycl_post_ops.hpp b/src/gpu/generic/sycl/sycl_post_ops.hpp
diff --git a/src/gpu/generic/sycl/sycl_primitive_conf.hpp b/src/gpu/generic/sycl/sycl_primitive_conf.hpp

Original file line number	Diff line number	Diff line change
`@@ -49,7 +49,7 @@ status_t ref_binary_t::pd_t::init_conf() {`
`49`	`49`	`= conf_.src0_md.dims()[i] != 1 && conf_.src1_md.dims()[i] == 1;`
`50`	`50`	`}`
`51`	`51`
`52`		`- conf_.post_ops = sycl_post_ops_t(attr(), dst_md()->data_type);`
	`52`	`+ conf_.post_ops = sycl_post_ops_t(attr(), dst_md());`
`53`	`53`
`54`	`54`	`return status::success;`
`55`	`55`	`}`
Original file line number	Diff line number	Diff line change
`@@ -38,7 +38,7 @@ status_t ref_reorder_t::pd_t::init_conf() {`
`38`	`38`	`conf_.do_scale_dst`
`39`	`39`	`= !attr()->scales_.get(DNNL_ARG_DST).has_default_values();`
`40`	`40`	`conf_.scale_dst_mask = attr()->scales_.get(DNNL_ARG_DST).mask_;`
`41`		`- conf_.post_ops = sycl_post_ops_t(attr(), dst_md()->data_type);`
	`41`	`+ conf_.post_ops = sycl_post_ops_t(attr(), dst_md());`
`42`	`42`
`43`	`43`	`return status::success;`
`44`	`44`	`}`
Original file line number	Diff line number	Diff line change
`@@ -41,8 +41,7 @@ status_t ref_sycl_softmax_fwd_t::pd_t::init_conf() {`
`41`	`41`	`conf_.do_scale_dst`
`42`	`42`	`= !attr()->scales_.get(DNNL_ARG_DST).has_default_values();`
`43`	`43`
`44`		`- conf_.post_ops = sycl_post_ops_t(attr(), dst_md()->data_type);`
`45`		`- conf_.po_len = attr()->post_ops_.len();`
	`44`	`+ conf_.post_ops = sycl_post_ops_t(attr(), dst_md());`
`46`	`45`
`47`	`46`	`return status::success;`
`48`	`47`	`}`