support bf16 activation

zhewang1-intc · zhewang1-intc · commit 29946d2fb707 · 2024-07-01T10:51:38.000+08:00
diff --git a/include/common/core/explicit_conv.hpp b/include/common/core/explicit_conv.hpp
@@ -62,6 +62,19 @@ xetla_cvt(xetla_vector<T_src, N> src) {
   return dst;
 }
 
+/// @brief xetla explicit data conversion, bf16->fp16.
+/// @tparam T_dst is the destination element type, must be fp16.
+/// @tparam T_src is the source element type, must be bf16.
+/// @tparam N is the number of elements in the xetla_vector.
+template <typename T_dst, typename T_src, int N>
+__XETLA_API typename std::enable_if_t<
+    std::is_same<T_dst, fp16>::value && std::is_same<T_src, bf16>::value,
+    xetla_vector<T_dst, N>>
+xetla_cvt(xetla_vector<T_src, N> src) {
+  xetla_vector<T_dst, N> dst = src; // implicit element-type conversion
+  return dst;
+}
+
 /// @brief xetla explicit data conversion, bf16->fp32.
 /// @tparam T_dst is the bfloat16 data type.
 /// @tparam T_src is the float32 data type.
diff --git a/include/experimental/kernel/gemm/impl/int4_dequantize_kslicing_xe.hpp b/include/experimental/kernel/gemm/impl/int4_dequantize_kslicing_xe.hpp
@@ -526,6 +526,10 @@ class gemm_universal_t<
   template <quant_mode quant_mode>
   static bool can_implement(arguments_t<quant_mode>& args) {
     bool implementable = true;
+    if (arch_tag == gpu_arch::XeLpg) {
+      implementable &= !std::is_same_v<dtype_a, bf16>; // XeLpg arch doesn't
+                                                       // have bf16-related ISA.
+    }
     if (gemm_t::msg_type_a != msg_type::unaligned_2d) {
       if (gemm_t::msg_type_a == msg_type::block_2d) {
         implementable &= kernel::block_2d<arch_tag, dtype_a>::check_tensor(
@@ -617,8 +621,8 @@ class gemm_universal_t<
     int start_x_scale = start_n;
     int start_y_scale = start_k / dequant_s;
 
-    int start_x_zero_pt = gemm_t::compute_policy::quant_mode ==
-            quant_mode::INT4_ASYM_FP_ZERO
+    int start_x_zero_pt =
+        gemm_t::compute_policy::quant_mode == quant_mode::INT4_ASYM_FP_ZERO
         ? start_n
         : start_n / pack_ratio;
     int start_y_zero_pt = start_k / dequant_s;
@@ -690,8 +694,7 @@ class gemm_universal_t<
           mem_desc_scale,
           mem_desc_zero_pt);
     } else if constexpr (
-        gemm_t::compute_policy::quant_mode ==
-        quant_mode::INT4_ASYM_FP_ZERO) {
+        gemm_t::compute_policy::quant_mode == quant_mode::INT4_ASYM_FP_ZERO) {
       mem_desc_zero_pt_t mem_desc_zero_pt(
           args.zero_pt_base,
           {args.matrix_n,
diff --git a/include/subgroup/tile/impl/tile_op_functor.hpp b/include/subgroup/tile/impl/tile_op_functor.hpp
@@ -163,11 +163,10 @@ struct dequant_int4_weight_t {
               uint32_t zero_pt_idx =
                   offset_y_in_tile / dequant_s * zero_pt_t::block_size_x +
                   offset_x_in_tile;
-              native_type_t<typename zero_pt_t::dtype> zero_pt_pack =
-                  zero_pt.reg[zero_pt_idx];
+              xetla_vector<fp16, 1> zero_pt_pack = zero_pt.reg[zero_pt_idx];
               dst_blk.xetla_select<step, 1>(jj * block_size_y_b + ii) =
                   dst_blk.xetla_select<step, 1>(jj * block_size_y_b + ii) +
-                  zero_pt_pack;
+                  zero_pt_pack[0];
             }
             // sycl::ext::oneapi::experimental::printf(
             //     "scale[%d] %f \n",
diff --git a/tests/integration/gemv/int4/main.cpp b/tests/integration/gemv/int4/main.cpp
@@ -49,9 +49,9 @@ class test_col_major_1 {
   static constexpr mem_layout layout_b = mem_layout::col_major;
   static constexpr mma_engine mma_eng = mma_engine::fpu;
   static constexpr gpu_arch arch = gpu_arch::XeLpg;
-  using data_type_a = fp16;
+  using data_type_a = bf16;
   using data_type_b = int4x8;
-  using data_type_c = fp16;
+  using data_type_c = bf16;
 };
 class test_col_major_2 {
  public:
@@ -569,9 +569,11 @@ void dequantize_gemv_run(int iter) {
   // performance
   prof.print_profiling_result(profiling_selector::GPU);
   // check result
-  std::vector<typename Test::data_type_a> dequantize_b =
-      dequantize_weight<dequant_s, layout_b, compute_policy::quant_mode>(
-          matrix_k, matrix_n, B_h, scale_h, zero_pt_h);
+  std::vector<typename Test::data_type_a> dequantize_b = dequantize_weight<
+      dequant_s,
+      layout_b,
+      compute_policy::quant_mode,
+      data_type_c>(matrix_k, matrix_n, B_h, scale_h, zero_pt_h);
 
   queue.memcpy((void*)C_h, (void*)C_d, size_c * sizeof(data_type_c)).wait();
   ASSERT_EQ(