xe: jit: gemm: expand decomp cases, enforce fpmath

kealan-barbieri · kealan-barbieri · commit ebf1fa9efe78 · 2025-03-24T09:36:14.000-07:00
diff --git a/src/gpu/intel/jit/gemm/gen_gemm.hpp b/src/gpu/intel/jit/gemm/gen_gemm.hpp
@@ -74,8 +74,8 @@ struct gen_gemm_t : public gpu_gemm_t {
             wei_decomp_ = (utils::one_of(d->c_type(), f32, f16, bf16, f8_e5m2,
                                    f8_e4m3)
                                   && utils::one_of(d->a_type(), u8, s8, s4, u4)
-                                  && utils::one_of(d->b_type(), f16, f32, bf16,
-                                          f8_e5m2, f8_e4m3))
+                                  && utils::one_of(d->b_type(), u8, s8, s4, u4,
+                                          f16, f32, bf16, f8_e5m2, f8_e4m3))
                     && attr()->mayiconvert(d->a_type(), f32);
             dy_quant_enabled_
                     = (utils::one_of(d->c_type(), f32, f16, bf16)
@@ -224,6 +224,9 @@ struct gen_gemm_t : public gpu_gemm_t {
 
             if (!attr()->zero_points_.has_default_values()) {
                 if (!attr_zps.has_default_values(DNNL_ARG_A)) {
+                    // Only apply to integer inputs.
+                    VDISPATCH_GEMM(utils::one_of(d->a_type(), s4, u4, s8, u8),
+                            VERBOSE_UNSUPPORTED_ZP_CFG);
                     const int cmask_a = attr_zps.get_mask(DNNL_ARG_A);
                     ao_dims_ = cmask_a > 0;
 
@@ -253,10 +256,17 @@ struct gen_gemm_t : public gpu_gemm_t {
                         VDISPATCH_GEMM(utils::one_of(cmask_a, 0, mask_per_oc,
                                                mask_per_ic),
                                 VERBOSE_UNSUPPORTED_ZP_CFG);
+                        // Weights zp can only be performantly enabled during upconversion.
+                        VDISPATCH_GEMM(wei_decomp_
+                                        || utils::one_of(d->b_type(), s4, u4),
+                                VERBOSE_UNSUPPORTED_ZP_CFG);
                     }
                 }
 
                 if (!attr_zps.has_default_values(DNNL_ARG_B)) {
+                    // Only apply to integer inputs.
+                    VDISPATCH_GEMM(utils::one_of(d->b_type(), s4, u4, s8, u8),
+                            VERBOSE_UNSUPPORTED_ZP_CFG);
                     const int cmask_b = attr_zps.get_mask(DNNL_ARG_B);
                     bo_dims_ = cmask_b > 0;
 
@@ -390,6 +400,7 @@ struct gen_gemm_t : public gpu_gemm_t {
                     : data_type::s32;
             if (swap_ab_) std::swap(ao_type, bo_type);
             bool int_acc = utils::one_of(eff_a_type(), s8, u8);
+            int_acc &= !wei_scales_2d_;
             auto co_type = with_bias() ? d->bias_type()
                     : with_sum_ab()    ? d->sum_ab_type
                     : int_acc          ? s32
diff --git a/src/gpu/intel/jit/gemm/gen_gemm_kernel.cpp b/src/gpu/intel/jit/gemm/gen_gemm_kernel.cpp
@@ -535,19 +535,13 @@ status_t gen_gemm_nocopy_kernel_desc_t::select_kernel(compute::gpu_arch_t arch,
 
     auto add_mode_matches = [&](bool has_mode, const char *(*match)(Type)) {
         if (!has_mode) return;
-        auto &def = base.selector.precisions;
         if (match(problem_.Ta)) {
-            match_params.push_back(base);
             match_params.back().selector.precisions[0] = match(problem_.Ta);
-            match_params.back().selector.precisions[1] = def[1];
         }
         if (match(problem_.Tb)) {
-            match_params.push_back(base);
-            match_params.back().selector.precisions[0] = def[0];
             match_params.back().selector.precisions[1] = match(problem_.Tb);
         }
         if (match(problem_.Ta) && match(problem_.Tb)) {
-            match_params.push_back(base);
             match_params.back().selector.precisions[0] = match(problem_.Ta);
             match_params.back().selector.precisions[1] = match(problem_.Tb);
         }
diff --git a/src/gpu/intel/jit/gemm/selector/db/kernel.db b/src/gpu/intel/jit/gemm/selector/db/kernel.db

Original file line number	Diff line number	Diff line change
`@@ -535,19 +535,13 @@ status_t gen_gemm_nocopy_kernel_desc_t::select_kernel(compute::gpu_arch_t arch,`
`535`	`535`
`536`	`536`	`auto add_mode_matches = [&](bool has_mode, const char *(*match)(Type)) {`
`537`	`537`	`if (!has_mode) return;`
`538`		`- auto &def = base.selector.precisions;`
`539`	`538`	`if (match(problem_.Ta)) {`
`540`		`- match_params.push_back(base);`
`541`	`539`	`match_params.back().selector.precisions[0] = match(problem_.Ta);`
`542`		`- match_params.back().selector.precisions[1] = def[1];`
`543`	`540`	`}`
`544`	`541`	`if (match(problem_.Tb)) {`
`545`		`- match_params.push_back(base);`
`546`		`- match_params.back().selector.precisions[0] = def[0];`
`547`	`542`	`match_params.back().selector.precisions[1] = match(problem_.Tb);`
`548`	`543`	`}`
`549`	`544`	`if (match(problem_.Ta) && match(problem_.Tb)) {`
`550`		`- match_params.push_back(base);`
`551`	`545`	`match_params.back().selector.precisions[0] = match(problem_.Ta);`
`552`	`546`	`match_params.back().selector.precisions[1] = match(problem_.Tb);`
`553`	`547`	`}`