diff --git a/src/gpu/intel/jit/gemm/gen_gemm.hpp b/src/gpu/intel/jit/gemm/gen_gemm.hpp index ec358fd6375..c172881c5c7 100644 --- a/src/gpu/intel/jit/gemm/gen_gemm.hpp +++ b/src/gpu/intel/jit/gemm/gen_gemm.hpp @@ -74,14 +74,15 @@ struct gen_gemm_t : public gpu_gemm_t { wei_decomp_ = (utils::one_of(d->c_type(), f32, f16, bf16, f8_e5m2, f8_e4m3) && utils::one_of(d->a_type(), u8, s8, s4, u4) - && utils::one_of(d->b_type(), f16, f32, bf16, - f8_e5m2, f8_e4m3)) + && utils::one_of(d->b_type(), u8, s8, s4, u4, + f16, f32, bf16, f8_e5m2, f8_e4m3)) && attr()->mayiconvert(d->a_type(), f32); dy_quant_enabled_ - = (utils::one_of(d->c_type(), f32, f16, bf16) - && utils::one_of(d->a_type(), u8, s8, s4, u4) - && utils::one_of(d->b_type(), u8, s8)) - || all_f8; + = ((utils::one_of(d->c_type(), f32, f16, bf16) + && utils::one_of(d->a_type(), u8, s8, s4, u4) + && utils::one_of(d->b_type(), u8, s8)) + || all_f8) + && !attr()->mayiconvert(d->a_type(), f32); quant_enabled_ = wei_decomp_ || dy_quant_enabled_; CHECK(set_default_formats(false)); @@ -224,6 +225,9 @@ struct gen_gemm_t : public gpu_gemm_t { if (!attr()->zero_points_.has_default_values()) { if (!attr_zps.has_default_values(DNNL_ARG_A)) { + // Only apply to integers inputs. + VDISPATCH_GEMM(utils::one_of(d->a_type(), s4, u4, s8, u8), + VERBOSE_UNSUPPORTED_ZP_CFG); const int cmask_a = attr_zps.get_mask(DNNL_ARG_A); ao_dims_ = cmask_a > 0; @@ -253,10 +257,20 @@ struct gen_gemm_t : public gpu_gemm_t { VDISPATCH_GEMM(utils::one_of(cmask_a, 0, mask_per_oc, mask_per_ic), VERBOSE_UNSUPPORTED_ZP_CFG); + // Weights zp can only be performantly enabled during upconversion + // for cases that perform decompression. + VDISPATCH_GEMM(wei_decomp_ + || utils::one_of( + d->c_type(), s8, u8, s32) + || utils::one_of(d->a_type(), s4, u4), + VERBOSE_UNSUPPORTED_ZP_CFG); } } if (!attr_zps.has_default_values(DNNL_ARG_B)) { + // Only apply to integers inputs. 
+ VDISPATCH_GEMM(utils::one_of(d->b_type(), s4, u4, s8, u8), + VERBOSE_UNSUPPORTED_ZP_CFG); const int cmask_b = attr_zps.get_mask(DNNL_ARG_B); bo_dims_ = cmask_b > 0; @@ -344,6 +358,7 @@ struct gen_gemm_t : public gpu_gemm_t { src_scales_2d_ = false; else { src_q2d_group_k = scales_group_k; + // 2d src scales only supported during dequantization. VDISPATCH_GEMM(dy_quant_enabled_ && utils::one_of(eff_a_type(), s4, u4), VERBOSE_UNSUPPORTED_SCALES_CFG); @@ -390,6 +405,7 @@ struct gen_gemm_t : public gpu_gemm_t { : data_type::s32; if (swap_ab_) std::swap(ao_type, bo_type); bool int_acc = utils::one_of(eff_a_type(), s8, u8); + int_acc &= !wei_scales_2d_; auto co_type = with_bias() ? d->bias_type() : with_sum_ab() ? d->sum_ab_type : int_acc ? s32 @@ -420,12 +436,17 @@ struct gen_gemm_t : public gpu_gemm_t { // Handle special compute modes. kernel_desc_t::compute_mode mode = kernel_desc_t::mode_default; - if (attr()->mayiconvert(f32, tf32)) - set_mode(mode, kernel_desc_t::mode_tf32); - if (attr()->mayiconvert(f32, bf16)) + if (attr()->mayiconvert(u8, bf16)) set_mode(mode, kernel_desc_t::mode_bf16x1); - if (attr()->mayiconvert(f32, f16)) + if (attr()->mayiconvert(u8, f16)) set_mode(mode, kernel_desc_t::mode_f16x1); + if (attr()->mayiconvert(f32, tf32) + && !(mode + & (kernel_desc_t::mode_f16x1 + | kernel_desc_t::mode_bf16x1))) { + VDISPATCH_GEMM(!wei_decomp_, VERBOSE_UNSUPPORTED_DT); + set_mode(mode, kernel_desc_t::mode_tf32); + } if (attr()->mayiconvert(f32, f32)) set_mode(mode, kernel_desc_t::mode_strict); if (attr()->deterministic_) diff --git a/src/gpu/intel/jit/gemm/gen_gemm_kernel.cpp b/src/gpu/intel/jit/gemm/gen_gemm_kernel.cpp index 132cd913ae6..183d56e7751 100644 --- a/src/gpu/intel/jit/gemm/gen_gemm_kernel.cpp +++ b/src/gpu/intel/jit/gemm/gen_gemm_kernel.cpp @@ -533,49 +533,58 @@ status_t gen_gemm_nocopy_kernel_desc_t::select_kernel(compute::gpu_arch_t arch, bool fpmath_strict = !(fpmath_tf32 || fpmath_bf16 || fpmath_f16) && (mode & mode_strict) && (mode & 
mode_w_decomp); - auto add_mode_matches = [&](bool has_mode, const char *(*match)(Type)) { + auto add_mode_matches = [&](bool has_mode, bool optional, + const char *(*match)(Type)) { if (!has_mode) return; auto &def = base.selector.precisions; if (match(problem_.Ta)) { - match_params.push_back(base); + if (optional) { + match_params.push_back(base); + match_params.back().selector.precisions[1] = def[1]; + } match_params.back().selector.precisions[0] = match(problem_.Ta); - match_params.back().selector.precisions[1] = def[1]; } if (match(problem_.Tb)) { - match_params.push_back(base); - match_params.back().selector.precisions[0] = def[0]; + if (optional) { + match_params.push_back(base); + match_params.back().selector.precisions[0] = def[0]; + } match_params.back().selector.precisions[1] = match(problem_.Tb); } if (match(problem_.Ta) && match(problem_.Tb)) { - match_params.push_back(base); + if (optional) match_params.push_back(base); match_params.back().selector.precisions[0] = match(problem_.Ta); match_params.back().selector.precisions[1] = match(problem_.Tb); } }; - add_mode_matches(fpmath_tf32, [](Type dt) -> const char * { - if (dt == Type::f32) { return "T"; } - return nullptr; - }); - - add_mode_matches(fpmath_bf16, [](Type dt) -> const char * { - if (dt == Type::f32) { return "[SB]"; } - if (dt.isInt8() || dt.isInt4()) return "[OB]"; - if (dt.isF8()) return "B"; - return nullptr; - }); - - add_mode_matches(fpmath_f16, [](Type dt) -> const char * { - if (dt == Type::f32) { return "[SH]"; } - if (dt.isInt8() || dt.isInt4()) return "[OH]"; - if (dt.isF8()) return "H"; - return nullptr; - }); - - add_mode_matches(!(fpmath_f16 || fpmath_bf16), [](Type dt) -> const char * { - if (dt.isInt4()) return "[FO]"; - return nullptr; - }); + add_mode_matches( + fpmath_tf32, /*optional=*/true, [](Type dt) -> const char * { + if (dt == Type::f32) { return "T"; } + return nullptr; + }); + + add_mode_matches( + fpmath_bf16, /*optional=*/false, [](Type dt) -> const char * { + 
if (dt == Type::f32) { return "[SB]"; } + if (dt.isInt8() || dt.isInt4()) return "[OB]"; + if (dt.isF8()) return "B"; + return nullptr; + }); + + add_mode_matches( + fpmath_f16, /*optional=*/false, [](Type dt) -> const char * { + if (dt == Type::f32) { return "[SH]"; } + if (dt.isInt8() || dt.isInt4()) return "[OH]"; + if (dt.isF8()) return "H"; + return nullptr; + }); + + add_mode_matches(!(fpmath_f16 || fpmath_bf16), /*optional=*/false, + [](Type dt) -> const char * { + if (dt.isInt4()) return "[FO]"; + return nullptr; + }); if (fpmath_strict) { if (problem_.Tb.isInt4() && !(fpmath_f16 || fpmath_bf16)) { @@ -588,7 +597,7 @@ status_t gen_gemm_nocopy_kernel_desc_t::select_kernel(compute::gpu_arch_t arch, = match_params.back().selector.precisions[1]; } } - add_mode_matches(true, [](Type dt) -> const char * { + add_mode_matches(true, /*optional=*/false, [](Type dt) -> const char * { if (dt.isFP4()) return "E"; return nullptr; }); diff --git a/src/gpu/intel/jit/gemm/include/kernel_catalog.hpp b/src/gpu/intel/jit/gemm/include/kernel_catalog.hpp index 234c7dd706f..3d730018238 100644 --- a/src/gpu/intel/jit/gemm/include/kernel_catalog.hpp +++ b/src/gpu/intel/jit/gemm/include/kernel_catalog.hpp @@ -86,8 +86,13 @@ struct Selector { friend bool operator<(const Selector &sel1, const Selector &sel2) { auto tupleize = [](const Selector &sel) { + bool compoundA = sel.precisions[0][0] == '['; + bool compoundB = sel.precisions[1][0] == '['; return std::make_tuple(sel.hw, - sel.precisions[0][0] & 0x1F, sel.precisions[1][0] & 0x1F, + sel.precisions[0][0] & 0x1F, + compoundA ? sel.precisions[0][2] & 0x1F : 'a', + sel.precisions[1][0] & 0x1F, + compoundB ? 
sel.precisions[1][2] & 0x1F : 'b', sel.layouts[0][0], sel.layouts[1][0]); }; return tupleize(sel1) < tupleize(sel2); diff --git a/src/gpu/intel/jit/gemm/selector/db/kernel.db b/src/gpu/intel/jit/gemm/selector/db/kernel.db index fd1483e01a3..137a8698e07 100644 --- a/src/gpu/intel/jit/gemm/selector/db/kernel.db +++ b/src/gpu/intel/jit/gemm/selector/db/kernel.db @@ -145,7 +145,7 @@ auto _CATALOG_ = kcatalog::toArray({ {{'E', "gemm", {"B", "B", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 8x4 cab4 ks16 af dw vav bo bk0 sn grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.31167e+06, 785275, 0, 0, 0, 0, 7.11381, 8.75643, 6.11098, 15.9972, 0.0503546, 0.0303966, 0.0484271, 0.842682, 1.20649, 1.2023, -1.8357e-15}}}, {{'E', "gemm", {"B", "B", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qpi"}, "aB2x2 aB2x2 aB wg 8x4 kc2 cab4 ks8 nse bo sr bk0 sm l4", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.35147e+06, 357789, 0, 0, 0, 0, 11.5708, 11.8958, 6.40012, 17.218, 0.14396, 0.139657, 0.0130437, 0.882761, 1.16324, 1.0488, 3.12079e-12}}}, {{'E', "gemm", {"B", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "pI"}, "aB32 aB16 aB wg 4x4 cab3 ks64 af vav bo bk0 sm grf256 sys sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {8192, 8192, 16777216}, {16, 1, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 26112, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.37604e+06, 242290, 0, 0, 0, 
0, 5.88917, 23.0297, 1.4695, 1.47683, 0.411652, 0.0306305, 0.368164, 0.884401, 1.21016, 0, 0}}}, -{{'E', "gemm", {"B", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "pI"}, "aB32 aB16 aB wg 4x4 cab3 ks64 af vav bo bk0 sm grf256 sys sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {8192, 8192, 16777216}, {16, 1, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 26112, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.37604e+06, 242290, 0, 0, 0, 0, 5.88917, 23.0297, 1.4695, 1.47683, 0.411652, 0.0306305, 0.368164, 0.884401, 1.21016, 0, 0}}}, +{{'E', "gemm", {"B", "[OB]", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "pI"}, "aB32 aB16 aB wg 4x4 cab3 ks64 af vav bo bk0 sm grf256 sys sr dm", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {8192, 8192, 16777216}, {16, 1, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 26112, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.37604e+06, 242290, 0, 0, 0, 0, 5.88917, 23.0297, 1.4695, 1.47683, 0.411652, 0.0306305, 0.368164, 0.884401, 1.21016, 0, 0}}}, {{'E', "gemm", {"D", "D", "D"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 aB", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {8, 8, 4}, {2, 8, 1}, 1, (WGType) 0, 0, 0, 0, {64, 64, 8}, {true, true, true}}, {'W', 1, {64}}}, {{'E', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aS4x2 aB", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 512}, {8192, 8192, 512}, {8, 8, 4}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {8, 8, 8}, {true, true, true}}, {'W', 1, {64}}}, {{'E', "gemm", {"D", "D", "D"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, 
-1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aB4x2 aB", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 768}, {8192, 8192, 768}, {8, 8, 4}, {2, 8, 1}, 1, (WGType) 0, 1, 0, 0, {8, 8, 8}, {true, true, true}}, {'W', 1, {64}}}, @@ -259,40 +259,7 @@ auto _CATALOG_ = kcatalog::toArray({ {{'E', "gemm", {"H", "H", "H"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qpi"}, "aB8 aB8/4 aB wg 4x8 kc8 cab4 ks8 nse bo sr bk0 sm l4", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 1, 24576, 0, {2, 2, 2}, {true, true, true}}, {'W', 1, {1024}}}, {{'E', "gemm", {"H", "H", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16x2 aB16x2 aB wg 8x4 cab4 ks16 af dw vav bo bk0 sn grf256 sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.30894e+06, 785550, 0, 0, 0, 0, 7.11935, 8.75656, 6.13129, 16.041, 0.0504259, 0.0420693, 0.0674256, 0.73758, 1.20696, 1.20187, 1.15297e-15}}}, {{'E', "gemm", {"H", "H", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qpi"}, "aB2x2 aB2x2 aB wg 8x4 kc2 cab4 ks8 nse bo sr bk0 sm l4", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {8192, 8192, 16777216}, {16, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.34765e+06, 357923, 0, 0, 0, 0, 11.5919, 11.8886, 6.4384, 17.1449, 0.145301, 0.14134, 0.0125881, 0.886981, 1.17153, 1.00812, 9.37293e-12}}}, -{{'E', "gemm", {"H", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, 
"sB32 sB16 aB wg 8x4 cab4 ks32 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.03504e+06, 483759, 0, 0, 0, 0, 5.14655, 5.4381, 5.95309, 14.8916, 0.0340942, 0.0152122, 0.0258839, 1, 1.27788, 1.19062, 3.54023e-13}}}, -{{'E', "gemm", {"O", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB16 sB16 aB wg 4x16 cab4 ks32 af dw vav bo bk0 sn grf256 pab sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {8192, 8192, 16777216}, {32, 8, 32}, {4, 16, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {540457, 1.10675e+06, 0, 0, 0, 0, 5.61877, 5.78531, 5.50943, 14.7763, 0.0373088, 0.0373088, 0, 1, 1.21298, 1.20167, -5.60976e-15}}}, -{{'E', "gemm", {"O", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}, {-1, 8, -1}, {4, 4, 1}, "oxyz"}, "sB4x2 sS2 sB wg 4x2x8 akr fg 0.25 kc2 nse sr sb32 bk0 bm0 pab", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {64, 4, 4}, {4, 2, 8}, 1, (WGType) 0, 262917, 0, 8192, {4, 4, 4}, {false, false, false}}, {'W', 1, {256}}}, -{{'E', "gemm", {"O", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ixy"}, "sB4x2 sB4x2 aB wg 4x8 kc2 cab4 ks16 nse bo sr bk0 sn l4 grf256 pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, -{{'E', "gemm", {"O", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 4, -1}, {-1, -1, -1}, {-1, 4, -1}, {4, 4, 1}, 
"Oxyz"}, "sB4x2 sB16 sB wg 4x2x8 akr fg 0.25 kc2 nse sr sb32 bk0 bm0 pab", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {64, 2, 16}, {4, 2, 8}, 1, (WGType) 0, 262917, 0, 4096, {4, 4, 4}, {false, false, false}}, {'W', 1, {128}}}, -{{'E', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB32 aB wg 2x8 cab4 ks64 af dw vav bo bk0 sm sn grf256 pab sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {2, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.27535e+06, 215187, 0, 0, 0, 0, 4.64559, 4.65327, 3.47044, 9.89232, 0.105235, 0.0510608, 0.0580013, 1, 1.20879, 1.2012, -2.49667e-15}}}, -{{'E', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16x2 aB16 aB wg 2x1x8 kr af vav bo bk0 sys l4 sr kd", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 8}, 1, (WGType) 0, 261, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20075e+06, 136815, 51380.2, 5732.06, 0, 0, 6.73706, 6.52116, 7.79608, 7.48028, 2.1671, 0.385165, 0.687428, 0.333333, 1.20286, 0, 0}}}, -{{'E', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 16, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "xyI"}, "sB64 sS16 aS wg 2x1x16 ikr af vav sr bk0 bm0 sys pab grf256 l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {2, 1, 16}, 1, (WGType) 0, 4357, 0, 1024, {1, 2, 4}, {false, false, true}}, {'W', 1, {128}}}, -{{'E', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ixy"}, "sB4 sB4 aB wg 
4x8 kc4 cab4 ks8 nse bo sr bk0 sm sn l4 pab", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, -{{'E', "gemm", {"O", "H", "S"}, {"A", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 4, 1}, "xyz"}, "sB8 sB8 sB wg 2x1x16 akr kc8 fg 0.25 nse sr sb32 bk0 bm0 pab", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {32, 4, 8}, {2, 1, 16}, 1, (WGType) 0, 262917, 0, 1024, {32, 4, 4}, {false, false, false}}, {'W', 1, {128}}}, -{{'E', "gemm", {"O", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB16 sB16 aB wg 8x8 cab4 ks16 af dw vav bo bk0 sn grf256 sys l4 pab sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {568500, 1.28115e+06, 0, 0, 0, 0, 5.43293, 5.22848, 6.20609, 17.6026, 0.0197921, 0.0197921, 0, 1, 1.32313, 1.19039, 7.17197e-13}}}, -{{'E', "gemm", {"O", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}, {-1, 8, -1}, {4, 4, 1}, "oxyz"}, "sB4x2 sS2 sB wg 4x2x8 akr fg 0.25 kc2 nse sr sb32 bk0 bm0 pab", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {64, 4, 4}, {4, 2, 8}, 1, (WGType) 0, 262917, 0, 8192, {4, 4, 4}, {false, false, false}}, {'W', 1, {256}}}, -{{'E', "gemm", {"O", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ixy"}, "sB4x2 sB4x2 aB wg 4x8 kc2 cab4 ks16 nse bo sr bk0 sn l4 grf256 pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 
16777216}, {524288, 262144, 16777216}, {32, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, -{{'E', "gemm", {"O", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 4, -1}, {-1, -1, -1}, {-1, 4, -1}, {4, 4, 1}, "Oxyz"}, "sB4x2 sB16 sB wg 4x2x8 akr fg 0.25 kc2 nse sr sb32 bk0 bm0 pab", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {64, 2, 16}, {4, 2, 8}, 1, (WGType) 0, 262917, 0, 4096, {4, 4, 4}, {false, false, false}}, {'W', 1, {128}}}, -{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB64 aB wg 8x4 cab4 ks64 af dw vav bo bk0 sm sn sys pab l4 sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.06234e+06, 445401, 0, 0, 0, 0, 4.62244, 5.31323, 3.78655, 11.4316, 0.08593, 0.0721937, 0.0244972, 1, 1.21089, 1.20155, -5.01322e-15}}}, -{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16x2 aB16 aB wg 2x1x8 kr af vav bo bk0 sys l4 sr kd", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 8}, 1, (WGType) 0, 261, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20626e+06, 136698, 52320.2, 5700.91, 0, 0, 6.73519, 6.52141, 8.28549, 8.11665, 2.16712, 0.382531, 0.689817, 0.333333, 1.20221, 0, 0}}}, -{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB32 aB wg 4x4 cab3 ks64 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, 
{8192, 8192, 16777216}, {16, 4, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 30720, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.24598e+06, 199092, 0, 0, 0, 0, 5.11406, 6.54063, 3.6868, 10.138, 0.114685, 0.0491769, 0.0713602, 0.902881, 1.20724, 1.20086, 6.44189e-15}}}, -{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB16 aB wg 2x4x4 kr cab4 ks32 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 65536, 16384, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.00833e+06, 472991, 67249.4, 65461, 0, 0, 5.08333, 5.12749, 4.89837, 12.0468, 0.0787666, 0.065148, 0.0481217, 1, 1.20614, 1.20116, 2.71563e-15}}}, -{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, 3, -1}, {-1, 16, -1}, {-1, 3, -1}, {-1, 16, -1}, {1, 1, 1}, "xyI"}, "sB64 sS16 aS wg 2x1x16 ikr af vav sr bk0 bm0 sys pab grf256 l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {2, 1, 16}, 1, (WGType) 0, 4357, 0, 1024, {1, 2, 4}, {false, false, true}}, {'W', 1, {128}}}, -{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {4096, 2, -1}, {-1, 2, -1}, {4096, 2, -1}, {-1, 2, -1}, {1, 1, 1}, "xyI"}, "sB64 sS16 aS wg 2x1x16 ikr af vav sr bk0 bm0 sys pab grf256 l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {2, 1, 16}, 1, (WGType) 0, 4357, 0, 1024, {1, 2, 4}, {false, false, true}}, {'W', 1, {128}}}, -{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, 2, 4096}, {4096, 2, -1}, {-1, 2, 4096}, {4096, 2, -1}, {1, 1, 1}, "xyI"}, "sB64 sS16 aS wg 2x1x16 ikr af vav sr bk0 bm0 sys pab grf256 l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 
16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {2, 1, 16}, 1, (WGType) 0, 4357, 0, 1024, {1, 2, 4}, {false, false, true}}, {'W', 1, {128}}}, -{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, 1, -1}, {-1, 1, -1}, {-1, 1, -1}, {-1, 1, -1}, {1, 1, 1}, "hxyI"}, "sB64 sS16 aS wg 2x1x16 ikr af vav sr bk0 bm0 sys pab grf256 l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {2, 1, 16}, 1, (WGType) 0, 4357, 0, 1024, {1, 2, 4}, {false, false, true}}, {'W', 1, {128}}}, -{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ixy"}, "sB4 sB4 aB wg 4x8 kc4 cab4 ks8 nse bo sr bk0 sm sn l4 pab", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, -{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, 1, -1}, {-1, 1, 512}, {-1, 1, -1}, {-1, 1, 512}, {1, 1, 1}, "H"}, "aB128x2 aB128x2 aB wg 16x1 wx2 nse hi ar sb128 bk0 dot wt", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {16384, 16384, 16777216}, {16384, 16384, 16777216}, {1, 1, 128}, {16, 1, 1}, 2, (WGType) 1, 4194561, 0, 0, {1, 2, 4}, {true, true, true}}, {'W', 1, {1}}}, -{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, 1, 513}, {1024, 1, -1}, {-1, 1, 513}, {1024, 1, -1}, {1, 1, 1}, "H"}, "aB128 aB64x2 aB wg 8x1x2 ikr nse hi ar sb128 bk0 grf256 dot wt", {8, (LoopType) 0, 256, {(LoopType) 144, (LoopType) 255, (LoopType) 2}, {16384, 16384, 16777216}, {16384, 16384, 16777216}, {1, 1, 128}, {8, 1, 2}, 1, (WGType) 0, 4198661, 0, 256, {1, 2, 4}, {true, true, true}}, {'W', 1, {1}}}, -{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {1025, 1, 4097}, {-1, 1, -1}, {1025, 1, 4097}, {-1, 1, -1}, {1, 1, 1}, "H"}, "aB128 aB64x2 aB 
wg 4x1x8 ikr nse hi ar sb128 bk0 dot wt", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 2}, {32768, 16384, 16777216}, {32768, 16384, 16777216}, {2, 1, 128}, {4, 1, 8}, 1, (WGType) 0, 4198661, 0, 128, {1, 2, 4}, {true, true, true}}, {'W', 1, {2}}}, -{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {1025, 1, 513}, {-1, 1, 4096}, {1025, 1, 513}, {-1, 1, 4096}, {1, 1, 1}, "H"}, "aB128x2 aB128x2 aB wg 16x1 wx2 nse hi ar sb128 bk0 dot wt", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {16384, 16384, 16777216}, {16384, 16384, 16777216}, {1, 1, 128}, {16, 1, 1}, 2, (WGType) 1, 4194561, 0, 0, {1, 2, 4}, {true, true, true}}, {'W', 1, {1}}}, -{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, 2, -1}, {4095, 2, 4095}, {-1, 2, -1}, {4095, 2, 4095}, {4, 4, 1}, "Hpxy"}, "sB128 sB64 aB wg 8x1x4 ikr nse hi ar sb128 bk0 dot wt", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 2}, {32768, 32768, 16777216}, {32768, 32768, 16777216}, {2, 2, 128}, {8, 1, 4}, 1, (WGType) 0, 4198661, 0, 256, {4, 4, 4}, {false, false, true}}, {'W', 1, {128}}}, -{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "aB32 aB16 aB ca3 ks64 wg 2x4x4 kr sys dw af k192 grf256 sm vav di dm sr bk0 cc fm", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 192}, {2, 4, 4}, 1, (WGType) 1, 5, 12288, 12288, {4, 4, 2}, {true, true, true}}, {'E', 17, {4.488e+06, 120208, 122348, 327.657, 0, 0, 3.41911, 6.33998, 2.77024, 6.84323, 0.0582208, 0.0145417, 0.0574819, 0.511227, 1.20562, 1.20018, 5.11052e-14}}}, -{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Isxy"}, "sS32x2 sB16 aB wg 16x2 cb4 ks32 xaf dw vav bo sr bk0 sn dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, 
{262144, 786432, 16777216}, {262144, 786432, 16777216}, {16, 48, 32}, {16, 2, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {995629, 475244, 0, 0, 0, 0, 2.57281, 4.9973, 6.41839, 16.8374, 0.018098, 0.0100962, 0.0118445, 0.996594, 1.41614, 1.19695, 9.60059e-13}}}, -{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, 2048}, {-1, 1024, -1}, {4, 4, 1}, "Ixy"}, "sS32x2 sB16 aB wg 16x2 cb4 ks32 xaf dw vav bo sr bk0 sn dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {16, 2, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {985865, 449373, 0, 0, 0, 0, 2.52021, 5.18432, 6.41111, 16.2959, 0.0196104, 0.0078899, 0.0176455, 0.975623, 1.37619, 1.19224, 7.58759e-13}}}, -{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Ixy"}, "sS32 sB16 aB wg 16x2 cb4 ks32 xaf dw vav bo sr bk0 sn dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 16777216}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 257, 32768, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {996386, 490462, 0, 0, 0, 0, 2.66142, 5.16946, 6.13711, 17.1234, 0.0173418, 0.00858928, 0.0131118, 0.74711, 1.38311, 1.23925, 7.08247e-13}}}, -{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Isxy"}, "sB32 sB16 aB wg 8x4 cab3x2 ks32 xaf fx dw vav bo sr bk0 dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 786432, 16777216}, {262144, 786432, 16777216}, {16, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 61440, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.05802e+06, 544339, 0, 0, 0, 0, 3.68162, 5.47347, 6.43127, 16.927, 0.0189051, 0.00830613, 
0.0176655, 0.737745, 1.33714, 1.26074, -3.55719e-14}}}, -{{'E', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, 33, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "sB32 sB16 aB wg 8x4 cab4x2 ks32 xaf st dw vav bo sr bk0 dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {966409, 495159, 0, 0, 0, 0, 3.17717, 6.26529, 6.93941, 17.3959, 0.0210983, 0.0147392, 0.00938379, 1, 1.72262, 1.08677, 5.49901e-12}}}, +{{'E', "gemm", {"H", "[OH]", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB16 aB wg 8x4 cab4 ks32 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.03504e+06, 483759, 0, 0, 0, 0, 5.14655, 5.4381, 5.95309, 14.8916, 0.0340942, 0.0152122, 0.0258839, 1, 1.27788, 1.19062, 3.54023e-13}}}, {{'E', "gemm", {"O", "O", "I"}, {"A4#8,32", "B32", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyzdsm"}, "sB16 sB16 sb fs wg 4x4 bo acb bk8192 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 786432, 8192}, {8192, 8192, 8192}, {32, 48, 32}, {4, 4, 1}, 1, (WGType) 1, 256, 32256, 0, {128, 128, 4}, {false, false, false}}, {'W', 1, {1536}}}, {{'E', "gemm", {"O", "O", "I"}, {"A4#8,32", "B32", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyzDsm"}, "sB16 sB16 sb fs wg 8x4 bo acb bk8192 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 786432, 8192}, {8192, 8192, 8192}, {32, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 256, 61440, 0, {128, 128, 
4}, {false, false, false}}, {'W', 1, {1536}}}, {{'E', "gemm", {"O", "O", "I"}, {"A4#8,32", "B32", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "sm"}, "ab16x3 ab16x3 ab fs sc bo acb bk4096 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 4096}, {8192, 8192, 4096}, {32, 32, 32}, {4, 4, 1}, 2, (WGType) 1, 256, 43008, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {1024}}}, @@ -401,6 +368,42 @@ auto _CATALOG_ = kcatalog::toArray({ {{'E', "gemm", {"S", "S", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "v"}, "aS4 aB4x2 aB wg 2x1x16 kr kc4 nse bo sb64 bk0 kb sr", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 4}, {2, 1, 16}, 1, (WGType) 0, 263, 0, 2048, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.15281e+06, 253100, 56985.6, 10434.6, 0, 0, 15.075, 15.6287, 30.852, 29.2407, 6.13546, 4.08401, 0.367041, 0.496243, 1.00145, 0, 0}}}, {{'E', "gemm", {"S", "S", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "qp"}, "aS4x2 aB8x2 aP wg 8x4 kc4 cb4 ks8 nse bk0 grf256 bo sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {981989, 555240, 0, 0, 0, 0, 15.2859, 19.3328, 6.46524, 20.9306, 0.131163, 0.116872, 0.0205996, 0.805001, 1.19768, 1.00218, 8.63883e-12}}}, {{'E', "gemm", {"S", "S", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aS4x2 aB8/4x2 aB wg 8x4 kc4 cb4 ks8 nse bk0 grf256 bo sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {true, 
true, true}}, {'E', 17, {984519, 554553, 0, 0, 0, 0, 15.6433, 19.3034, 6.32057, 17.386, 0.131163, 0.116872, 0.020599, 0.810127, 1.20102, 1.02803, 6.01287e-12}}}, +{{'E', "gemm", {"[OB]", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB16 sB16 aB wg 4x16 cab4 ks32 af dw vav bo bk0 sn grf256 pab sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {8192, 8192, 16777216}, {32, 8, 32}, {4, 16, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {540457, 1.10675e+06, 0, 0, 0, 0, 5.61877, 5.78531, 5.50943, 14.7763, 0.0373088, 0.0373088, 0, 1, 1.21298, 1.20167, -5.60976e-15}}}, +{{'E', "gemm", {"[OB]", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}, {-1, 8, -1}, {4, 4, 1}, "oxyz"}, "sB4x2 sS2 sB wg 4x2x8 akr fg 0.25 kc2 nse sr sb32 bk0 bm0 pab", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {64, 4, 4}, {4, 2, 8}, 1, (WGType) 0, 262917, 0, 8192, {4, 4, 4}, {false, false, false}}, {'W', 1, {256}}}, +{{'E', "gemm", {"[OB]", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ixy"}, "sB4x2 sB4x2 aB wg 4x8 kc2 cab4 ks16 nse bo sr bk0 sn l4 grf256 pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, +{{'E', "gemm", {"[OB]", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 4, -1}, {-1, -1, -1}, {-1, 4, -1}, {4, 4, 1}, "Oxyz"}, "sB4x2 sB16 sB wg 4x2x8 akr fg 0.25 kc2 nse sr sb32 bk0 bm0 pab", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {64, 2, 16}, {4, 2, 8}, 1, (WGType) 0, 262917, 0, 4096, {4, 4, 4}, 
{false, false, false}}, {'W', 1, {128}}}, +{{'E', "gemm", {"[OB]", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB32 aB wg 2x8 cab4 ks64 af dw vav bo bk0 sm sn grf256 pab sys l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {2, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.27535e+06, 215187, 0, 0, 0, 0, 4.64559, 4.65327, 3.47044, 9.89232, 0.105235, 0.0510608, 0.0580013, 1, 1.20879, 1.2012, -2.49667e-15}}}, +{{'E', "gemm", {"[OB]", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16x2 aB16 aB wg 2x1x8 kr af vav bo bk0 sys l4 sr kd", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 8}, 1, (WGType) 0, 261, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20075e+06, 136815, 51380.2, 5732.06, 0, 0, 6.73706, 6.52116, 7.79608, 7.48028, 2.1671, 0.385165, 0.687428, 0.333333, 1.20286, 0, 0}}}, +{{'E', "gemm", {"[OB]", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 16, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "xyI"}, "sB64 sS16 aS wg 2x1x16 ikr af vav sr bk0 bm0 sys pab grf256 l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {2, 1, 16}, 1, (WGType) 0, 4357, 0, 1024, {1, 2, 4}, {false, false, true}}, {'W', 1, {128}}}, +{{'E', "gemm", {"[OB]", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ixy"}, "sB4 sB4 aB wg 4x8 kc4 cab4 ks8 nse bo sr bk0 sm sn l4 pab", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 2, 4}, {false, 
false, true}}, {'W', 1, {512}}}, +{{'E', "gemm", {"[OB]", "[SB]", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xy"}, "sB4x2 sB4x2 aB wg 4x8 kc2 cab4 ks16 nse bo sr bk0 sn l4 grf256 pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {1, 4, 4}, {false, false, true}}, {'W', 1, {512}}}, +{{'E', "gemm", {"[OB]", "[SB]", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB4x2 aS4x2 aP wg 4x8 kc4 ca4 ks8 nse bk0 sm grf256 bo sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.08113e+06, 644110, 0, 0, 0, 0, 17.8779, 12.8384, 6.35841, 21.2735, 0.12828, 0.125356, 0.0107346, 0.963002, 1.21137, 1.04637, 4.08712e-12}}}, +{{'E', "gemm", {"[OB]", "[SB]", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ixy"}, "sB4 sB4 aB wg 4x8 kc4 cab4 ks8 nse bo sr bk0 sm sn l4 pab", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, +{{'E', "gemm", {"[OH]", "H", "S"}, {"A", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 4, 1}, "xyz"}, "sB8 sB8 sB wg 2x1x16 akr kc8 fg 0.25 nse sr sb32 bk0 bm0 pab", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {32, 4, 8}, {2, 1, 16}, 1, (WGType) 0, 262917, 0, 1024, {32, 4, 4}, {false, false, false}}, {'W', 1, {128}}}, +{{'E', "gemm", {"[OH]", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, 
-1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB16 sB16 aB wg 8x8 cab4 ks16 af dw vav bo bk0 sn grf256 sys l4 pab sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {568500, 1.28115e+06, 0, 0, 0, 0, 5.43293, 5.22848, 6.20609, 17.6026, 0.0197921, 0.0197921, 0, 1, 1.32313, 1.19039, 7.17197e-13}}}, +{{'E', "gemm", {"[OH]", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}, {-1, 8, -1}, {4, 4, 1}, "oxyz"}, "sB4x2 sS2 sB wg 4x2x8 akr fg 0.25 kc2 nse sr sb32 bk0 bm0 pab", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {64, 4, 4}, {4, 2, 8}, 1, (WGType) 0, 262917, 0, 8192, {4, 4, 4}, {false, false, false}}, {'W', 1, {256}}}, +{{'E', "gemm", {"[OH]", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ixy"}, "sB4x2 sB4x2 aB wg 4x8 kc2 cab4 ks16 nse bo sr bk0 sn l4 grf256 pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, +{{'E', "gemm", {"[OH]", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 4, -1}, {-1, -1, -1}, {-1, 4, -1}, {4, 4, 1}, "Oxyz"}, "sB4x2 sB16 sB wg 4x2x8 akr fg 0.25 kc2 nse sr sb32 bk0 bm0 pab", {8, (LoopType) 0, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {64, 2, 16}, {4, 2, 8}, 1, (WGType) 0, 262917, 0, 4096, {4, 4, 4}, {false, false, false}}, {'W', 1, {128}}}, +{{'E', "gemm", {"[OH]", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB64 aB wg 8x4 cab4 ks64 af dw vav bo bk0 sm sn sys pab l4 sr", {8, 
(LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {131072, 131072, 16777216}, {8192, 8192, 16777216}, {8, 8, 64}, {8, 4, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.06234e+06, 445401, 0, 0, 0, 0, 4.62244, 5.31323, 3.78655, 11.4316, 0.08593, 0.0721937, 0.0244972, 1, 1.21089, 1.20155, -5.01322e-15}}}, +{{'E', "gemm", {"[OH]", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16x2 aB16 aB wg 2x1x8 kr af vav bo bk0 sys l4 sr kd", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {131072, 16384, 16777216}, {8192, 8192, 16777216}, {8, 1, 16}, {2, 1, 8}, 1, (WGType) 0, 261, 0, 2048, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20626e+06, 136698, 52320.2, 5700.91, 0, 0, 6.73519, 6.52141, 8.28549, 8.11665, 2.16712, 0.382531, 0.689817, 0.333333, 1.20221, 0, 0}}}, +{{'E', "gemm", {"[OH]", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB32 aB wg 4x4 cab3 ks64 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 65536, 16777216}, {8192, 8192, 16777216}, {16, 4, 64}, {4, 4, 1}, 1, (WGType) 1, 257, 30720, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.24598e+06, 199092, 0, 0, 0, 0, 5.11406, 6.54063, 3.6868, 10.138, 0.114685, 0.0491769, 0.0713602, 0.902881, 1.20724, 1.20086, 6.44189e-15}}}, +{{'E', "gemm", {"[OH]", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB32 sB16 aB wg 2x4x4 kr cab4 ks32 af dw vav bo bk0 sm sn grf256 sys pab l4 sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 2}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 65536, 16384, {2, 2, 4}, {false, false, true}}, {'E', 17, {1.00833e+06, 472991, 67249.4, 65461, 0, 0, 5.08333, 5.12749, 
4.89837, 12.0468, 0.0787666, 0.065148, 0.0481217, 1, 1.20614, 1.20116, 2.71563e-15}}}, +{{'E', "gemm", {"[OH]", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, 3, -1}, {-1, 16, -1}, {-1, 3, -1}, {-1, 16, -1}, {1, 1, 1}, "xyI"}, "sB64 sS16 aS wg 2x1x16 ikr af vav sr bk0 bm0 sys pab grf256 l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {2, 1, 16}, 1, (WGType) 0, 4357, 0, 1024, {1, 2, 4}, {false, false, true}}, {'W', 1, {128}}}, +{{'E', "gemm", {"[OH]", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {4096, 2, -1}, {-1, 2, -1}, {4096, 2, -1}, {-1, 2, -1}, {1, 1, 1}, "xyI"}, "sB64 sS16 aS wg 2x1x16 ikr af vav sr bk0 bm0 sys pab grf256 l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {2, 1, 16}, 1, (WGType) 0, 4357, 0, 1024, {1, 2, 4}, {false, false, true}}, {'W', 1, {128}}}, +{{'E', "gemm", {"[OH]", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, 2, 4096}, {4096, 2, -1}, {-1, 2, 4096}, {4096, 2, -1}, {1, 1, 1}, "xyI"}, "sB64 sS16 aS wg 2x1x16 ikr af vav sr bk0 bm0 sys pab grf256 l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {2, 1, 16}, 1, (WGType) 0, 4357, 0, 1024, {1, 2, 4}, {false, false, true}}, {'W', 1, {128}}}, +{{'E', "gemm", {"[OH]", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, 1, -1}, {-1, 1, -1}, {-1, 1, -1}, {-1, 1, -1}, {1, 1, 1}, "hxyI"}, "sB64 sS16 aS wg 2x1x16 ikr af vav sr bk0 bm0 sys pab grf256 l4", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {2, 1, 16}, 1, (WGType) 0, 4357, 0, 1024, {1, 2, 4}, {false, false, true}}, {'W', 1, {128}}}, +{{'E', "gemm", {"[OH]", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "ixy"}, "sB4 sB4 aB wg 4x8 kc4 cab4 ks8 nse bo sr bk0 sm sn l4 
pab", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, +{{'E', "gemm", {"[OH]", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, 1, -1}, {-1, 1, 512}, {-1, 1, -1}, {-1, 1, 512}, {1, 1, 1}, "H"}, "aB128x2 aB128x2 aB wg 16x1 wx2 nse hi ar sb128 bk0 dot wt", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {16384, 16384, 16777216}, {16384, 16384, 16777216}, {1, 1, 128}, {16, 1, 1}, 2, (WGType) 1, 4194561, 0, 0, {1, 2, 4}, {true, true, true}}, {'W', 1, {1}}}, +{{'E', "gemm", {"[OH]", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, 1, 513}, {1024, 1, -1}, {-1, 1, 513}, {1024, 1, -1}, {1, 1, 1}, "H"}, "aB128 aB64x2 aB wg 8x1x2 ikr nse hi ar sb128 bk0 grf256 dot wt", {8, (LoopType) 0, 256, {(LoopType) 144, (LoopType) 255, (LoopType) 2}, {16384, 16384, 16777216}, {16384, 16384, 16777216}, {1, 1, 128}, {8, 1, 2}, 1, (WGType) 0, 4198661, 0, 256, {1, 2, 4}, {true, true, true}}, {'W', 1, {1}}}, +{{'E', "gemm", {"[OH]", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {1025, 1, 4097}, {-1, 1, -1}, {1025, 1, 4097}, {-1, 1, -1}, {1, 1, 1}, "H"}, "aB128 aB64x2 aB wg 4x1x8 ikr nse hi ar sb128 bk0 dot wt", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 2}, {32768, 16384, 16777216}, {32768, 16384, 16777216}, {2, 1, 128}, {4, 1, 8}, 1, (WGType) 0, 4198661, 0, 128, {1, 2, 4}, {true, true, true}}, {'W', 1, {2}}}, +{{'E', "gemm", {"[OH]", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {1025, 1, 513}, {-1, 1, 4096}, {1025, 1, 513}, {-1, 1, 4096}, {1, 1, 1}, "H"}, "aB128x2 aB128x2 aB wg 16x1 wx2 nse hi ar sb128 bk0 dot wt", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {16384, 16384, 16777216}, {16384, 16384, 16777216}, {1, 1, 128}, {16, 1, 1}, 2, (WGType) 1, 4194561, 0, 0, {1, 2, 4}, {true, true, true}}, {'W', 1, {1}}}, +{{'E', "gemm", {"[OH]", "H", "S"}, {"T", "N", 
"N"}}, {-1, -1, {-1, 2, -1}, {4095, 2, 4095}, {-1, 2, -1}, {4095, 2, 4095}, {4, 4, 1}, "Hpxy"}, "sB128 sB64 aB wg 8x1x4 ikr nse hi ar sb128 bk0 dot wt", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 2}, {32768, 32768, 16777216}, {32768, 32768, 16777216}, {2, 2, 128}, {8, 1, 4}, 1, (WGType) 0, 4198661, 0, 256, {4, 4, 4}, {false, false, true}}, {'W', 1, {128}}}, +{{'E', "gemm", {"[OH]", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "I"}, "aB32 aB16 aB ca3 ks64 wg 2x4x4 kr sys dw af k192 grf256 sm vav di dm sr bk0 cc fm", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {8192, 8192, 2048}, {8192, 8192, 2048}, {16, 16, 192}, {2, 4, 4}, 1, (WGType) 1, 5, 12288, 12288, {4, 4, 2}, {true, true, true}}, {'E', 17, {4.488e+06, 120208, 122348, 327.657, 0, 0, 3.41911, 6.33998, 2.77024, 6.84323, 0.0582208, 0.0145417, 0.0574819, 0.511227, 1.20562, 1.20018, 5.11052e-14}}}, +{{'E', "gemm", {"[OH]", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Isxy"}, "sS32x2 sB16 aB wg 16x2 cb4 ks32 xaf dw vav bo sr bk0 sn dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 786432, 16777216}, {262144, 786432, 16777216}, {16, 48, 32}, {16, 2, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {995629, 475244, 0, 0, 0, 0, 2.57281, 4.9973, 6.41839, 16.8374, 0.018098, 0.0100962, 0.0118445, 0.996594, 1.41614, 1.19695, 9.60059e-13}}}, +{{'E', "gemm", {"[OH]", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, 2048}, {-1, 1024, -1}, {4, 4, 1}, "Ixy"}, "sS32x2 sB16 aB wg 16x2 cb4 ks32 xaf dw vav bo sr bk0 sn dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {16, 2, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {false, false, true}}, {'E', 
17, {985865, 449373, 0, 0, 0, 0, 2.52021, 5.18432, 6.41111, 16.2959, 0.0196104, 0.0078899, 0.0176455, 0.975623, 1.37619, 1.19224, 7.58759e-13}}}, +{{'E', "gemm", {"[OH]", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Ixy"}, "sS32 sB16 aB wg 16x2 cb4 ks32 xaf dw vav bo sr bk0 sn dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 16777216}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 257, 32768, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {996386, 490462, 0, 0, 0, 0, 2.66142, 5.16946, 6.13711, 17.1234, 0.0173418, 0.00858928, 0.0131118, 0.74711, 1.38311, 1.23925, 7.08247e-13}}}, +{{'E', "gemm", {"[OH]", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Isxy"}, "sB32 sB16 aB wg 8x4 cab3x2 ks32 xaf fx dw vav bo sr bk0 dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 786432, 16777216}, {262144, 786432, 16777216}, {16, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 61440, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.05802e+06, 544339, 0, 0, 0, 0, 3.68162, 5.47347, 6.43127, 16.927, 0.0189051, 0.00830613, 0.0176655, 0.737745, 1.33714, 1.26074, -3.55719e-14}}}, +{{'E', "gemm", {"[OH]", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, 33, -1}, {-1, -1, -1}, {4, 4, 1}, "xyI"}, "sB32 sB16 aB wg 8x4 cab4x2 ks32 xaf st dw vav bo sr bk0 dm grf256 sys pab", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {262144, 524288, 16777216}, {262144, 524288, 16777216}, {16, 32, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {966409, 495159, 0, 0, 0, 0, 3.17717, 6.26529, 6.93941, 17.3959, 0.0210983, 0.0147392, 0.00938379, 1, 1.72262, 1.08677, 5.49901e-12}}}, {{'E', "gemm", {"[FO]", "O", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, 
-1, -1}, {-1, -1, -1}, {4, 4, 1}, "Ipsxy"}, "sB32 sB32 aB wg 8x8 cab3x2 ks32 af dw vav hi sr bk0 dm grf256 sys acb pab rc0", {8, (LoopType) 0, 256, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {524288, 393216, 16777216}, {524288, 393216, 16777216}, {32, 24, 32}, {8, 8, 1}, 1, (WGType) 1, 257, 43008, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {3.88957e+06, 2.97727e+06, 0, 0, 0, 0, 2.09294, 2.00406, 4.52375, 15.4399, 0.0125151, 0.0125151, 0, 1, 1.272, 1.20602, 2.32202e-13}}}, {{'E', "gemm", {"[FO]", "O", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 4, -1}, {-1, -1, -1}, {-1, 4, -1}, {4, 4, 1}, "Ipqxyz"}, "sB32 sB32 sB wg 2x1x16 akr fg 0.5 acb sr sb32 bk0 bm0 grf256 pab sys dm rc0", {8, (LoopType) 0, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {64, 4, 32}, {2, 1, 16}, 1, (WGType) 0, 525061, 0, 2048, {4, 4, 4}, {false, false, false}}, {'E', 17, {1.02104e+06, 24262.1, 391695, -22645.2, 0, 0, 1.81043, 16.2483, 6.79395, 16.9189, 0.130633, 0.0432557, 0.114384, 0.798165, 1.21069, 1.21079, -7.43355e-14}}}, {{'E', "gemm", {"[FO]", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "Ipsxy"}, "sB32 sB32 aB wg 8x8 cab4 ks32 af dw vav hi sr bk0 sm dm grf256 sys acb pab", {8, (LoopType) 0, 256, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {524288, 393216, 16777216}, {524288, 393216, 16777216}, {32, 24, 128}, {8, 8, 1}, 1, (WGType) 1, 257, 57344, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.72221e+06, 2.4081e+06, 0, 0, 0, 0, 1.90767, 2.47059, 4.96037, 15.1649, 0.0137524, 0.0137524, 0, 1, 1.25118, 1.20908, -4.599e-14}}}, @@ -413,6 +416,7 @@ auto _CATALOG_ = kcatalog::toArray({ {{'E', "gemm", {"[SB]", "[SB]", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB16 sB16 aB wg 16x4 cab4 ks16 af dw vav hi bk0 sn grf256 sys pab sr", {8, (LoopType) 0, 256, {(LoopType) 144, (LoopType) 255, 
(LoopType) 255}, {262144, 1048576, 16777216}, {8192, 8192, 16777216}, {16, 64, 16}, {16, 4, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {555289, 1.12387e+06, 0, 0, 0, 0, 11.0812, 12.062, 6.31783, 16.9321, 0.033502, 0.033502, 0, 0.924238, 1.20788, 1.20316, -1.03588e-14}}}, {{'E', "gemm", {"[SB]", "[SB]", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xypIn"}, "sB32 sB32 aB wg 8x8 cab4 ks32 af dw vav hi bk0 sn sys pab sr", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 32}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {1.01712e+06, 501677, 0, 0, 0, 0, 11.2352, 11.0061, 6.00586, 15.9727, 0.0368901, 0.0329381, 0.0158743, 0.872583, 1.26733, 1.18539, 7.97223e-13}}}, {{'E', "gemm", {"[SB]", "[SB]", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB16x2 sB16x2 aB wg 8x8 cab4 ks32 af dw vav hi bk0 sn grf256 sys pab sr", {8, (LoopType) 0, 256, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {8, 8, 1}, 1, (WGType) 1, 257, 49152, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {538723, 893443, 0, 0, 0, 0, 10.7801, 9.8182, 5.03684, 13.5344, 0.0761836, 0.0761836, 0, 0.79624, 1.20514, 1.20124, -4.36232e-15}}}, +{{'E', "gemm", {"[OH]", "[OH]", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ixy"}, "sB16 sB16 aB wg 8x8 cab4 ks16 af dw vav bo bk0 sn grf256 sys l4 pab sr", {8, (LoopType) 0, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {2, 2, 4}, {false, false, true}}, {'E', 17, {568500, 1.28115e+06, 0, 0, 0, 0, 5.43293, 5.22848, 6.20609, 17.6026, 0.0197921, 0.0197921, 0, 1, 
1.32313, 1.19039, 7.17197e-13}}}, {{'E', "gemm", {"[SB]", "[SB]", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB16 sB16 aB wg 8x8 cab4 ks16 af dw vav hi bk0 grf256 sys pab sr", {8, (LoopType) 0, 256, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 16}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {554386, 926318, 0, 0, 0, 0, 11.2535, 11.3159, 6.30666, 17.7295, 0.0362105, 0.0362105, 0, 0.820846, 1.20543, 1.20182, -7.27096e-15}}}, {{'E', "gemm", {"[SB]", "[SB]", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyIn"}, "sB16 sB32 aB wg 8x8 cab4 ks32 af dw vav hi bk0 sys pab sr", {8, (LoopType) 0, 128, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 32}, {8, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {880413, 426415, 0, 0, 0, 0, 11.314, 12.5284, 6.07049, 16.1179, 0.0387733, 0.0216213, 0.024149, 0.726888, 1.2321, 1.19468, 2.81285e-13}}}, {{'E', "gemm", {"[SB]", "[SB]", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xyI"}, "sB16x2 sB16x2 aB wg 8x4 cab4 ks16 af dw vav hi bk0 grf256 sys pab sr", {8, (LoopType) 0, 256, {(LoopType) 144, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 16}, {8, 4, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 4, 4}, {false, false, true}}, {'E', 17, {873052, 300083, 0, 0, 0, 0, 11.3512, 11.8331, 6.13255, 15.5383, 0.0704048, 0.0182551, 0.0658259, 0.497503, 1.20284, 1.20066, 6.06715e-15}}}, @@ -541,9 +545,9 @@ auto _CATALOG_ = kcatalog::toArray({ {{'F', "gemm", {"B", "F", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "#I"}, "aB16+m32@32 aB32 aB wg 2x8 af vav li nmk pt sr br ca3 bk0 sys kv 
dm afb l4", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {1048576, 32768, 16777216}, {1048576, 32768, 32}, {64, 2, 32}, {2, 8, 1}, 1, (WGType) 1, 409, 24576, 0, {4, 2, 4}, {true, true, true}}, {'E', 17, {1.32162e+06, 161954, 0, 0, 2.32817e+06, 0, 0.71806, 4.15517, 0.786689, 1.40778, 0.0341164, 0.0131941, 0.0256486, 0.947188, 1.39057, 0.987284, 5.0128e-12}}}, {{'F', "gemm", {"B", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.15757e+06, -134939, -26216.3, 226647, 2.94093e+06, 1.99475e+06, 0.459989, 0.701382, 0.869118, 1.51365, 0.0198203, 0.0153374, 0.00626234, 0.558687, 1.30133, 0.935668, 6.0334e-12}}}, {{'F', "gemm", {"B", "F", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20164e+06, 54541, 0, 0, 0, 0, 0.422505, 7.31873, 4.22063, 10.9759, 0.192755, 0.0407408, 0.156727, 0.693649, 1.00294, 0, 0}}}, -{{'F', "gemm", {"B", "O", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "#I"}, "aB16+m32@32 aB32 aB wg 2x8 af vav li nmk pt sr br ca3 bk0 sys kv dm afb l4", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {1048576, 32768, 16777216}, {1048576, 32768, 32}, {64, 2, 32}, {2, 8, 1}, 1, (WGType) 1, 409, 24576, 0, {4, 
2, 4}, {true, true, true}}, {'E', 17, {1.32162e+06, 161954, 0, 0, 2.32817e+06, 0, 0.71806, 4.15517, 0.786689, 1.40778, 0.0341164, 0.0131941, 0.0256486, 0.947188, 1.39057, 0.987284, 5.0128e-12}}}, -{{'F', "gemm", {"B", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.15757e+06, -134939, -26216.3, 226647, 2.94093e+06, 1.99475e+06, 0.459989, 0.701382, 0.869118, 1.51365, 0.0198203, 0.0153374, 0.00626234, 0.558687, 1.30133, 0.935668, 6.0334e-12}}}, -{{'F', "gemm", {"B", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20164e+06, 54541, 0, 0, 0, 0, 0.422505, 7.31873, 4.22063, 10.9759, 0.192755, 0.0407408, 0.156727, 0.693649, 1.00294, 0, 0}}}, +{{'F', "gemm", {"B", "[OB]", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, "#I"}, "aB16+m32@32 aB32 aB wg 2x8 af vav li nmk pt sr br ca3 bk0 sys kv dm afb l4", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {1048576, 32768, 16777216}, {1048576, 32768, 32}, {64, 2, 32}, {2, 8, 1}, 1, (WGType) 1, 409, 24576, 0, {4, 2, 4}, {true, true, true}}, {'E', 17, {1.32162e+06, 161954, 0, 0, 2.32817e+06, 0, 0.71806, 4.15517, 0.786689, 1.40778, 0.0341164, 0.0131941, 0.0256486, 0.947188, 1.39057, 0.987284, 
5.0128e-12}}}, +{{'F', "gemm", {"B", "[OB]", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.15757e+06, -134939, -26216.3, 226647, 2.94093e+06, 1.99475e+06, 0.459989, 0.701382, 0.869118, 1.51365, 0.0198203, 0.0153374, 0.00626234, 0.558687, 1.30133, 0.935668, 6.0334e-12}}}, +{{'F', "gemm", {"B", "[OB]", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20164e+06, 54541, 0, 0, 0, 0, 0.422505, 7.31873, 4.22063, 10.9759, 0.192755, 0.0407408, 0.156727, 0.693649, 1.00294, 0, 0}}}, {{'F', "gemm", {"D", "D", "D"}, {"A", "B", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8+B8@16 aB8+B8@16 aB nse grf256 wg 4x8 bo pt kc8 sb256 bk0 br sr", {16, (LoopType) 255, 256, {(LoopType) 192, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {8192, 8192, 16777216}, {32, 32, 8}, {4, 8, 1}, 1, (WGType) 1, 256, 0, 0, {128, 128, 8}, {true, true, true}}, {'W', 1, {1024}}}, {{'F', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+B8@24 aS8x2+S8@24 aB wg 4x8 kc8 nse hi pt sb256 bk0 sn grf256 br sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {8192, 8192, 
16777216}, {32, 8, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, true}}, {'E', 17, {915642, 488057, 0, 0, 0, 0, 2.62682, 4.67056, 1.01353, 1.76192, 0.0687398, 0.0687398, 0, 0.998364, 1.80644, 1.08579, 3.08664e-11}}}, {{'F', "gemm", {"D", "D", "D"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB32+B16@32 aS16+S8@32 aB wg 4x8 kc16 nse hi pt sb256 bk0 sn grf256 br sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 8}, {true, true, true}}, {'E', 17, {887309, 574758, 0, 0, 0, 0, 4.77569, 4.82861, 0.536993, 1.65054, 0.0889844, 0.0889844, 0, 1, 1.65309, 1.06232, 1.19911e-11}}}, @@ -740,32 +744,9 @@ auto _CATALOG_ = kcatalog::toArray({ {{'F', "gemm", {"H", "H", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Is"}, "aS16 aS16 aB sys grf256 cab2 wg 4x4 l4 sr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {48, 32, 16}, {4, 4, 1}, 1, (WGType) 1, 257, 20480, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.17895e+06, 343529, 0, 0, 0, 0, 1.63411, 1.77325, 1.00531, 1.48275, 0.0145617, 0.000936039, 0.0155971, 0.877282, 1.01034, 1.0048, 9.67486e-14}}}, {{'F', "gemm", {"H", "H", "S"}, {"T", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABqI"}, "am32+S1,32@64 av32+B32@64 aS cs sys grf256 af wg 8x4 bo sb256 sm bk0 sr", {16, (LoopType) 255, 256, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 16777216}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {765291, 1.00234e+06, 0, 0, 0, 0, 0.723688, 0.663141, 1.08538, 2.05438, 0.00434664, 0.00434664, 0, 1, 1.8693, 1.21785, 3.96104e-12}}}, {{'F', "gemm", {"H", "H", "S"}, {"T", "T", 
"N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "i"}, "aS8x2+m16@20 aB8+m16@20 aS wg 8x4 kc8 nse hi pt sb32 bk0 sm sn grf256 kv afb l4 sr", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 32}, {16, 32, 8}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {971283, 550381, 0, 0, 1.25911e+07, 8.13466e+06, 2.33425, 1.5484, 4.45361, 4.80392, 0.0689395, 0.0689395, 0, 1, 1.00725, 0.817916, 1.37423e-12}}}, -{{'F', "gemm", {"H", "O", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 8, -1}, {8, 8, 1}, "ABI"}, "av16+B16@48 am16x2 aB wg 8x2 af vav li sr br sb256 bk0 sys ska rr wx4 kv afb pt", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {1048576, 65536, 16777216}, {1048576, 65536, 32}, {64, 4, 16}, {8, 2, 1}, 4, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {879483, 350341, 0, 0, 0, 0, 0.458466, 18.3391, 0.732099, 2.00944, 0.0538243, 0.0538243, 0, 0.0465505, 1.08817, 1.00259, 3.38512e-15}}}, -{{'F', "gemm", {"H", "O", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav hi pt sr br bk0 grf256 kv afb sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.43314e+06, -85436.7, -21480.5, 122461, 4.34176e+06, 2.37568e+06, 0.844572, 0.876066, 0.58704, 1.33341, 0.0317939, 0.0288168, 0.00938048, 0.965671, 1.25852, 0.881513, 1.24288e-11}}}, -{{'F', "gemm", {"H", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 1, 2}, "ABI"}, "at16+m32@64 aB16 aB wg 16x2 cb3x2 ks32 af vav hi pt sr br bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, 
{(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 16777216}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.01114e+06, 521171, 0, 0, 0, 0, 0.680408, 1.20311, 1.01139, 1.54855, 0.00724061, 0.00724061, 0, 0.952894, 1.46407, 1.15995, 1.81761e-12}}}, -{{'F', "gemm", {"O", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 4x8 cab4 ks64 af vav hi pt sr br bk0 sn nb 4x8 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07581e+06, 764320, 0, 0, 0, 0, 0.804535, 1.46469, 0.96438, 2.27185, 0.0120677, 0.0120677, 0, 1, 1.38109, 0.955498, 2.48341e-12}}}, -{{'F', "gemm", {"O", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}, {-1, 8, -1}, {8, 8, 1}, "IAB"}, "av16 am16x2 aB wg 16x1 af rr vav hi pt sr br sb64 bk0 sys ska rr kv afb", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 32}, {32, 8, 16}, {16, 1, 1}, 1, (WGType) 0, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.03641e+06, 123576, 0, 0, 1.681e+06, 3.63725e+06, 0.433141, 0.621138, 0.453913, 1.2219, 0.0515989, 0.00292722, 0.0485145, 0.979653, 1.17866, -0.454257, 5.43001e-11}}}, -{{'F', "gemm", {"O", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav hi pt sr br bk0 grf256 kv afb sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.44116e+06, -85531, 
-20925.3, 122459, 4.34995e+06, 2.28557e+06, 0.844362, 0.875124, 0.584811, 1.33561, 0.0318181, 0.0286306, 0.00945064, 0.972737, 1.204, 0.897885, 9.01622e-12}}}, -{{'F', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m32@48 aB32x2 aB wg 2x2x8 kr cb4x2 ks32 xaf vav hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {2, 2, 8}, 1, (WGType) 1, 445, 8192, 8192, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.69852e+06, -99048.2, -76947.2, 195116, 2.91635e+06, 1.97427e+06, 0.482236, 0.498349, 0.818811, 1.72604, 0.0234362, 0.0175745, 0.00879097, 0.964253, 1.36369, 0.92013, 8.14061e-12}}}, -{{'F', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20164e+06, 54541, 0, 0, 0, 0, 0.422505, 7.31873, 4.22063, 10.9759, 0.192755, 0.0407408, 0.156727, 0.693649, 1.00294, 0, 0}}}, -{{'F', "gemm", {"O", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {2048, -1, 2048}, {-1, 16, 5799}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IABg"}, "am32+c32@64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 32}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.12642e+06, 163000, 18949.6, 4807.2, 0, 0, 0.620253, 0.594119, 4.7783, 20.2378, 0.099453, 0.164053, 0.0490596, 1, 1.10688, 0.999502, 3.75419e-13}}}, -{{'F', "gemm", {"O", "B", "S"}, {"T", 
"N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 16, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am64+m64@64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 512, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.11497e+06, 130485, 19378.1, 6563.26, 0, 0, 0.453475, 0.601712, 1.83124, 9.66981, 0.0453973, 0.0806318, 0.0229963, 1, 1.18514, 0.619757, 9.22053e-12}}}, -{{'F', "gemm", {"O", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 4x8 cab4 ks64 af vav hi pt sr br bk0 sn nb 4x8 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08073e+06, 763490, 0, 0, 0, 0, 0.80246, 1.46921, 0.963229, 2.26732, 0.0122976, 0.0122976, 0, 0.986616, 1.49449, 0.966336, 3.40072e-12}}}, -{{'F', "gemm", {"O", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 2x16 cab3x2 ks64 af vav hi pt sr br bk0 sn nb 2x16 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 65536, 16777216}, {524288, 65536, 16777216}, {32, 4, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.03544e+06, 807346, 0, 0, 0, 0, 1.28444, 1.03004, 1.50088, 3.00586, 0.0180079, 0.0180079, 0, 0.984218, 1.40315, 0.791237, 5.90572e-12}}}, -{{'F', "gemm", {"O", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}, {-1, 8, -1}, {8, 8, 1}, "IAB"}, "av16 am16x2 aB wg 16x1 af rr vav hi pt sr br sb64 bk0 sys ska rr kv afb", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 
255}, {524288, 131072, 16777216}, {524288, 131072, 32}, {32, 8, 16}, {16, 1, 1}, 1, (WGType) 0, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.03641e+06, 123576, 0, 0, 1.681e+06, 3.63725e+06, 0.433141, 0.621138, 0.453913, 1.2219, 0.0515989, 0.00292722, 0.0485145, 0.979653, 1.17866, -0.454257, 5.43001e-11}}}, -{{'F', "gemm", {"O", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@32 av16 aB wg 8x4 cb3 ks64 af vav hi pt sr br bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 64}, {32, 32, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 49152, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.01629e+06, 551456, 0, 0, 5.42147e+06, 5.37395e+06, 0.789185, 1.6085, 1.03879, 1.72752, 0.00628363, 0.00628363, 0, 0.924224, 1.51259, 1.24909, 2.21382e-12}}}, -{{'F', "gemm", {"O", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32+m16@32 at16+m16@32 aB wg 2x4x4 kr cab3 ks32 xaf st vav hi pt sr br bk0 nb 2x4 grf256 sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 18432, 18432, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08698e+06, 601592, -687.022, 111689, 0, 0, 1.25555, 1.87166, 2.40426, 4.96622, 0.0272706, 0.0272706, 0, 0.946361, 1.27483, 0.962139, 2.75611e-12}}}, -{{'F', "gemm", {"O", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {4, 4, 1}, "ABI"}, "aB16x2+m16@32 at16x2+m16@32 aB wg 1x16 xaf st vav li pt sr br sb64 bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 16}, {1, 16, 1}, 1, (WGType) 1, 441, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, 
{1.03366e+06, 206195, 0, 0, 2.5985e+06, 2.90488e+06, 5.86262, 0.614358, 0.589806, 1.16158, 0.0300187, 0.00214889, 0.0294786, 0.604289, 1.43473, 0.822773, 7.94852e-12}}}, -{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.18986e+06, -136092, -44118.4, 227346, 2.90816e+06, 1.96608e+06, 0.463277, 0.703205, 0.870747, 1.51625, 0.0198219, 0.0158398, 0.00608028, 0.596299, 1.37888, 0.920771, 9.99051e-12}}}, -{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {2048, -1, 2048}, {-1, 16, 5799}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+c32@64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 32}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.12642e+06, 163000, 18949.6, 4807.2, 0, 0, 0.620253, 0.594119, 4.7783, 20.2378, 0.099453, 0.164053, 0.0490596, 1, 1.10688, 0.999502, 3.75419e-13}}}, -{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16x2+m32@48 aB32 aB wg 4x2x4 kr cb4x2 ks64 af vav hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.61469e+06, -158427, -129747, 318436, 2.37568e+06, 1.70394e+06, 0.547022, 0.752344, 0.920728, 1.72909, 0.0199388, 0.0153192, 0.00791891, 
0.829931, 1.39549, 0.959385, 6.05173e-12}}}, -{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.19239e+06, 54658.5, 0, 0, 0, 0, 0.422421, 7.31719, 4.20548, 10.9716, 0.192718, 0.0399192, 0.156028, 0.699508, 1.04701, 0, 0}}}, -{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {1536, -1, 5800}, {-1, 16, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+m64@64 at32+m16@64 aS wg 1x1x16 kr af vav sr sb64 bm0 bk0 sm sn grf256 sys ikr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 64}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {981931, 255034, 13270.4, 8465.73, 0, 0, 0.839094, 0.47734, 12.6109, 43.9669, 0.0831258, 0.0199484, 0.123368, 0.98723, 1.28194, 0.802064, 5.50004e-12}}}, -{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 16, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am64+m64@64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 512, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.11497e+06, 130485, 19378.1, 6563.26, 0, 0, 0.453475, 0.601712, 1.83124, 9.66981, 0.0453973, 0.0806318, 0.0229963, 1, 1.18514, 0.619757, 9.22053e-12}}}, -{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIps"}, "at32+m64@48 am32+m32@48 aB wg 8x4 af rr vav hi pt sr br sb64 bk0 
sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 917504, 16777216}, {524288, 917504, 32}, {32, 56, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {860257, 607544, 0, 0, 5.65002e+06, 8.42957e+06, 0.65045, 0.672365, 0.789643, 1.23363, 0.00404835, 0.00404835, 0, 1, 1.9756, 1.22265, 4.06886e-12}}}, -{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIps"}, "at32+m64@64 am32+m32@64 aB wg 8x4 xaf rr vav hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 32}, {32, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {874546, 579329, 0, 0, 5.41819e+06, 8.2985e+06, 0.514532, 0.714068, 0.749831, 1.19945, 0.00433911, 0.00433911, 0, 1, 1.97939, 1.22442, 3.02858e-12}}}, -{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIp"}, "at16x2+m64@56 am32+m32@56 aB wg 8x4 xaf fx rr vav hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {863702, 681514, 0, 0, 6.07928e+06, 1.03055e+07, 0.723236, 0.751648, 0.771003, 1.22757, 0.00408183, 0.00408183, 0, 0.787241, 1.94989, 1.18778, 5.70022e-12}}}, -{{'F', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIpq"}, "at16x2+m32@48 am32+m32@32 aB wg 8x4 xaf vav hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 32, 16}, {8, 4, 1}, 1, 
(WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {857350, 514605, 0, 0, 4.49741e+06, 5.0217e+06, 0.578609, 1.14407, 0.733928, 1.13844, 0.00599451, 0.00599451, 0, 0.988626, 1.78984, 1.15379, 4.01784e-12}}}, +{{'F', "gemm", {"H", "[OH]", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 8, -1}, {8, 8, 1}, "ABI"}, "av16+B16@48 am16x2 aB wg 8x2 af vav li sr br sb256 bk0 sys ska rr wx4 kv afb pt", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {1048576, 65536, 16777216}, {1048576, 65536, 32}, {64, 4, 16}, {8, 2, 1}, 4, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {879483, 350341, 0, 0, 0, 0, 0.458466, 18.3391, 0.732099, 2.00944, 0.0538243, 0.0538243, 0, 0.0465505, 1.08817, 1.00259, 3.38512e-15}}}, +{{'F', "gemm", {"H", "[OH]", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav hi pt sr br bk0 grf256 kv afb sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.43314e+06, -85436.7, -21480.5, 122461, 4.34176e+06, 2.37568e+06, 0.844572, 0.876066, 0.58704, 1.33341, 0.0317939, 0.0288168, 0.00938048, 0.965671, 1.25852, 0.881513, 1.24288e-11}}}, +{{'F', "gemm", {"H", "[OH]", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 1, 2}, "ABI"}, "at16+m32@64 aB16 aB wg 16x2 cb3x2 ks32 af vav hi pt sr br bk0 sm sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 1048576, 16777216}, {262144, 1048576, 16777216}, {16, 64, 32}, {16, 2, 1}, 1, (WGType) 1, 257, 24576, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.01114e+06, 521171, 0, 0, 0, 0, 0.680408, 1.20311, 1.01139, 1.54855, 0.00724061, 0.00724061, 0, 0.952894, 
1.46407, 1.15995, 1.81761e-12}}}, {{'F', "gemm", {"O", "O", "I"}, {"A4#16,32", "B32", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB32+B32@96 aB32+B32@96 aB vav sys grf256 af hi pt wg 4x8 sb512 bk0 sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {8192, 8192, 16777216}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 256, 0, 0, {128, 128, 4}, {true, true, true}}, {'W', 1, {2048}}}, {{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIps"}, "av32+m128@96 am64+m64@96 aB wg 4x8 xaf st vav hi pt sr br sb128 bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 655360, 16777216}, {1048576, 655360, 64}, {64, 40, 64}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {886295, 679810, 0, 0, 6.14973e+06, 1.05103e+07, 0.387882, 0.34723, 0.844766, 1.28789, 0.00202312, 0.00202312, 0, 0.99971, 1.60161, 1.05949, 2.70909e-12}}}, {{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "av32+m64@128 am64x2+m64@128 aB wg 4x8 xaf vav hi pt sr br sb128 bk0 sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 64}, {64, 32, 64}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {878684, 628439, 0, 0, 5.63446e+06, 8.38042e+06, 0.326785, 0.347891, 0.805503, 1.24882, 0.00204557, 0.00204557, 0, 1, 1.50403, 1.06152, 3.02972e-12}}}, @@ -806,8 +787,6 @@ auto _CATALOG_ = kcatalog::toArray({ {{'F', "gemm", {"O", "O", "I"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#i"}, "aB16 aB16 aB wg 4x8 cab3 ks32 nse hi pt bk0 grf256 kv afb sr", {16, (LoopType) 255, 256, {(LoopType) 208, 
(LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 49152, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.01957e+06, 906366, 0, 0, 4.13696e+06, 8.73267e+06, 1.51183, 1.36399, 0.870951, 1.51109, 0.0167461, 0.0167461, 0, 1, 1.09321, 0.977493, 1.29851e-12}}}, {{'F', "gemm", {"O", "O", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av64 am64x2+m64@128 aB wg 2x16 ca3x2 ks128 af hi pt sr br bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 32768, 16777216}, {524288, 32768, 16777216}, {32, 2, 128}, {2, 16, 1}, 1, (WGType) 1, 257, 24576, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {991020, 415137, 0, 0, 0, 0, 0.494061, 1.1783, 1.09356, 3.42418, 0.0192037, 0.0192037, 0, 0.99432, 1.16294, -0.903393, 1.39731e-11}}}, {{'F', "gemm", {"O", "O", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32+m64@32 aB64+m64@32 aB wg 2x4x4 kr cab4 ks64 af hi pt sr br bk0 sn nb 2x4 dm grf256 kv afb sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 64}, {32, 8, 64}, {2, 4, 4}, 1, (WGType) 1, 445, 24576, 24576, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.88418e+06, -153479, -204520, 376006, 2.17088e+06, 1.88498e+06, 0.367103, 0.802232, -0.0233093, 0.803253, 0.0120542, 0.00674447, 0.00621616, 0.958995, 1.34316, 0.655552, 9.35284e-12}}}, -{{'F', "gemm", {"O", "O", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BIg"}, "aB32 at32+m32@96 aB wg 1x4x8 kr cab4x2 ks32 af hi pt sr br bk0 sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 131072, 16777216}, {1048576, 131072, 32}, {64, 8, 32}, {1, 4, 8}, 1, (WGType) 1, 445, 12288, 12288, {4, 4, 4}, {true, true, 
true}}, {'E', 17, {1.43554e+06, -119170, -46892.7, 189568, 2.77709e+06, 2.1504e+06, 0.402068, 0.981366, 0.646632, 1.0724, 0.0199081, 0.0164365, 0.0044588, 0.668568, 1.20591, 0.616402, 1.76161e-11}}}, -{{'F', "gemm", {"O", "O", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32 aS32 aB wg 1x4x8 kr cab4 ks32 af hi pt bk0 grf256 sys l4 dm sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {1, 4, 8}, 1, (WGType) 1, 261, 8192, 8192, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.0714e+06, 814336, -806.186, 79098.3, 0, 0, 1.57352, 2.83206, 2.69271, 7.17928, 0.0520116, 0.0520116, 0, 0.665122, 1.00198, 1.00049, 9.78834e-15}}}, {{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av32x2+m32@64 av32+m32@64 aB wg 4x8 cb3 ks32 xaf st vav hi pt sr br bk0 nb 0x8 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 24576, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {968994, 716920, 0, 0, 5.78437e+06, 9.22419e+06, 0.534082, 0.742409, 0.894677, 1.49963, 0.00222583, 0.00222583, 0, 0.887893, 1.57089, 1.08446, 1.91309e-12}}}, {{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABIp"}, "av32x2+m128@64 av64+m32@64 aB wg 4x4x2 kr cb3 ks64 xaf vav hi pt sr br bk0 nb 0x4 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 524288, 16777216}, {1048576, 524288, 64}, {64, 32, 64}, {4, 4, 2}, 1, (WGType) 1, 445, 24576, 65536, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.48013e+06, -879245, -212784, 1.19787e+06, 3.27516e+06, 5.10362e+06, 0.353883, 0.703019, 0.911615, 1.47833, 0.00312427, 
0.000484667, 0.00281171, 0.593822, 1.43189, 1.02998, 1.76713e-12}}}, {{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av32+m64@128 av64+m32@128 aB wg 4x8 cab4x2 ks64 af vav hi pt sr br bk0 nb 4x8 grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.04641e+06, 551646, 0, 0, 0, 0, 0.775015, 0.878808, 0.742373, 1.49937, 0.00558315, 0.00558315, 0, 1, 1.29356, 0.970044, 1.0419e-12}}}, @@ -841,6 +820,8 @@ auto _CATALOG_ = kcatalog::toArray({ {{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#i"}, "aB8x2 aS8x2 aB wg 4x8 cab4 ks32 nse hi pt sr br bk0 nb 4x8 grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 65536, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.04685e+06, 855332, 0, 0, 3.9977e+06, 9.216e+06, 0.658789, 0.718726, 0.867169, 1.54002, 0.0168389, 0.0168389, 0, 1, 1.09776, 0.992762, 7.01917e-13}}}, {{'F', "gemm", {"O", "O", "I"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#i"}, "aB8/4 aS8 aB wg 4x8 cab4 ks32 nse hi pt sr br bk0 nb 4x8 grf256 kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 32}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 65536, 0, {1, 1, 4}, {true, true, true}}, {'E', 17, {991806, 971407, 0, 0, 4.04685e+06, 9.01939e+06, 1.72245, 1.50029, 0.860423, 1.51374, 0.0176793, 0.0176793, 0, 1, 1.07978, 0.979007, 6.31428e-13}}}, {{'F', "gemm", {"O", "O", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, 
-1}, {8, 8, 1}, "IAB"}, "av32 at32 aB wg 2x2 sys", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 32}, {2, 2, 1}, 1, (WGType) 0, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'W', 1, {512}}}, +{{'F', "gemm", {"O", "O", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BIg"}, "aB32 at32+m32@96 aB wg 1x4x8 kr cab4x2 ks32 af hi pt sr br bk0 sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 131072, 16777216}, {1048576, 131072, 32}, {64, 8, 32}, {1, 4, 8}, 1, (WGType) 1, 445, 12288, 12288, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.43554e+06, -119170, -46892.7, 189568, 2.77709e+06, 2.1504e+06, 0.402068, 0.981366, 0.646632, 1.0724, 0.0199081, 0.0164365, 0.0044588, 0.668568, 1.20591, 0.616402, 1.76161e-11}}}, +{{'F', "gemm", {"O", "O", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32 aS32 aB wg 1x4x8 kr cab4 ks32 af hi pt bk0 grf256 sys l4 dm sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {1, 4, 8}, 1, (WGType) 1, 261, 8192, 8192, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.0714e+06, 814336, -806.186, 79098.3, 0, 0, 1.57352, 2.83206, 2.69271, 7.17928, 0.0520116, 0.0520116, 0, 0.665122, 1.00198, 1.00049, 9.78834e-15}}}, {{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIps"}, "at32+m128@96 am64+m64@112 aB wg 4x8 xaf st vav hi pt sr br sb128 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 655360, 16777216}, {1048576, 655360, 64}, {64, 40, 64}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {884649, 705009, 0, 0, 6.08092e+06, 1.03629e+07, 0.496687, 
0.364134, 0.840897, 1.28383, 0.00200956, 0.00200956, 0, 1, 2.31371, 1.15477, 1.64496e-12}}}, {{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIp"}, "at32x2+m128@96 am64+m64@128 aB wg 8x4 xaf vav hi pt sr br sb128 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 64}, {32, 64, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {895996, 619115, 0, 0, 5.44113e+06, 8.45414e+06, 0.385983, 0.342777, 0.788529, 1.22228, 0.00199536, 0.00199536, 0, 1, 1.60168, 1.14344, 3.24663e-12}}}, {{'F', "gemm", {"O", "O", "I"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIps"}, "at32x2+m64@64 am64+m128@96 aB wg 8x4 xaf vav hi pt sr br sb128 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 64}, {32, 48, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {873940, 565190, 0, 0, 4.58752e+06, 7.00416e+06, 0.375538, 0.421029, 0.747052, 1.19407, 0.00251587, 0.00251587, 0, 1, 1.69069, 1.13204, 1.62736e-12}}}, @@ -1032,14 +1013,23 @@ auto _CATALOG_ = kcatalog::toArray({ {{'F', "gemm", {"T", "T", "S"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at8x2+m16@24 aB16 aB wg 4x2x4 kr cb4x2 ks32 af vav hi pt sr br bk0 sm sn dm grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.64426e+06, -145120, -138637, 309031, 2.31014e+06, 2.00704e+06, 0.918257, 1.44548, 0.262092, 1.02772, 0.0377766, 0.0386483, 0.015333, 0.989041, 1.28006, -1.30164, 1.0788e-10}}}, {{'F', 
"gemm", {"T", "T", "S"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#AI"}, "at16+m16@24 aB16x2 aB wg 2x2x8 kr cb4x2 ks16 xaf vav hi pt sr br bk0 sm sn dm grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 16}, {2, 2, 8}, 1, (WGType) 1, 445, 8192, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.65746e+06, -77290.3, -69015.6, 169032, 2.75251e+06, 1.85958e+06, 0.958255, 0.997806, 0.100098, 0.957911, 0.0468295, 0.0427724, 0.0158824, 1, 1.31247, 0.973533, 8.29974e-12}}}, {{'F', "gemm", {"T", "T", "S"}, {"T", "N", "N"}}, {5, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#ABI"}, "at8+m8@16 aB32+m16@16 aB wg 2x2x8 kr af vav hi pt sr br sb32 bk0 sm sn dm grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 32}, {2, 2, 8}, 1, (WGType) 1, 261, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.00059e+06, 567011, 1104.46, 49832.2, 0, 0, 1.24825, 1.37716, 2.24139, 6.36497, 0.0644534, 0.0644534, 0, 0.917445, 1.16564, 0.0446476, 1.86857e-11}}}, -{{'F', "gemm", {"[FO]", "O", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "am32 am32 aB wg 2x2 sys", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 32}, {2, 2, 1}, 1, (WGType) 0, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'W', 1, {512}}}, -{{'F', "gemm", {"[FO]", "O", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32+m64@32 aB64+m64@32 aB wg 2x8x2 kr cab3 ks64 af hi pt sr br bk0 sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {2, 8, 2}, 1, (WGType) 
1, 261, 24576, 24576, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.03799e+06, 666762, 2486.11, 229962, 0, 0, 0.52045, 0.670568, 0.707588, 2.31307, 0.00907287, 0.00907287, 0, 1, 1.37325, 0.975844, 1.68113e-12}}}, -{{'F', "gemm", {"[FO]", "O", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av64 am64x2+m64@128 aB wg 2x16 ca3x2 ks128 af hi pt sr br bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 32768, 16777216}, {524288, 32768, 16777216}, {32, 2, 128}, {2, 16, 1}, 1, (WGType) 1, 257, 24576, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {991020, 415137, 0, 0, 0, 0, 0.494061, 1.1783, 1.09356, 3.42418, 0.0192037, 0.0192037, 0, 0.99432, 1.16294, -0.903393, 1.39731e-11}}}, -{{'F', "gemm", {"[FO]", "O", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "am32 at32 aB wg 2x2 sys", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 32}, {2, 2, 1}, 1, (WGType) 0, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'W', 1, {512}}}, -{{'F', "gemm", {"[F0]", "O", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BIg"}, "aB32 at32+m32@96 aB wg 1x4x8 kr cab4x2 ks32 af hi pt sr br bk0 sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 131072, 16777216}, {1048576, 131072, 32}, {64, 8, 32}, {1, 4, 8}, 1, (WGType) 1, 445, 12288, 12288, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.43554e+06, -119170, -46892.7, 189568, 2.77709e+06, 2.1504e+06, 0.402068, 0.981366, 0.646632, 1.0724, 0.0199081, 0.0164365, 0.0044588, 0.668568, 1.20591, 0.616402, 1.76161e-11}}}, -{{'F', "gemm", {"[FO]", "O", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32 aS32 aB wg 1x4x8 kr cab4 ks32 af hi pt bk0 grf256 
sys l4 dm sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {1, 4, 8}, 1, (WGType) 1, 261, 8192, 8192, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.0714e+06, 814336, -806.186, 79098.3, 0, 0, 1.57352, 2.83206, 2.69271, 7.17928, 0.0520116, 0.0520116, 0, 0.665122, 1.00198, 1.00049, 9.78834e-15}}}, -{{'F', "gemm", {"[FO]", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at32x2+m64@96 am64+m32@128 aB wg 4x2x4 kr xaf st hi pt sr br sb128 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.12606e+06, -137008, -24369.3, 228077, 2.14139e+06, 1.80224e+06, 0.232306, 0.352856, 0.366483, 1.01866, 0.0096663, 0.00801051, 0.00318484, 0.842293, 1.37613, 0.921541, 4.97764e-12}}}, -{{'F', "gemm", {"[FO]", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIps"}, "at32+m128@96 am64+m64@112 aB wg 4x8 xaf st hi pt sr br sb128 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 655360, 16777216}, {1048576, 655360, 64}, {16, 40, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {884649, 705009, 0, 0, 6.08092e+06, 1.03629e+07, 0.496687, 0.364134, 0.840897, 1.28383, 0.00200956, 0.00200956, 0, 1, 2.31371, 1.15477, 1.64496e-12}}}, +{{'F', "gemm", {"[OB]", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 4x8 cab4 ks64 af vav hi pt sr br bk0 sn nb 4x8 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, 
{32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.07581e+06, 764320, 0, 0, 0, 0, 0.804535, 1.46469, 0.96438, 2.27185, 0.0120677, 0.0120677, 0, 1, 1.38109, 0.955498, 2.48341e-12}}}, +{{'F', "gemm", {"[OB]", "B", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}, {-1, 8, -1}, {8, 8, 1}, "IAB"}, "av16 am16x2 aB wg 16x1 af rr vav hi pt sr br sb64 bk0 sys ska rr kv afb", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 32}, {32, 8, 16}, {16, 1, 1}, 1, (WGType) 0, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.03641e+06, 123576, 0, 0, 1.681e+06, 3.63725e+06, 0.433141, 0.621138, 0.453913, 1.2219, 0.0515989, 0.00292722, 0.0485145, 0.979653, 1.17866, -0.454257, 5.43001e-11}}}, +{{'F', "gemm", {"[OB]", "B", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16 at16 aB wg 1x2x16 kr cab3x2 ks16 af vav hi pt sr br bk0 grf256 kv afb sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 16}, {1, 2, 16}, 1, (WGType) 1, 445, 6144, 6144, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.44116e+06, -85531, -20925.3, 122459, 4.34995e+06, 2.28557e+06, 0.844362, 0.875124, 0.584811, 1.33561, 0.0318181, 0.0286306, 0.00945064, 0.972737, 1.204, 0.897885, 9.01622e-12}}}, +{{'F', "gemm", {"[OB]", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at32+m32@48 aB32x2 aB wg 2x2x8 kr cb4x2 ks32 xaf vav hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {2, 2, 8}, 1, (WGType) 1, 445, 8192, 8192, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.69852e+06, -99048.2, -76947.2, 195116, 2.91635e+06, 1.97427e+06, 
0.482236, 0.498349, 0.818811, 1.72604, 0.0234362, 0.0175745, 0.00879097, 0.964253, 1.36369, 0.92013, 8.14061e-12}}}, +{{'F', "gemm", {"[OB]", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.20164e+06, 54541, 0, 0, 0, 0, 0.422505, 7.31873, 4.22063, 10.9759, 0.192755, 0.0407408, 0.156727, 0.693649, 1.00294, 0, 0}}}, +{{'F', "gemm", {"[OB]", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {2048, -1, 2048}, {-1, 16, 5799}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IABg"}, "am32+c32@64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 32}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.12642e+06, 163000, 18949.6, 4807.2, 0, 0, 0.620253, 0.594119, 4.7783, 20.2378, 0.099453, 0.164053, 0.0490596, 1, 1.10688, 0.999502, 3.75419e-13}}}, +{{'F', "gemm", {"[OB]", "B", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 16, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am64+m64@64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 512, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.11497e+06, 130485, 19378.1, 6563.26, 0, 0, 0.453475, 0.601712, 1.83124, 9.66981, 0.0453973, 0.0806318, 0.0229963, 1, 1.18514, 0.619757, 9.22053e-12}}}, +{{'F', "gemm", {"[OB]", "[SB]", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, 
"aB8x2+B8@16 aS8+S8@24 aB wg 1x4 kc8 nse hi pt sr sb256 bk0 sn", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {16, 8, 8}, {1, 4, 1}, 1, (WGType) 1, 257, 0, 0, {1, 4, 4}, {true, true, true}}, {'E', 17, {1.16538e+06, 40635.2, 0, 0, 0, 0, 1.30731, 1.53858, 0.584971, 1.42067, 0.0634061, 0.0581975, 0.0161667, 1, 1.44276, 1.00478, 2.34818e-11}}}, +{{'F', "gemm", {"[OB]", "[SB]", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+B16@8 aS8x2+S32@8 aB wg 2x8x2 kr kc8 nse hi pt sr kv sb256 bk0 sn grf256 afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 262144, 16777216}, {8192, 8192, 0}, {32, 16, 8}, {2, 8, 2}, 1, (WGType) 1, 413, 0, 65536, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.12483e+06, -574330, -20527.6, 853244, 4.03046e+06, 0, 2.33007, 1.51079, 0.80632, 1.51643, 0.0625462, -1.54388e-05, 0.0629715, 0.425906, 1.06028, 0.229788, 1.06051e-11}}}, +{{'F', "gemm", {"[OB]", "[SB]", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 4, -1}, {1, 1, 1}, ""}, "am8+B8@8 at8x2 aB wg 4x1x16 kr kc8 nse li nmk pt sr bk0 sn kv afb sb32 l2d", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {524288, 65536, 16777216}, {8192, 8192, 0}, {16, 4, 8}, {4, 1, 16}, 1, (WGType) 1, 413, 0, 2048, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.21115e+06, -59718.2, -11600.6, 104303, 3.06381e+06, 0, 0.860506, 9.14342, 0.769527, 1.14843, 0.0733058, 0.0350639, 0.04512, 0.901895, 1.307, 0.986093, 2.06541e-11}}}, +{{'F', "gemm", {"[OB]", "[SB]", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+m8@8 am8x2+m16@16 aB wg 2x2x8 kr kc8 nse hi pt sr sb64 bk0 grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 
8}, {2, 2, 8}, 1, (WGType) 1, 413, 0, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.27688e+06, -130710, -19650.6, 200055, 4.42368e+06, 0, 1.142, 1.25437, 0.640009, 1.36547, 0.0627194, 0.0629494, 0.0147313, 0.989971, 1.25231, -0.792803, 1.91812e-10}}}, +{{'F', "gemm", {"[OB]", "[SB]", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, ""}, "am8x2+m32@8 aB8x2+m8@8 aB wg 1x4x16 kr kc8 nse li pt sr sb256 bk0 kv afb l2d", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 8}, {1, 4, 16}, 1, (WGType) 1, 413, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.13268e+06, -103657, 246.583, 142575, 3.21126e+06, 0, 1.60336, 0.933443, 0.504957, 0.918636, 0.0712742, 0.0670609, 0.015172, 0.992002, 1.14631, 0.0718492, 1.8422e-11}}}, +{{'F', "gemm", {"[OB]", "[SB]", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+m8@8 am8x2+m16@16 aB wg 2x2x8 kr kc8 nse hi pt sr sb64 bk0 grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 8}, {2, 2, 8}, 1, (WGType) 1, 413, 0, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.27688e+06, -130710, -19650.6, 200055, 4.42368e+06, 0, 1.142, 1.25437, 0.640009, 1.36547, 0.0627194, 0.0629494, 0.0147313, 0.989971, 1.25231, -0.792803, 1.91812e-10}}}, +{{'F', "gemm", {"[OB]", "[SB]", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {1, 1, 1}, ""}, "am8x2+m32@8 aB8x2+m8@8 aB wg 1x4x16 kr kc8 nse li pt sr sb256 bk0 kv afb l2d", {16, (LoopType) 255, 128, {(LoopType) 224, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 8}, {1, 4, 16}, 1, (WGType) 1, 413, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.13268e+06, -103657, 246.583, 142575, 3.21126e+06, 0, 1.60336, 0.933443, 
0.504957, 0.918636, 0.0712742, 0.0670609, 0.015172, 0.992002, 1.14631, 0.0718492, 1.8422e-11}}}, +{{'F', "gemm", {"[OB]", "[SB]", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 4, -1}, {1, 1, 1}, ""}, "at16x2+m16@32 at16+m32@32 aB wg 16x1x2 kr kc16 nse nmk li pt sr sb256 bk0 sm grf256 kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {262144, 65536, 16777216}, {262144, 65536, 32}, {16, 4, 16}, {16, 1, 2}, 1, (WGType) 1, 413, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.1734e+06, -264907, -109870, 485064, 2.21266e+06, 0, 0.856653, 15.807, 1.98085, 3.89882, 0.125049, 0.0139237, 0.143865, 1, 1.34573, 0.978713, 4.7619e-12}}}, +{{'F', "gemm", {"[OB]", "[SB]", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 16, -1}, {1, 1, 1}, ""}, "at8x2+m16@24 at8x2+m32@8 aB wg 16x1x4 kr kc8 nse nmk li pt sr sb256 bk0 sm sn kv afb l2d", {16, (LoopType) 255, 128, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 8}, {16, 1, 4}, 1, (WGType) 1, 413, 0, 16384, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.18993e+06, -230103, -26635.9, 388995, 2.2528e+06, 0, 0.900793, 5.78162, 0.552809, 1.28255, 0.0627307, 0.0602325, 0.0232779, 1, 1.21284, 0.921396, 2.8065e-12}}}, +{{'F', "gemm", {"[OB]", "[SB]", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "xy"}, "sB4 sB4 aB wg 4x8 kc4 cab4 ks8 nse bo sr bk0 sm sn l4", {8, (LoopType) 0, 128, {(LoopType) 128, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {524288, 262144, 16777216}, {32, 16, 8}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {1, 2, 4}, {false, false, true}}, {'W', 1, {512}}}, {{'F', "gemm", {"[SB]", "[SB]", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB16 aB16 aB wg 4x8 cab4x2 ks16 af vav hi pt bk0 sn grf256 sys sr br", {16, (LoopType) 255, 256, 
{(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {8192, 8192, 16777216}, {64, 32, 16}, {4, 8, 1}, 1, (WGType) 1, 257, 65536, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.11006e+06, 935686, 0, 0, 0, 0, 1.58314, 3.00527, 1.01282, 1.59913, 0.00625344, 0.00625344, 0, 1, 1.56406, 1.11642, 3.07212e-12}}}, {{'F', "gemm", {"[SB]", "[SB]", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "Ip"}, "aB16x2 aB16x2 aB wg 4x8 cab4 ks32 af vav hi pt bk0 sn grf256 sys dm sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 262144, 16777216}, {8192, 8192, 16777216}, {64, 16, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.07316e+06, 789496, 0, 0, 0, 0, 1.617, 1.63049, 0.937992, 1.68308, 0.0104723, 0.0104723, 0, 0.85096, 1.32269, 1.03329, 2.07642e-12}}}, {{'F', "gemm", {"[SB]", "[SB]", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aB64 aB64 aB wg 4x8 cab4 ks64 af vav hi pt bk0 sn grf256 sys dm sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 262144, 16777216}, {8192, 8192, 16777216}, {32, 16, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 131072, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.07471e+06, 779135, 0, 0, 0, 0, 1.55845, 1.44307, 0.897078, 1.82423, 0.0150498, 0.0150498, 0, 0.56959, 1.2821, 0.858742, 5.47604e-12}}}, @@ -1066,6 +1056,34 @@ auto _CATALOG_ = kcatalog::toArray({ {{'F', "gemm", {"[SB]", "[SB]", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16+S1,16@32 aB32x2 aB wg 1x4x8 kr cb4 ks32 af vav hi pt bk0 sm sn grf256 sys dm sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {8192, 8192, 16777216}, {32, 8, 32}, {1, 4, 8}, 1, (WGType) 1, 261, 65536, 16384, {4, 4, 4}, {true, true, true}}, {'E', 
17, {1.0672e+06, 762832, 500.274, 77177.4, 0, 0, 1.36188, 1.19635, 3.53273, 8.13743, 0.0615908, 0.0615908, 0, 0.79968, 1.28461, 0.929214, 4.37091e-12}}}, {{'F', "gemm", {"[SB]", "[SB]", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16+S32@48 aB32/16x2 aB wg 8x4 cb4x2 ks32 af vav hi pt bk0 sm sn grf256 sys dm sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {8192, 8192, 16777216}, {16, 16, 32}, {8, 4, 1}, 1, (WGType) 1, 257, 16384, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {964729, 752109, 0, 0, 0, 0, 1.46062, 1.46554, 1.01724, 2.17327, 0.0239322, 0.0239322, 0, 1, 1.25665, 0.945813, 3.00962e-12}}}, {{'F', "gemm", {"[SB]", "[SB]", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS16+S64@64 aB16x2 aB wg 4x8 cb4 ks64 af vav hi pt bk0 sm sn grf256 sys dm sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {262144, 131072, 16777216}, {8192, 8192, 16777216}, {16, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.00341e+06, 675337, 0, 0, 0, 0, 2.06693, 1.45219, 1.47193, 3.06315, 0.0374548, 0.0374548, 0, 1, 1.33339, 0.377183, 1.26633e-11}}}, +{{'F', "gemm", {"[OH]", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 4x8 cab4 ks64 af vav hi pt sr br bk0 sn nb 4x8 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {4, 8, 1}, 1, (WGType) 1, 257, 98304, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08073e+06, 763490, 0, 0, 0, 0, 0.80246, 1.46921, 0.963229, 2.26732, 0.0122976, 0.0122976, 0, 0.986616, 1.49449, 0.966336, 3.40072e-12}}}, +{{'F', "gemm", {"[OH]", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, 
{-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 2x16 cab3x2 ks64 af vav hi pt sr br bk0 sn nb 2x16 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 65536, 16777216}, {524288, 65536, 16777216}, {32, 4, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.03544e+06, 807346, 0, 0, 0, 0, 1.28444, 1.03004, 1.50088, 3.00586, 0.0180079, 0.0180079, 0, 0.984218, 1.40315, 0.791237, 5.90572e-12}}}, +{{'F', "gemm", {"[OH]", "H", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}, {-1, 8, -1}, {8, 8, 1}, "IAB"}, "av16 am16x2 aB wg 16x1 af rr vav hi pt sr br sb64 bk0 sys ska rr kv afb", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 32}, {32, 8, 16}, {16, 1, 1}, 1, (WGType) 0, 441, 0, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.03641e+06, 123576, 0, 0, 1.681e+06, 3.63725e+06, 0.433141, 0.621138, 0.453913, 1.2219, 0.0515989, 0.00292722, 0.0485145, 0.979653, 1.17866, -0.454257, 5.43001e-11}}}, +{{'F', "gemm", {"[OH]", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "ABI"}, "av16x2+m32@32 av16 aB wg 8x4 cb3 ks64 af vav hi pt sr br bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 64}, {32, 32, 64}, {8, 4, 1}, 1, (WGType) 1, 441, 49152, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {1.01629e+06, 551456, 0, 0, 5.42147e+06, 5.37395e+06, 0.789185, 1.6085, 1.03879, 1.72752, 0.00628363, 0.00628363, 0, 0.924224, 1.51259, 1.24909, 2.21382e-12}}}, +{{'F', "gemm", {"[OH]", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32+m16@32 at16+m16@32 aB wg 2x4x4 kr cab3 ks32 xaf st vav hi pt sr br bk0 nb 2x4 grf256 sys sn l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, 
(LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {2, 4, 4}, 1, (WGType) 1, 261, 18432, 18432, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.08698e+06, 601592, -687.022, 111689, 0, 0, 1.25555, 1.87166, 2.40426, 4.96622, 0.0272706, 0.0272706, 0, 0.946361, 1.27483, 0.962139, 2.75611e-12}}}, +{{'F', "gemm", {"[OH]", "H", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, -1, -1}, {4, 4, 1}, "ABI"}, "aB16x2+m16@32 at16x2+m16@32 aB wg 1x16 xaf st vav li pt sr br sb64 bk0 grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 224, (LoopType) 255, (LoopType) 255}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 16}, {1, 16, 1}, 1, (WGType) 1, 441, 0, 0, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.03366e+06, 206195, 0, 0, 2.5985e+06, 2.90488e+06, 5.86262, 0.614358, 0.589806, 1.16158, 0.0300187, 0.00214889, 0.0294786, 0.604289, 1.43473, 0.822773, 7.94852e-12}}}, +{{'F', "gemm", {"[OH]", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABI"}, "at16x2+m32@48 am32+m16@64 aB wg 4x2x4 kr xaf st vav hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 32}, {16, 16, 32}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.18986e+06, -136092, -44118.4, 227346, 2.90816e+06, 1.96608e+06, 0.463277, 0.703205, 0.870747, 1.51625, 0.0198219, 0.0158398, 0.00608028, 0.596299, 1.37888, 0.920771, 9.99051e-12}}}, +{{'F', "gemm", {"[OH]", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {2048, -1, 2048}, {-1, 16, 5799}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+c32@64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 32}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 
4}, {true, true, true}}, {'E', 17, {1.12642e+06, 163000, 18949.6, 4807.2, 0, 0, 0.620253, 0.594119, 4.7783, 20.2378, 0.099453, 0.164053, 0.0490596, 1, 1.10688, 0.999502, 3.75419e-13}}}, +{{'F', "gemm", {"[OH]", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "at16x2+m32@48 aB32 aB wg 4x2x4 kr cb4x2 ks64 af vav hi pt sr br bk0 sm sn dm grf256 kv afb sys l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 16384, 16384, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.61469e+06, -158427, -129747, 318436, 2.37568e+06, 1.70394e+06, 0.547022, 0.752344, 0.920728, 1.72909, 0.0199388, 0.0153192, 0.00791891, 0.829931, 1.39549, 0.959385, 6.05173e-12}}}, +{{'F', "gemm", {"[OH]", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "I"}, "aS32+S32@48 aB32+S16@48 aB wg 4x1 af vav nmk li pt br sr sb256 bk0 sm grf256 sys l4 kd", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 255}, {262144, 16384, 16777216}, {262144, 16384, 16777216}, {16, 1, 32}, {4, 1, 1}, 1, (WGType) 1, 257, 0, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.19239e+06, 54658.5, 0, 0, 0, 0, 0.422421, 7.31719, 4.20548, 10.9716, 0.192718, 0.0399192, 0.156028, 0.699508, 1.04701, 0, 0}}}, +{{'F', "gemm", {"[OH]", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {1536, -1, 5800}, {-1, 16, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am32+m64@64 at32+m16@64 aS wg 1x1x16 kr af vav sr sb64 bm0 bk0 sm sn grf256 sys ikr", {16, (LoopType) 255, 256, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {4, 16, 64}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 256, {8, 4, 4}, {true, true, true}}, {'E', 17, {981931, 255034, 13270.4, 8465.73, 0, 0, 0.839094, 0.47734, 12.6109, 43.9669, 0.0831258, 0.0199484, 0.123368, 0.98723, 1.28194, 
0.802064, 5.50004e-12}}}, +{{'F', "gemm", {"[OH]", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 16, -1}, {-1, -1, -1}, {-1, 16, -1}, {8, 4, 1}, "IAB"}, "am64+m64@64 at32 aS wg 1x1x16 ikr af vav sr sb256 bk0 bm0 sys rr", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {16777216, 8192, 16777216}, {8192, 8192, 16777216}, {8, 16, 64}, {1, 1, 16}, 1, (WGType) 1, 4357, 0, 512, {8, 4, 4}, {true, true, true}}, {'E', 17, {1.11497e+06, 130485, 19378.1, 6563.26, 0, 0, 0.453475, 0.601712, 1.83124, 9.66981, 0.0453973, 0.0806318, 0.0229963, 1, 1.18514, 0.619757, 9.22053e-12}}}, +{{'F', "gemm", {"[OH]", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIps"}, "at32+m64@48 am32+m32@48 aB wg 8x4 af rr vav hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 917504, 16777216}, {524288, 917504, 32}, {32, 56, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {860257, 607544, 0, 0, 5.65002e+06, 8.42957e+06, 0.65045, 0.672365, 0.789643, 1.23363, 0.00404835, 0.00404835, 0, 1, 1.9756, 1.22265, 4.06886e-12}}}, +{{'F', "gemm", {"[OH]", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIps"}, "at32+m64@64 am32+m32@64 aB wg 8x4 xaf rr vav hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 786432, 16777216}, {524288, 786432, 32}, {32, 48, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {874546, 579329, 0, 0, 5.41819e+06, 8.2985e+06, 0.514532, 0.714068, 0.749831, 1.19945, 0.00433911, 0.00433911, 0, 1, 1.97939, 1.22442, 3.02858e-12}}}, +{{'F', "gemm", {"[OH]", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIp"}, "at16x2+m64@56 am32+m32@56 aB wg 8x4 xaf fx rr vav 
hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 64, 32}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {863702, 681514, 0, 0, 6.07928e+06, 1.03055e+07, 0.723236, 0.751648, 0.771003, 1.22757, 0.00408183, 0.00408183, 0, 0.787241, 1.94989, 1.18778, 5.70022e-12}}}, +{{'F', "gemm", {"[OH]", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIpq"}, "at16x2+m32@48 am32+m32@32 aB wg 8x4 xaf vav hi pt sr br sb64 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 1048576, 16777216}, {524288, 1048576, 32}, {32, 32, 16}, {8, 4, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {857350, 514605, 0, 0, 4.49741e+06, 5.0217e+06, 0.578609, 1.14407, 0.733928, 1.13844, 0.00599451, 0.00599451, 0, 0.988626, 1.78984, 1.15379, 4.01784e-12}}}, +{{'F', "gemm", {"[OH]", "[OH]", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB16x2 aB16x2 aB wg 2x16 cab3x2 ks64 af vav hi pt sr br bk0 sn nb 2x16 dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 65536, 16777216}, {524288, 65536, 16777216}, {32, 4, 64}, {2, 16, 1}, 1, (WGType) 1, 257, 49152, 0, {2, 2, 4}, {true, true, true}}, {'E', 17, {1.03544e+06, 807346, 0, 0, 0, 0, 1.28444, 1.03004, 1.50088, 3.00586, 0.0180079, 0.0180079, 0, 0.984218, 1.40315, 0.791237, 5.90572e-12}}}, +{{'F', "gemm", {"[OH]", "[SH]", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+B8@16 aS8+S8@24 aB wg 1x4 kc8 nse hi pt sr sb256 bk0 sn", {16, (LoopType) 255, 128, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {16, 8, 8}, {1, 4, 1}, 1, 
(WGType) 1, 257, 0, 0, {1, 4, 4}, {true, true, true}}, {'E', 17, {1.16538e+06, 40635.2, 0, 0, 0, 0, 1.30731, 1.53858, 0.584971, 1.42067, 0.0634061, 0.0581975, 0.0161667, 1, 1.44276, 1.00478, 2.34818e-11}}}, +{{'F', "gemm", {"[OH]", "[SH]", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, ""}, "aB8x2+m8@8 am8x2+m16@16 aB wg 2x2x8 kr kc8 nse hi pt sr sb64 bk0 grf256 kv afb l2d", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 262144, 16777216}, {524288, 262144, 32}, {32, 16, 8}, {2, 2, 8}, 1, (WGType) 1, 413, 0, 8192, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.27688e+06, -130710, -19650.6, 200055, 4.42368e+06, 0, 1.142, 1.25437, 0.640009, 1.36547, 0.0627194, 0.0629494, 0.0147313, 0.989971, 1.25231, -0.792803, 1.91812e-10}}}, +{{'F', "gemm", {"[OH]", "[SH]", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 4, -1}, {1, 1, 1}, ""}, "at16x2+m16@32 at16+m32@32 aB wg 16x1x2 kr kc16 nse nmk li pt sr sb256 bk0 sm grf256 kv afb l4 l2d", {16, (LoopType) 255, 256, {(LoopType) 225, (LoopType) 255, (LoopType) 2}, {262144, 65536, 16777216}, {262144, 65536, 32}, {16, 4, 16}, {16, 1, 2}, 1, (WGType) 1, 413, 0, 4096, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.1734e+06, -264907, -109870, 485064, 2.21266e+06, 0, 0.856653, 15.807, 1.98085, 3.89882, 0.125049, 0.0139237, 0.143865, 1, 1.34573, 0.978713, 4.7619e-12}}}, +{{'F', "gemm", {"[FO]", "O", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "am32 am32 aB wg 2x2 sys", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 32}, {2, 2, 1}, 1, (WGType) 0, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'W', 1, {512}}}, +{{'F', "gemm", {"[FO]", "O", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32+m64@32 aB64+m64@32 aB wg 2x8x2 kr cab3 
ks64 af hi pt sr br bk0 sn dm grf256 sys l4", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 64}, {2, 8, 2}, 1, (WGType) 1, 261, 24576, 24576, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.03799e+06, 666762, 2486.11, 229962, 0, 0, 0.52045, 0.670568, 0.707588, 2.31307, 0.00907287, 0.00907287, 0, 1, 1.37325, 0.975844, 1.68113e-12}}}, +{{'F', "gemm", {"[FO]", "O", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "av64 am64x2+m64@128 aB wg 2x16 ca3x2 ks128 af hi pt sr br bk0 sn grf256 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 32768, 16777216}, {524288, 32768, 16777216}, {32, 2, 128}, {2, 16, 1}, 1, (WGType) 1, 257, 24576, 0, {8, 8, 4}, {true, true, true}}, {'E', 17, {991020, 415137, 0, 0, 0, 0, 0.494061, 1.1783, 1.09356, 3.42418, 0.0192037, 0.0192037, 0, 0.99432, 1.16294, -0.903393, 1.39731e-11}}}, +{{'F', "gemm", {"[FO]", "O", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {8, 8, 1}, "IAB"}, "am32 at32 aB wg 2x2 sys", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 255}, {8192, 8192, 2048}, {8192, 8192, 2048}, {32, 16, 32}, {2, 2, 1}, 1, (WGType) 0, 257, 0, 0, {8, 8, 4}, {true, true, true}}, {'W', 1, {512}}}, +{{'F', "gemm", {"[FO]", "O", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 4, 1}, "#BIg"}, "aB32 at32+m32@96 aB wg 1x4x8 kr cab4x2 ks32 af hi pt sr br bk0 sn grf256 kv afb sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {1048576, 131072, 16777216}, {1048576, 131072, 32}, {64, 8, 32}, {1, 4, 8}, 1, (WGType) 1, 445, 12288, 12288, {4, 4, 4}, {true, true, true}}, {'E', 17, {1.43554e+06, -119170, -46892.7, 189568, 2.77709e+06, 2.1504e+06, 0.402068, 0.981366, 0.646632, 1.0724, 0.0199081, 0.0164365, 0.0044588, 0.668568, 1.20591, 0.616402,
1.76161e-11}}}, +{{'F', "gemm", {"[FO]", "O", "S"}, {"N", "T", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {1, 1, 1}, "#I"}, "aB32 aS32 aB wg 1x4x8 kr cab4 ks32 af hi pt bk0 grf256 sys l4 dm sr br", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 131072, 16777216}, {524288, 131072, 16777216}, {32, 8, 32}, {1, 4, 8}, 1, (WGType) 1, 261, 8192, 8192, {1, 1, 4}, {true, true, true}}, {'E', 17, {1.0714e+06, 814336, -806.186, 79098.3, 0, 0, 1.57352, 2.83206, 2.69271, 7.17928, 0.0520116, 0.0520116, 0, 0.665122, 1.00198, 1.00049, 9.78834e-15}}}, +{{'F', "gemm", {"[FO]", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "IAB"}, "at32x2+m64@96 am64+m32@128 aB wg 4x2x4 kr xaf st hi pt sr br sb128 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {262144, 262144, 16777216}, {262144, 262144, 64}, {16, 16, 64}, {4, 2, 4}, 1, (WGType) 1, 445, 0, 8192, {4, 8, 4}, {true, true, true}}, {'E', 17, {1.12606e+06, -137008, -24369.3, 228077, 2.14139e+06, 1.80224e+06, 0.232306, 0.352856, 0.366483, 1.01866, 0.0096663, 0.00801051, 0.00318484, 0.842293, 1.37613, 0.921541, 4.97764e-12}}}, +{{'F', "gemm", {"[FO]", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {4, 8, 1}, "ABIps"}, "at32+m128@96 am64+m64@112 aB wg 4x8 xaf st hi pt sr br sb128 bk0 sm sn grf256 sys kv afb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 655360, 16777216}, {1048576, 655360, 64}, {16, 40, 32}, {4, 8, 1}, 1, (WGType) 1, 441, 0, 0, {4, 8, 4}, {true, true, true}}, {'E', 17, {884649, 705009, 0, 0, 6.08092e+06, 1.03629e+07, 0.496687, 0.364134, 0.840897, 1.28383, 0.00200956, 0.00200956, 0, 1, 2.31371, 1.15477, 1.64496e-12}}}, {{'G', "gemm", {"F", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, 8, -1}, {-1, -1, -1}, {-1, 8, -1}, {16, 16, 1}, "IAB"}, "at32 am128 
aB wg 2x1x8 ikr xaf st vav hi pt sr br sb128 bk0 bm0 nmk sys", {16, (LoopType) 255, 128, {(LoopType) 209, (LoopType) 255, (LoopType) 2}, {16777216, 131072, 16777216}, {262144, 131072, 16777216}, {16, 8, 128}, {2, 1, 8}, 1, (WGType) 0, 4357, 0, 1024, {16, 16, 4}, {true, true, true}}, {'W', 1, {128}}}, {{'G', "gemm", {"F", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, 9, -1}, {-1, 16, -1}, {-1, 9, -1}, {-1, 16, -1}, {16, 16, 1}, "IAB"}, "at32+m128@80 am32+m128@80 aB wg 8x1x4 ikr wx2 xaf vav hi pt sr br sb128 bk0 sm sn bm0 nmk sys", {16, (LoopType) 255, 128, {(LoopType) 209, (LoopType) 255, (LoopType) 2}, {16777216, 262144, 16777216}, {262144, 262144, 16777216}, {16, 16, 128}, {8, 1, 4}, 2, (WGType) 1, 4357, 0, 8192, {16, 16, 4}, {true, true, true}}, {'W', 1, {256}}}, {{'G', "gemm", {"F", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, 17, -1}, {-1, 24, -1}, {-1, 17, -1}, {-1, 24, -1}, {16, 16, 1}, "ABI"}, "at64x2 am32x2+m64@32 aB wg 4x2 af rr vav hi pt sr br sb64 bk0 grf256 sys np np", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {524288, 524288, 16777216}, {524288, 524288, 16777216}, {32, 32, 64}, {4, 2, 1}, 1, (WGType) 1, 257, 0, 0, {16, 16, 4}, {true, true, true}}, {'W', 1, {1024}}}, @@ -1076,8 +1094,8 @@ auto _CATALOG_ = kcatalog::toArray({ {{'G', "gemm", {"F", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, 16, 1}, "ABI"}, "at64+m128@32 am128+m64@32 aB wg 2x8 xaf st hi pt sr br sb128 sn grf256 cr0 sys", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 262144, 16777216}, {1048576, 262144, 16777216}, {64, 16, 128}, {2, 8, 1}, 1, (WGType) 1, 257, 0, 0, {16, 16, 4}, {true, true, true}}, {'E', 17, {525002, 18498.2, 0, 0, 0, 0, 0.485217, 0.854072, 1.96694, 5.31108, 0.00356541, 0.00161949, 0.00452935, 0.938224, 1.01441, 1.01414, -4.57758e-14}}}, {{'G', "gemm", {"F", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, 1, -1}, {-1, 1, 8191}, {-1, 1, -1}, {-1, 1, 8191}, 
{16, 16, 1}, "ABI"}, "at128+m64@48 am128+m32@48 aB wg 2x1x4 ikr af hi pt sr br sb128 grf256 sys bk0 acb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 16384, 16777216}, {524288, 16384, 16777216}, {32, 1, 128}, {2, 1, 4}, 1, (WGType) 1, 4357, 0, 256, {16, 16, 4}, {true, true, true}}, {'W', 1, {32}}}, {{'G', "gemm", {"F", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, 1, 8192}, {-1, 1, -1}, {-1, 1, 8192}, {-1, 1, -1}, {16, 16, 1}, "ABI"}, "at64+m128@64 am128+m128@64 aB wg 1x1x8 ikr af rr hi pt sr br sb128 sn grf256 sys bk0 acb", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 2}, {524288, 16384, 16777216}, {524288, 16384, 16777216}, {32, 1, 128}, {1, 1, 8}, 1, (WGType) 1, 4357, 0, 128, {16, 16, 4}, {true, true, true}}, {'W', 1, {32}}}, -{{'G', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 8, -1}, {16, 16, 1}, "IAB"}, "at32x2+m64@16 am64x2 aB wg 4x1x4 ikr af vav sr br sb64 bm0 bk0 sys nmk np", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {8192, 8192, 16777216}, {8192, 8192, 16777216}, {16, 8, 64}, {4, 1, 4}, 1, (WGType) 1, 261, 0, 2048, {16, 16, 4}, {true, true, true}}, {'W', 1, {128}}}, -{{'G', "gemm", {"O", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, 9, -1}, {-1, 64, -1}, {16, 16, 1}, "IAB"}, "at32x2+m64@16 am64x2 aB wg 4x1x4 ikr af vav sr br sb64 bm0 bk0 sys nmk grf256 np", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {8192, 8192, 16777216}, {8192, 8192, 16777216}, {16, 16, 64}, {4, 1, 4}, 1, (WGType) 1, 261, 0, 2048, {16, 16, 4}, {true, true, true}}, {'W', 1, {128}}}, +{{'G', "gemm", {"[OH]", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 8, -1}, {16, 16, 1}, "IAB"}, "at32x2+m64@16 am64x2 aB wg 4x1x4 ikr af vav sr br sb64 bm0 bk0 sys nmk np", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {8192, 8192, 16777216}, {8192, 
8192, 16777216}, {16, 8, 64}, {4, 1, 4}, 1, (WGType) 1, 261, 0, 2048, {16, 16, 4}, {true, true, true}}, {'W', 1, {128}}}, +{{'G', "gemm", {"[OH]", "H", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, 9, -1}, {-1, 64, -1}, {16, 16, 1}, "IAB"}, "at32x2+m64@16 am64x2 aB wg 4x1x4 ikr af vav sr br sb64 bm0 bk0 sys nmk grf256 np", {16, (LoopType) 255, 128, {(LoopType) 0, (LoopType) 1, (LoopType) 2}, {8192, 8192, 16777216}, {8192, 8192, 16777216}, {16, 16, 64}, {4, 1, 4}, 1, (WGType) 1, 261, 0, 2048, {16, 16, 4}, {true, true, true}}, {'W', 1, {128}}}, {{'G', "gemm", {"[FO]", "O", "S"}, {"N", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {16, 16, 1}, "ABIs"}, "am32+m32@112 am32x2+m32@112 aB wg 4x8 ca4 ks32 af st rr vav hi pt sr br bk0 sn nb 4x0 grf256 sys acb cr16", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 458752, 16777216}, {1048576, 458752, 16777216}, {64, 28, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 32768, 0, {16, 16, 4}, {true, true, true}}, {'E', 17, {855191, 60448.1, 0, 0, 0, 0, 0.945598, 1.89384, 4.01425, 7.25708, 0.00522328, 0.00522328, 0, 1, 1.04077, 1.00307, 1.26521e-13}}}, {{'G', "gemm", {"[FO]", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, 512, -1}, {-1, -1, -1}, {16, 16, 1}, "ABI"}, "at32+m32@64 am32+m64@64 aB wg 4x8 ca3 ks32 xaf st vav hi pt sr br bk0 sm sn nb 4x0 grf256 sys acb cr16", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 16777216}, {64, 32, 32}, {4, 8, 1}, 1, (WGType) 1, 257, 24576, 0, {16, 16, 4}, {true, true, true}}, {'E', 17, {969021, 62613.4, 0, 0, 0, 0, 1.08371, 1.91773, 3.98757, 8.74675, 0.00540242, 0.00540242, 0, 1, 1.00953, 1.00923, -1.52423e-14}}}, {{'G', "gemm", {"[FO]", "O", "S"}, {"T", "N", "N"}}, {-1, -1, {-1, -1, -1}, {-1, -1, -1}, {-1, -1, -1}, {-1, 511, -1}, {16, 16, 1}, "ABI"}, "at32+m32@32 am32+m64@32 aB wg 4x4 ca3 ks32 xaf st vav hi pt sr br 
bk0 sm sn nb 4x0 grf256 sys acb cr0 ", {16, (LoopType) 255, 256, {(LoopType) 208, (LoopType) 255, (LoopType) 255}, {1048576, 524288, 16777216}, {1048576, 524288, 16777216}, {32, 32, 128}, {4, 8, 1}, 1, (WGType) 1, 257, 24576, 0, {16, 16, 4}, {true, true, true}}, {'E', 17, {758684, 49729.1, 0, 0, 0, 0, 0.934748, 1.93997, 3.75236, 8.71198, 0.00639545, 0.00639545, 0, 1, 1.01136, 1.00459, 2.53456e-14}}}, diff --git a/src/gpu/intel/ocl/gemm/ref_gemm.hpp b/src/gpu/intel/ocl/gemm/ref_gemm.hpp index ab8dc82550f..bec5951a431 100644 --- a/src/gpu/intel/ocl/gemm/ref_gemm.hpp +++ b/src/gpu/intel/ocl/gemm/ref_gemm.hpp @@ -214,6 +214,7 @@ struct ref_gemm_t : public gpu_gemm_t { DNNL_ARG_A, DNNL_ARG_B, DNNL_ARG_C}; for (int arg : supported_args) { if (!zp.has_default_values(arg)) { + if (arg != DNNL_ARG_C) return false; const int mask = zp.get_mask(arg); if (mask > 0) return false; }