Skip to content

Commit 1e3b88e

Browse files
authored
[GPU] Fix regression by selection of reference MatMul (#25633)
[GPU] Resolve unexpected dynamic-shape input propagated from Reshape. ### Details: - Fix regression caused by falling back to the reference MatMul kernel when the fully-connected input feature dimension is dynamic; derive the IFM size from the weights instead. ### Tickets: - 147083 --------- Signed-off-by: Min, Byung-il <byungil.min@intel.com>
1 parent b2a471b commit 1e3b88e

File tree

4 files changed

+75
-32
lines changed

4 files changed

+75
-32
lines changed

src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl

+4-16
Original file line numberDiff line numberDiff line change
@@ -116,25 +116,13 @@ KERNEL(quantize_input)(
116116

117117

118118
#if !REALIGN_FP16_OFFSET
119-
# if OUTPUT_3D
120-
# define MAIN_LOOP_ELEMENTS_COUNT INPUT0_SIZE_Y
121-
# else
122-
# define MAIN_LOOP_ELEMENTS_COUNT INPUT0_ELEMENTS_COUNT
123-
# endif
119+
#define MAIN_LOOP_ELEMENTS_COUNT IFM_SIZE
124120
#else
125-
// For REALIGN_FP16_OFFSET one feature is processed separately before entering main loop to correct alignment.
126-
# if OUTPUT_3D
127-
# define MAIN_LOOP_ELEMENTS_COUNT (INPUT0_SIZE_Y - 1)
128-
# else
129-
# define MAIN_LOOP_ELEMENTS_COUNT (INPUT0_ELEMENTS_COUNT - 1)
130-
# endif
121+
// For REALIGN_FP16_OFFSET one feature is processed separately before entering main loop to correct alignment.
122+
#define MAIN_LOOP_ELEMENTS_COUNT (IFM_SIZE - 1)
131123
#endif
132124

133-
#if OUTPUT_3D
134-
# define INPUT_ELEMENTS_COUNT INPUT0_SIZE_Y
135-
#else
136-
# define INPUT_ELEMENTS_COUNT INPUT0_ELEMENTS_COUNT
137-
#endif
125+
#define INPUT_ELEMENTS_COUNT IFM_SIZE
138126

139127
#if IS_DYNAMIC && COMPRESSED_WEIGHTS_INT4
140128
#pragma disable_includes_optimization

src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp

+26-11
Original file line numberDiff line numberDiff line change
@@ -15,14 +15,20 @@ static constexpr size_t min_slm_size = 256;
1515
namespace kernel_selector {
1616

1717
static std::pair<size_t, size_t> get_input_bf_size(const fully_connected_params& params) {
18-
size_t input_f = params.inputs[0].Feature().v;
19-
size_t input_batch = params.inputs[0].Batch().v;
18+
auto& input = params.inputs[0];
19+
size_t input_f = input.Feature().v;
20+
size_t input_batch = input.Batch().v;
21+
2022
// 3D input
2123
if (params.outputs[0].GetLayout() == DataLayout::bfyx) {
22-
input_f = params.inputs[0].Y().v;
23-
input_batch = params.inputs[0].Batch().v * params.inputs[0].Feature().v;
24+
input_f = input.Y().v;
25+
input_batch = input.Batch().v * input.Feature().v;
2426
}
2527

28+
// In some models, input_f can be dynamic in input0. In that case, fall back to the IFM value of the weights.
29+
if (input.is_dynamic() && input_f == 0 && params.weights.IFM().v != 0)
30+
input_f = params.weights.IFM().v;
31+
2632
return {input_batch, input_f};
2733
}
2834

@@ -153,8 +159,7 @@ bool FullyConnected_bf_tiled::Validate(const Params& params) const {
153159

154160
// Dynamic kernel doesn't support dynamic weights yet
155161
if (fc_params.is_shape_agnostic && input.is_dynamic()) {
156-
if ((output.GetLayout() == DataLayout::bfyx && input.Y().v == 0) ||
157-
(output.GetLayout() == DataLayout::bf && input.Feature().v == 0))
162+
if (get_input_bf_size(fc_params).second == 0)
158163
return false;
159164
}
160165

@@ -509,6 +514,7 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
509514
jit.AddConstant(MakeJitConstant("DYNAMIC_QUANTIZE", 0));
510515
}
511516

517+
jit.AddConstant(MakeJitConstant("IFM_SIZE", get_input_bf_size(params).second));
512518
jit.AddConstant(MakeJitConstant("SIMD", simd));
513519
jit.AddConstant(MakeJitConstant("TILE_B", dispatchData.tile_m));
514520
jit.AddConstant(MakeJitConstant("HALF_TILE_B", dispatchData.tile_m/2));
@@ -539,16 +545,18 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
539545

540546
// for 3d output we are treating spatial as features
541547
if (params.outputs[0].GetLayout() == DataLayout::bfyx) {
548+
auto tile_in_b_pitch = (params.inputs[0].Feature().pitch == 0) ? get_input_bf_size(params).second : params.inputs[0].Feature().pitch;
542549
jit.AddConstant(MakeJitConstant("TILE_OUT_F_NUM", params.outputs[0].Y().v));
543550
jit.AddConstant(MakeJitConstant("TILE_OUT_F_PITCH", params.outputs[0].Y().pitch));
544-
jit.AddConstant(MakeJitConstant("TILE_IN_B_PITCH", params.inputs[0].Feature().pitch));
551+
jit.AddConstant(MakeJitConstant("TILE_IN_B_PITCH", tile_in_b_pitch));
545552
jit.AddConstant(MakeJitConstant("TILE_OUT_B_PITCH", params.outputs[0].Feature().pitch));
546553
jit.AddConstant(MakeJitConstant("OUTPUT_3D", true));
547554
jit.AddConstant(MakeJitConstant("BATCH_SIZE", "(OUTPUT_BATCH_NUM * OUTPUT_FEATURE_NUM)"));
548555
} else {
556+
auto tile_in_b_pitch = (params.inputs[0].Batch().pitch == 0) ? get_input_bf_size(params).second : params.inputs[0].Batch().pitch;
549557
jit.AddConstant(MakeJitConstant("TILE_OUT_F_NUM", params.outputs[0].Feature().v));
550558
jit.AddConstant(MakeJitConstant("TILE_OUT_F_PITCH", params.outputs[0].Feature().pitch));
551-
jit.AddConstant(MakeJitConstant("TILE_IN_B_PITCH", params.inputs[0].Batch().pitch));
559+
jit.AddConstant(MakeJitConstant("TILE_IN_B_PITCH", tile_in_b_pitch));
552560
jit.AddConstant(MakeJitConstant("TILE_OUT_B_PITCH", params.outputs[0].Batch().pitch));
553561
jit.AddConstant(MakeJitConstant("BATCH_SIZE", "(OUTPUT_BATCH_NUM)"));
554562
}
@@ -614,6 +622,12 @@ void FullyConnected_bf_tiled::GetUpdateDispatchDataFunc(KernelData& kd) const {
614622
kd.kernels[execute_kernel_idx].params.workGroups.local = dispatchData.lws;
615623
kd.kernels[execute_kernel_idx].skip_execution = KernelData::SkipKernelExecution(prim_params);
616624

625+
auto& input = prim_params.inputs[0];
626+
if (prim_params.outputs[0].GetLayout() == DataLayout::bfyx)
627+
OPENVINO_ASSERT(input.X().pad.Total() == 0 && input.Y().pad.Total() == 0, "[GPU] Invalid padding in spatial axes observed in FC bf tiled.");
628+
else
629+
OPENVINO_ASSERT(input.Feature().pad.Total() == 0, "[GPU] Invalid padding in f axis observed in FC bf tiled.");
630+
617631
if (!kd.internalBufferSizes.empty()) {
618632
// Pre-quantizing kernel was generated. Update the kernel and intermediate buffers or disable it.
619633
if (execute_type == KernelType::DEFAULT) {
@@ -784,7 +798,8 @@ KernelsData FullyConnected_bf_tiled::GetMultiKernelsData(const Params &params,
784798
{
785799
auto& quan_kernel = kd.kernels[0];
786800
DispatchData dyn_quan_dispatch = dispatchData;
787-
dyn_quan_dispatch.gws = {std::max((fc_params.inputs[0].PhysicalSize() / quantize_grp_size), (size_t)1), 1, 1};
801+
auto input_size = std::max(fc_params.inputs[0].PhysicalSize(), get_input_bf_size(fc_params).second);
802+
dyn_quan_dispatch.gws = {input_size / quantize_grp_size, 1, 1};
788803
dyn_quan_dispatch.lws = {16, 1, 1};
789804
quan_kernel.params.workGroups.global = dyn_quan_dispatch.gws;
790805
quan_kernel.params.workGroups.local = dyn_quan_dispatch.lws;
@@ -814,8 +829,8 @@ KernelsData FullyConnected_bf_tiled::GetMultiKernelsData(const Params &params,
814829
quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INPUT, 0});
815830
quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 0});
816831
quan_kernel.params.arguments.push_back({ArgumentDescriptor::Types::INTERNAL_BUFFER, 1});
817-
kd.internalBufferSizes.push_back(fc_params.inputs[0].PhysicalSize());
818-
kd.internalBufferSizes.push_back(fc_params.inputs[0].PhysicalSize() / quantize_grp_size * 2);
832+
kd.internalBufferSizes.push_back(input_size);
833+
kd.internalBufferSizes.push_back(input_size / quantize_grp_size * 2);
819834
kernel_number++;
820835
}
821836
kd.internalBufferDataType = Datatype::F16;

src/plugins/intel_gpu/tests/functional/single_layer_tests/dynamic/matmul.cpp

+2-1
Original file line numberDiff line numberDiff line change
@@ -266,9 +266,10 @@ const std::vector<ShapeRelatedParams> IS3D_smoke = {
266266
},
267267

268268
{ov::test::static_shapes_to_test_representation({{1, 429}, {1, 429, 1}}), {true, true}},
269+
269270
{
270271
{
271-
{{-1, -1}, {{1, 129}, {2, 129}, {1, 129}, {2, 129}}},
272+
{{-1, -1, -1}, {{1, 1, 129}, {1, 2, 129}, {1, 1, 129}, {1, 2, 129}}},
272273
{{1, 129, 1}, {{1, 129, 1}, {1, 129, 1}, {1, 129, 1}, {1, 129, 1}}}
273274
},
274275
{true, true}

src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp

+43-4
Original file line numberDiff line numberDiff line change
@@ -1255,7 +1255,7 @@ class fully_connected_gpu_tests: public ::testing::Test {
12551255
}
12561256
}
12571257

1258-
void test_compressed_int4_scale_dyn_quan(bool is_caching_test, bool is_dynamic, int batch = 1) {
1258+
void test_compressed_int4_scale_dyn_quan(bool is_caching_test, bool is_dynamic, int batch = 1, bool is_wei_dyn = false) {
12591259
tests::random_generator rg(GET_SUITE_NAME);
12601260
auto& engine = get_test_engine();
12611261

@@ -1285,6 +1285,11 @@ class fully_connected_gpu_tests: public ::testing::Test {
12851285
auto scale_data = rg.generate_random_1d<ov::float16>(ofm_num * ifm_num / scales_group_size, -4.0f, 4.0f);
12861286
set_values(scale_mem, scale_data);
12871287

1288+
if (is_wei_dyn) {
1289+
// ifm_num is dynamic
1290+
dyn_input_ps = is_3d ? ov::PartialShape{ -1, -1, -1 } : ov::PartialShape{ -1, -1};
1291+
}
1292+
12881293
auto in_layout = is_dynamic ? layout{ dyn_input_ps, data_types::f16, format::bfyx }
12891294
: layout{ input_ps, data_types::f16, format::bfyx };
12901295

@@ -1302,7 +1307,8 @@ class fully_connected_gpu_tests: public ::testing::Test {
13021307

13031308
auto config = get_test_default_config(engine);
13041309
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
1305-
config.set_property(ov::intel_gpu::optimize_data(true));
1310+
ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bfyx_ref", impl_types::ocl };
1311+
config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc_prim", fc_impl_desc} }));
13061312

13071313
network network(engine, topology, config);
13081314
network.set_input_data("input", input_mem);
@@ -1365,13 +1371,13 @@ class fully_connected_gpu_tests: public ::testing::Test {
13651371
}
13661372

13671373

1368-
void test_compressed_int4_scale(bool is_caching_test, bool is_dynamic, long int batch_num, long int scales_group_size = 128) {
1374+
void test_compressed_int4_scale(bool is_caching_test, bool is_dynamic, long int batch_num, long int scales_group_size = 128, bool is_wei_dyn = false) {
13691375
tests::random_generator rg(GET_SUITE_NAME);
13701376
auto& engine = get_test_engine();
13711377
auto supports_immad = engine.get_device_info().supports_immad;
13721378

13731379
long int ifm_num = 256;
1374-
long int ofm_num = 256;
1380+
long int ofm_num = 512;
13751381

13761382
auto input_mem = engine.allocate_memory({ { batch_num, ifm_num}, data_types::f16, format::bfyx });
13771383
auto weights_mem = engine.allocate_memory({ {ofm_num, ifm_num}, data_types::u4, format::bfyx });
@@ -1392,6 +1398,11 @@ class fully_connected_gpu_tests: public ::testing::Test {
13921398
auto in_layout = is_dynamic ? layout{ {-1, ifm_num}, data_types::f16, format::bfyx }
13931399
: layout{ {batch_num, ifm_num}, data_types::f16, format::bfyx };
13941400

1401+
if (is_dynamic && is_wei_dyn) {
1402+
// ifm_num is dynamic
1403+
in_layout = layout{ {-1, -1}, data_types::f16, format::bfyx };
1404+
}
1405+
13951406
auto dcomp_zp_name = supports_immad ? "dcomp_zp" : "";
13961407

13971408
auto fc_prim = fully_connected("fc_prim", input_info("input"), "weights", "", "scale", dcomp_zp_name, data_types::f16, padding(), 2, 2);
@@ -1409,6 +1420,8 @@ class fully_connected_gpu_tests: public ::testing::Test {
14091420

14101421
auto config = get_test_default_config(engine);
14111422
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
1423+
ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bfyx_ref", impl_types::ocl };
1424+
config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc_prim", fc_impl_desc} }));
14121425

14131426
network network(engine, topology, config);
14141427
network.set_input_data("input", input_mem);
@@ -3324,6 +3337,32 @@ TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dyn_cache_dynamic) {
33243337
this->test_compressed_int4_scale_dyn_quan(true, true, 512);
33253338
}
33263339

3340+
TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_f_input) {
3341+
this->test_compressed_int4_scale(false, true, 256, true);
3342+
}
3343+
3344+
TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_f_input_cached) {
3345+
this->test_compressed_int4_scale(true, true, 260, true);
3346+
}
3347+
TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_f_input_b1g64) {
3348+
this->test_compressed_int4_scale(false, true, 1, 64, true);
3349+
}
3350+
3351+
TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_f_input_b1g128) {
3352+
this->test_compressed_int4_scale(false, true, 1, 128, true);
3353+
}
3354+
3355+
TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dyn_quan_dynamic_f_input_single_batch) {
3356+
this->test_compressed_int4_scale_dyn_quan(false, true, 1, true);
3357+
}
3358+
3359+
TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dyn_quan_dynamic_f_input) {
3360+
this->test_compressed_int4_scale_dyn_quan(false, true, 512, true);
3361+
}
3362+
3363+
TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dyn_quan_dynamic_f_input_unaligned) {
3364+
this->test_compressed_int4_scale_dyn_quan(false, true, 511, true);
3365+
}
33273366

33283367

33293368
TEST_F(fully_connected_gpu_tests, compressed_scale_bias) {

0 commit comments

Comments (0)