
Commit e1c167a

[GPU] Modify fc_gpu_bf_tiled kernel to enable weight zp (#26367)
### Details:
- *item1*
- *...*

### Tickets:
- CVS-150930

---------

Signed-off-by: Min, Byung-il <byungil.min@intel.com>
Signed-off-by: Min, Byungil <byungil.min@intel.com>
1 parent de30969 commit e1c167a

File tree: 3 files changed (+72 −30 lines)


src/plugins/intel_gpu/src/kernel_selector/cl_kernels/fully_connected_gpu_bf_tiled.cl (+19 −13)
@@ -886,38 +886,44 @@ inline void FUNC(fc_bf_tiled_kernel_dyn_quan)(
         SLM_FILTER_PACKED_VEC wei_packed0 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, weights_idx);
         SLM_FILTER_PACKED_VEC wei_packed1 = BLOCK_READN(FILTER_TYPE, FILTER_ACTUAL_LOAD_BLOCK_SIZE, weights, (weights_idx + ((IFM_SIZE / 2) * 16)));
         DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked;
-        dq_wei_unpacked.s0123 = UNPACK_TRANSPOSED_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed0));
-        dq_wei_unpacked.s4567 = UNPACK_TRANSPOSED_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed1));
+        // loaded weights 'wei_packed' of os_iyx_osv16 format have continuous values along TILE_K. So no need to transpose while unpacking
+        dq_wei_unpacked.s0123 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed0));
+        dq_wei_unpacked.s4567 = UNPACK_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD*)&wei_packed1));
 #else
         SLM_FILTER_PACKED_VEC wei_packed = BLOCK_READN(FILTER_TYPE, FILTER_LOAD_BLOCK_SIZE, weights, weights_idx);
         DQ_SLM_FILTER_UNPACKED_VEC dq_wei_unpacked = UNPACK_TRANSPOSED_INT4(DQ_TYPE, *((INT4_PACKED_TYPE_PRELOAD *)&wei_packed));
 #endif

         // Calculate zero-point and scale only for DECOMPRESSION_SCALE_POST_OP enabled
+        // Calculate weight : w = (w - dzp) * ds
 #if DECOMPRESSION_ZP_TERM
     #if DECOMPRESSION_ZP_SCALAR
         DQ_SLM_FILTER_UNPACKED_VEC dzp = (DQ_SLM_FILTER_UNPACKED_VEC)(DECOMPRESSION_ZP_VALUE);
+        dq_wei_unpacked -= dzp;
     #elif DECOMPRESSION_ZP_GROUPS_NUM > 1
-        DQ_SLM_FILTER_UNPACKED_VEC dzp;
+        DQ_TYPE* w = (DQ_TYPE*)(&dq_wei_unpacked);
+        const uint ni_offset = ni * TILE_IFM * SIMD + local_id * FILTER_LOAD_ITERS * FILTER_LOAD_BLOCK_SIZE;
         unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) {
+            const uint offset_ofm = out_f + fi*SIMD + sglid;
             unroll_for(uint kii = 0; kii < FILTER_LOAD_BLOCK_SIZE; ++kii) {
-                const uint offset_ofm = out_f + fi*SIMD + sglid;
-                const uint offset_ifm = ni * TILE_IFM * SIMD + local_id * FILTER_LOAD_ITERS * FILTER_LOAD_BLOCK_SIZE + load_iter * FILTER_LOAD_BLOCK_SIZE + kii;
+                const uint offset_ifm = ni_offset + load_iter * FILTER_LOAD_BLOCK_SIZE + kii;
                 const uint zp_offset = (offset_ofm % DECOMPRESSION_ZP_BATCH_NUM) * DECOMPRESSION_ZP_BATCH_PITCH +
                                        (offset_ifm / DECOMPRESSION_ZP_GROUP_SIZE) * DECOMPRESSION_ZP_FEATURE_PITCH;
-                dzp[W_IDX] = decompression_zp[zp_offset];
+                w[W_DYN_QUAN_IDX] = w[W_DYN_QUAN_IDX] - TO_DQ_TYPE(decompression_zp[zp_offset]);
             }
         }
     #else
-        DQ_SLM_FILTER_UNPACKED_VEC dzp = (DQ_SLM_FILTER_UNPACKED_VEC)(d_zps[0]);
+        DQ_TYPE* w = (DQ_TYPE*)(&dq_wei_unpacked);
+        unroll_for(uint fi = 0; fi < TILE_OFM; ++fi) {
+            unroll_for(uint kii = 0; kii < FILTER_LOAD_BLOCK_SIZE; ++kii) {
+                w[W_DYN_QUAN_IDX] = w[W_DYN_QUAN_IDX] - d_zps[fi % DECOMPRESSION_ZP_LENGTH];
+            }
+        }
     #endif
 #else
         DQ_SLM_FILTER_UNPACKED_VEC dzp = (DQ_SLM_FILTER_UNPACKED_VEC)(ACCUMULATOR_VAL_ZERO);
 #endif

-        // Calculate weight : w = (w - dzp) * ds
-        dq_wei_unpacked -= dzp;
-
 #if FILTER_LOAD_BLOCK_SIZE == 2
         DQ_SLM_FILTER_VEC wei_1 = {dq_wei_unpacked.s01, dq_wei_unpacked.s23};
         char_slm_weight[wei_local_idx] = as_int(wei_1);
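The hunk above is where the dyn-quan SLM path now folds the weight zero-point into the unpacked weights before they are stored to local memory, per the added comment "Calculate weight : w = (w - dzp) * ds" (the scale multiply is applied later via DECOMPRESSION_SCALE_POST_OP). Below is a host-side reference sketch of the same per-group arithmetic, with hypothetical helper and parameter names; only the index math mirrors the kernel's zp_offset computation.

```cpp
#include <cstdint>
#include <vector>

// Hypothetical reference of "w = (w - dzp) * ds": subtract the per-group
// zero-point, then apply the per-group decompression scale.
// All parameter names are stand-ins for the kernel's JIT-defined constants.
float dequantize_weight(int8_t w,                        // unpacked int4 weight value
                        uint32_t ofm, uint32_t ifm,      // output / input feature index
                        const std::vector<uint8_t>& zp,  // decompression zero-points
                        const std::vector<float>& ds,    // decompression scales
                        uint32_t zp_group_size,          // DECOMPRESSION_ZP_GROUP_SIZE
                        uint32_t zp_feature_pitch,       // DECOMPRESSION_ZP_FEATURE_PITCH
                        uint32_t zp_batch_num,           // DECOMPRESSION_ZP_BATCH_NUM
                        uint32_t zp_batch_pitch,         // DECOMPRESSION_ZP_BATCH_PITCH
                        uint32_t scale_group_size,       // scale group size
                        uint32_t scale_feature_pitch) {  // scale groups per OFM row
    // Same index arithmetic as zp_offset in the kernel: OFM (batch) term + IFM-group term.
    const uint32_t zp_offset = (ofm % zp_batch_num) * zp_batch_pitch +
                               (ifm / zp_group_size) * zp_feature_pitch;
    const float scale = ds[ofm * scale_feature_pitch + ifm / scale_group_size];
    return (static_cast<float>(w) - static_cast<float>(zp[zp_offset])) * scale;
}
```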
@@ -1117,7 +1123,7 @@ KERNEL(fc)(
 #endif
 ) {
 #if USE_SLM
-    #if DYNAMIC_QUANTIZE
+    #if DYNAMIC_QUANTIZE && (TILE_OFM == 2)
         __local int dq_wei_local_mem[SIMD * TILE_OFM * SIMD];
     #else
         __local ACCUMULATOR_TYPE wei_local_mem[TILE_IFM * SIMD * TILE_OFM * SIMD];
@@ -1259,7 +1265,7 @@ KERNEL(fc)(
 #endif
         );
     } else {
-    #if USE_SLM && DYNAMIC_QUANTIZE
+    #if USE_SLM && DYNAMIC_QUANTIZE && (TILE_OFM == 2)
         FUNC_CALL(fc_bf_tiled_kernel_dyn_quan)(
             OPTIONAL_SHAPE_INFO_TENSOR
             input,
@@ -1306,7 +1312,7 @@ KERNEL(fc)(
 #endif
     }
 #else
-    #if USE_SLM && DYNAMIC_QUANTIZE
+    #if USE_SLM && DYNAMIC_QUANTIZE && (TILE_OFM == 2)
        FUNC_CALL(fc_bf_tiled_kernel_dyn_quan)(
            OPTIONAL_SHAPE_INFO_TENSOR
            input,
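All three dispatch sites now gate the SLM dynamic-quantization entry point on TILE_OFM == 2, matching the host-side OPENVINO_ASSERT(dispatchData.tile_n == 2, ...) that appears later in this commit. A minimal sketch of the local-memory footprint that guard implies, assuming SIMD = 16 (an assumption for the example, not read from the dispatch data):

```cpp
// Sketch only: sizes implied by the guarded declaration
// __local int dq_wei_local_mem[SIMD * TILE_OFM * SIMD];
constexpr int SIMD = 16;     // assumed sub-group size
constexpr int TILE_OFM = 2;  // the only value the SLM dyn-quan path now accepts
constexpr int dq_wei_local_elems = SIMD * TILE_OFM * SIMD;                               // 512 ints
constexpr int dq_wei_local_bytes = dq_wei_local_elems * static_cast<int>(sizeof(int));  // 2048 bytes
static_assert(dq_wei_local_elems == 512, "16 * 2 * 16");
```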

src/plugins/intel_gpu/src/kernel_selector/kernels/fully_connected/fully_connected_kernel_bf_tiled.cpp (+8 −7)
@@ -111,8 +111,7 @@ static bool should_dynamic_quantize(const fully_connected_params& params) {
     if ((scale_group_size % simd == 0) && (input_f % dynamic_quantization_group_size == 0) &&
         (params.is_shape_agnostic || (params.inputs[0].Batch().v > 1 && input_b > min_slm_size)) &&
         params.inputs[0].GetDType() == Datatype::F16 &&
-        (params.weights.GetDType() == WeightsType::INT4 || params.weights.GetDType() == WeightsType::UINT4) &&
-        (params.decompression_zero_point.Feature().v == 1)) {
+        (params.weights.GetDType() == WeightsType::INT4 || params.weights.GetDType() == WeightsType::UINT4)) {
         GPU_DEBUG_TRACE_DETAIL << " Dynamic quantizing for FC : scale_group_size " << scale_group_size << ", Input (" <<
             kernel_selector::toString(params.inputs[0].GetDType()) << ", " << kernel_selector::toString(params.outputs[0].GetLayout()) <<
             ") B: " << params.inputs[0].Batch().v << ", F: " << params.inputs[0].Feature().v << ", Y: " << params.inputs[0].Y().v << std ::endl;
@@ -524,13 +523,15 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
         if (scale_group_size % simd == 0 && !dispatchData.use_slm)
             jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1));
     }
-    if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv32_isv2)
+    if (params.weights.GetLayout() == WeightsLayout::os_is_yx_osv32_isv2) {
         jit.AddConstant(MakeJitConstant("W_IDX", "fi * TILE_K + kii"));
-    else if (params.weights.GetLayout() == WeightsLayout::os_iyx_osv16)
+    } else if (params.weights.GetLayout() == WeightsLayout::os_iyx_osv16) {
         jit.AddConstant(MakeJitConstant("W_IDX", "fi * TILE_K + kii"));
-    else
+    } else {
         jit.AddConstant(MakeJitConstant("W_IDX", "kii * TILE_OFM + fi"));
+    }

+    jit.AddConstant(MakeJitConstant("W_DYN_QUAN_IDX", "fi * TILE_K + kii"));

     if (dispatchData.use_slm) {
         OPENVINO_ASSERT(dispatchData.tile_n == 2, "[GPU] Unsupported TILE_OFM size for SLM kernel configuration");
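The new W_DYN_QUAN_IDX constant always expands to fi * TILE_K + kii, regardless of the layout-dependent W_IDX, and is what the dyn-quan kernel uses to walk the unpacked weight vector as a flat DQ_TYPE array. A small illustration of the two expansions, with TILE_K = 4 and TILE_OFM = 2 chosen purely for the example:

```cpp
#include <cstdio>

// Illustration only: how the generated macros index a flat unpacked-weight block.
// TILE_K / TILE_OFM values here are assumptions, not taken from dispatch data.
constexpr int TILE_K = 4;
constexpr int TILE_OFM = 2;

int w_idx_default(int fi, int kii)     { return kii * TILE_OFM + fi; }  // "kii * TILE_OFM + fi"
int w_idx_osv_layouts(int fi, int kii) { return fi * TILE_K + kii; }    // "fi * TILE_K + kii"
int w_dyn_quan_idx(int fi, int kii)    { return fi * TILE_K + kii; }    // always "fi * TILE_K + kii"

int main() {
    // For fi = 1, kii = 2: default layout -> 5, osv layouts / dyn-quan -> 6.
    std::printf("%d %d %d\n", w_idx_default(1, 2), w_idx_osv_layouts(1, 2), w_dyn_quan_idx(1, 2));
}
```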
@@ -576,14 +577,14 @@ JitConstants FullyConnected_bf_tiled::GetJitConstants(const fully_connected_para
     }

     // Validated perf gain, Dynamic quantize force enable SCALE_POST_OP for char type multiplication
-    if (should_dynamic_quantize(params) && dispatchData.tile_m > 1 && dispatchData.tile_n == 2) {
+    if (should_dynamic_quantize(params)) {
         jit.AddConstant(MakeJitConstant("DYNAMIC_QUANTIZE", 1));
         jit.AddConstant(MakeJitConstant("DECOMPRESSION_SCALE_POST_OP", 1));
         jit.AddConstant(MakeJitConstant("DQ_TYPE", "char"));
         jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", quantize_grp_size));
     } else {
         jit.AddConstant(MakeJitConstant("DYNAMIC_QUANTIZE", 0));
-        jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", -1));
+        jit.AddConstant(MakeJitConstant("QUANTIZE_GROUP_SIZE", min_quantize_grp_size));
     }

     jit.AddConstant(MakeJitConstant("IFM_SIZE", get_input_bf_size(params).second));

src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp (+45 −10)
@@ -2540,7 +2540,8 @@ class fully_connected_gpu_tests: public ::testing::Test {
     }

     void test_compressed_int4_scale_dyn_quan_weight_i4(bool is_dynamic, int batch = 1, int ifm = 512, int ofm = 2048,
-                                                       int quantize_group_size = 32, int scales_group_size = 128) {
+                                                       int quantize_group_size = 32, int scales_group_size = 128,
+                                                       bool is_wzp_test = false, bool is_wzp_scalar = false) {
         tests::random_generator rg(GET_SUITE_NAME);
         auto& engine = get_test_engine();

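For readability, the helper's full signature assembled from the added parameter lines above:

```cpp
void test_compressed_int4_scale_dyn_quan_weight_i4(bool is_dynamic, int batch = 1, int ifm = 512, int ofm = 2048,
                                                   int quantize_group_size = 32, int scales_group_size = 128,
                                                   bool is_wzp_test = false, bool is_wzp_scalar = false);
```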
@@ -2550,12 +2551,15 @@ class fully_connected_gpu_tests: public ::testing::Test {
         long int batch_num = batch;
         long int ifm_num = ifm;
         long int ofm_num = ofm;
+        long int wzp_num = is_wzp_scalar ? 1 : ofm_num;

         auto input_ps = ov::PartialShape{ batch_num, 1, ifm_num };
         auto input_mem = engine.allocate_memory({ input_ps, data_types::f16, format::bfyx });

         auto weights_mem = engine.allocate_memory({ {ofm_num, ifm_num}, data_types::i4, format::bfyx });
         auto scale_mem = engine.allocate_memory({ {ofm_num, ifm_num / scales_group_size}, data_types::f16, format::fbyx });
+        auto dcomp_zp_mem = engine.allocate_memory({ {wzp_num, 1}, data_types::u8, format::bfyx });
+

         auto input_data = rg.generate_random_1d<ov::float16>(batch_num * ifm_num, -2.f, 2.f);
         set_values(input_mem, input_data);
@@ -2566,28 +2570,38 @@ class fully_connected_gpu_tests: public ::testing::Test {
         auto scale_data = rg.generate_random_1d<ov::float16>(ofm_num * ifm_num / scales_group_size, -2.f, 2.f);
         set_values(scale_mem, scale_data);

+        if (is_wzp_test) {
+            auto zp_data = rg.generate_random_1d<uint8_t>(wzp_num, 0, 2);
+            set_values(dcomp_zp_mem, zp_data);
+        }
+
         auto in_layout = is_dynamic ? layout{ ov::PartialShape{ -1, -1, -1 }, data_types::f16, format::bfyx }
                                     : layout{ input_ps, data_types::f16, format::bfyx };

-        auto fc_prim = fully_connected("fc_prim", input_info("input"), "weights", "", "scale", "", data_types::f16, 3, 2);
-        fc_prim.decompression_zero_point_scalar = 0;
+        auto dcomp_zp_name = is_wzp_test ? "wzp" : "";
+        auto fc_prim = fully_connected("fc_prim", input_info("input"), "weights", "", "scale", dcomp_zp_name, data_types::f16, 3, 2);
+
+        if (is_wzp_test) {
+            fc_prim.compressed_weights = true;
+            fc_prim.decompression_zero_point = is_wzp_test ? "wzp" : "";
+        }

         // Implemented dynamic quantize kernel
         auto get_ref_results = [&]() {
-            topology topology(
-                input_layout("input", in_layout),
-                data("weights", weights_mem),
-                data("scale", scale_mem),
-                fc_prim
-            );
+            topology topo;
+            topo.add(input_layout("input", in_layout));
+            topo.add(data("weights", weights_mem));
+            topo.add(data("scale", scale_mem));
+            topo.add(data("wzp", dcomp_zp_mem));
+            topo.add(fc_prim);

             auto config = get_test_default_config(engine);
             config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
             ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, "fully_connected_gpu_bf_tiled", impl_types::ocl };
             config.set_property(ov::intel_gpu::force_implementations(ov::intel_gpu::ImplForcingMap{ {"fc_prim", fc_impl_desc} }));
             config.set_property(ov::hint::dynamic_quantization_group_size(0));

-            network network(engine, topology, config);
+            network network(engine, topo, config);
             network.set_input_data("input", input_mem);

             auto outputs = network.execute();
@@ -2604,6 +2618,7 @@ class fully_connected_gpu_tests: public ::testing::Test {
             input_layout("input", in_layout),
             data("weights", weights_mem),
             data("scale", scale_mem),
+            data("wzp", dcomp_zp_mem),
             fc_prim
         );

@@ -3699,6 +3714,26 @@ TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_edge_ca
     this->test_compressed_int4_scale_dyn_quan_weight_i4(true, 359, 1536, 2560, 128, 64);
 }

+TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_no_wzp) {
+    this->test_compressed_int4_scale_dyn_quan_weight_i4(true, 320, 1024, 1024, 32, 32, false);
+}
+
+TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_wzp) {
+    this->test_compressed_int4_scale_dyn_quan_weight_i4(true, 320, 1024, 1024, 32, 32, true);
+}
+
+TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_wzp_scalar) {
+    this->test_compressed_int4_scale_dyn_quan_weight_i4(true, 320, 1024, 1024, 32, 32, true);
+}
+
+TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_wzp_128) {
+    this->test_compressed_int4_scale_dyn_quan_weight_i4(true, 320, 1024, 1024, 128, 128, true);
+}
+
+TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_wzp_static) {
+    this->test_compressed_int4_scale_dyn_quan_weight_i4(false, 320, 1024, 1024, 32, 32, true);
+}
+
 TEST_F(fully_connected_gpu_tests, compressed_scale_bias) {
     this->test_compressed_scale_bias(false);
 }
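In the new tests, the seventh argument is is_wzp_test and the eighth is is_wzp_scalar. A hypothetical call that would exercise the scalar zero-point branch of the helper (wzp_num == 1) passes both flags; this exact invocation is illustrative and not one of the tests above:

```cpp
// Illustrative only: enable the weight-zp path and the scalar (wzp_num == 1) zero-point layout.
this->test_compressed_int4_scale_dyn_quan_weight_i4(true, 320, 1024, 1024, 32, 32, true, true);
```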
