@@ -2540,7 +2540,8 @@ class fully_connected_gpu_tests: public ::testing::Test {
2540
2540
}
2541
2541
2542
2542
void test_compressed_int4_scale_dyn_quan_weight_i4 (bool is_dynamic, int batch = 1 , int ifm = 512 , int ofm = 2048 ,
2543
- int quantize_group_size = 32 , int scales_group_size = 128 ) {
2543
+ int quantize_group_size = 32 , int scales_group_size = 128 ,
2544
+ bool is_wzp_test = false , bool is_wzp_scalar = false ) {
2544
2545
tests::random_generator rg (GET_SUITE_NAME);
2545
2546
auto & engine = get_test_engine ();
2546
2547
@@ -2550,12 +2551,15 @@ class fully_connected_gpu_tests: public ::testing::Test {
2550
2551
long int batch_num = batch;
2551
2552
long int ifm_num = ifm;
2552
2553
long int ofm_num = ofm;
2554
+ long int wzp_num = is_wzp_scalar ? 1 : ofm_num;
2553
2555
2554
2556
auto input_ps = ov::PartialShape{ batch_num, 1 , ifm_num };
2555
2557
auto input_mem = engine.allocate_memory ({ input_ps, data_types::f16, format::bfyx });
2556
2558
2557
2559
auto weights_mem = engine.allocate_memory ({ {ofm_num, ifm_num}, data_types::i4, format::bfyx });
2558
2560
auto scale_mem = engine.allocate_memory ({ {ofm_num, ifm_num / scales_group_size}, data_types::f16, format::fbyx });
2561
+ auto dcomp_zp_mem = engine.allocate_memory ({ {wzp_num, 1 }, data_types::u8, format::bfyx });
2562
+
2559
2563
2560
2564
auto input_data = rg.generate_random_1d <ov::float16>(batch_num * ifm_num, -2 .f , 2 .f );
2561
2565
set_values (input_mem, input_data);
@@ -2566,28 +2570,38 @@ class fully_connected_gpu_tests: public ::testing::Test {
2566
2570
auto scale_data = rg.generate_random_1d <ov::float16>(ofm_num * ifm_num / scales_group_size, -2 .f , 2 .f );
2567
2571
set_values (scale_mem, scale_data);
2568
2572
2573
+ if (is_wzp_test) {
2574
+ auto zp_data = rg.generate_random_1d <uint8_t >(wzp_num, 0 , 2 );
2575
+ set_values (dcomp_zp_mem, zp_data);
2576
+ }
2577
+
2569
2578
auto in_layout = is_dynamic ? layout{ ov::PartialShape{ -1 , -1 , -1 }, data_types::f16, format::bfyx }
2570
2579
: layout{ input_ps, data_types::f16, format::bfyx };
2571
2580
2572
- auto fc_prim = fully_connected (" fc_prim" , input_info (" input" ), " weights" , " " , " scale" , " " , data_types::f16, 3 , 2 );
2573
- fc_prim.decompression_zero_point_scalar = 0 ;
2581
+ auto dcomp_zp_name = is_wzp_test ? " wzp" : " " ;
2582
+ auto fc_prim = fully_connected (" fc_prim" , input_info (" input" ), " weights" , " " , " scale" , dcomp_zp_name, data_types::f16, 3 , 2 );
2583
+
2584
+ if (is_wzp_test) {
2585
+ fc_prim.compressed_weights = true ;
2586
+ fc_prim.decompression_zero_point = is_wzp_test ? " wzp" : " " ;
2587
+ }
2574
2588
2575
2589
// Implemented dynamic quantize kernel
2576
2590
auto get_ref_results = [&]() {
2577
- topology topology (
2578
- input_layout (" input" , in_layout),
2579
- data (" weights" , weights_mem),
2580
- data (" scale" , scale_mem),
2581
- fc_prim
2582
- );
2591
+ topology topo;
2592
+ topo. add ( input_layout (" input" , in_layout));
2593
+ topo. add ( data (" weights" , weights_mem));
2594
+ topo. add ( data (" scale" , scale_mem));
2595
+ topo. add ( data ( " wzp " , dcomp_zp_mem));
2596
+ topo. add (fc_prim );
2583
2597
2584
2598
auto config = get_test_default_config (engine);
2585
2599
config.set_property (ov::intel_gpu::allow_new_shape_infer (true ));
2586
2600
ov::intel_gpu::ImplementationDesc fc_impl_desc = { format::bfyx, " fully_connected_gpu_bf_tiled" , impl_types::ocl };
2587
2601
config.set_property (ov::intel_gpu::force_implementations (ov::intel_gpu::ImplForcingMap{ {" fc_prim" , fc_impl_desc} }));
2588
2602
config.set_property (ov::hint::dynamic_quantization_group_size (0 ));
2589
2603
2590
- network network (engine, topology , config);
2604
+ network network (engine, topo , config);
2591
2605
network.set_input_data (" input" , input_mem);
2592
2606
2593
2607
auto outputs = network.execute ();
@@ -2604,6 +2618,7 @@ class fully_connected_gpu_tests: public ::testing::Test {
2604
2618
input_layout (" input" , in_layout),
2605
2619
data (" weights" , weights_mem),
2606
2620
data (" scale" , scale_mem),
2621
+ data (" wzp" , dcomp_zp_mem),
2607
2622
fc_prim
2608
2623
);
2609
2624
@@ -3699,6 +3714,26 @@ TEST_F(fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_edge_ca
3699
3714
this ->test_compressed_int4_scale_dyn_quan_weight_i4 (true , 359 , 1536 , 2560 , 128 , 64 );
3700
3715
}
3701
3716
3717
+ TEST_F (fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_no_wzp) {
3718
+ this ->test_compressed_int4_scale_dyn_quan_weight_i4 (true , 320 , 1024 , 1024 , 32 , 32 , false );
3719
+ }
3720
+
3721
+ TEST_F (fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_wzp) {
3722
+ this ->test_compressed_int4_scale_dyn_quan_weight_i4 (true , 320 , 1024 , 1024 , 32 , 32 , true );
3723
+ }
3724
+
3725
+ TEST_F (fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_wzp_scalar) {
3726
+ this ->test_compressed_int4_scale_dyn_quan_weight_i4 (true , 320 , 1024 , 1024 , 32 , 32 , true );
3727
+ }
3728
+
3729
+ TEST_F (fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_wzp_128) {
3730
+ this ->test_compressed_int4_scale_dyn_quan_weight_i4 (true , 320 , 1024 , 1024 , 128 , 128 , true );
3731
+ }
3732
+
3733
+ TEST_F (fully_connected_gpu_tests, compressed_int4_scale_dynamic_quantize_wzp_static) {
3734
+ this ->test_compressed_int4_scale_dyn_quan_weight_i4 (false , 320 , 1024 , 1024 , 32 , 32 , true );
3735
+ }
3736
+
3702
3737
TEST_F(fully_connected_gpu_tests, compressed_scale_bias) {
    // Runs the compressed scale + bias helper with its flag argument set to false.
    // NOTE(review): the meaning of the flag is not visible in this chunk — confirm against the helper.
    test_compressed_scale_bias(false);
}
0 commit comments