Skip to content

Commit cd7718b

Browse files
[GPU] Add remove reorder pattern i32 to f16
1 parent 606950d commit cd7718b

File tree

2 files changed

+83
-0
lines changed

2 files changed

+83
-0
lines changed

src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp

+46
Original file line numberDiff line numberDiff line change
@@ -696,6 +696,52 @@ void remove_redundant_reorders::run(program& p) {
696696
p.remove_if_dangling(*node);
697697
}
698698

699+
// Remove reorder for data type convert
700+
// Target pattern: F32/F16 -> I32 -> F16 reorder
701+
// The F32 -> I32 -> F32 reorder pattern should already have been handled before this point
702+
itr = p.get_processing_order().begin();
703+
while (itr != p.get_processing_order().end()) {
704+
auto& node = *itr++;
705+
if (!node->is_type<reorder>())
706+
continue;
707+
708+
auto& dep = node->get_dependency(0);
709+
710+
if (dep.is_type<reorder>())
711+
continue;
712+
713+
bool allowed_dep_input_type = true;
714+
auto dep_input_layouts = dep.get_input_layouts();
715+
for(auto& l : dep_input_layouts) {
716+
if (!(l.data_type == data_types::f16 || l.data_type == data_types::f32)) {
717+
allowed_dep_input_type = false;
718+
continue;
719+
}
720+
}
721+
722+
auto dep_output_layout = dep.get_output_layout();
723+
auto node_output_layout = node->get_output_layout();
724+
if (!(allowed_dep_input_type &&
725+
dep_output_layout.data_type == data_types::i32 &&
726+
node_output_layout.data_type == data_types::f16))
727+
continue;
728+
729+
// Allow only a data_type conversion: every other layout field must match
730+
auto validate_layout = node_output_layout;
731+
validate_layout.data_type = dep_output_layout.data_type;
732+
if (validate_layout != dep_output_layout)
733+
continue;
734+
735+
dep_output_layout.data_type = node_output_layout.data_type;
736+
dep.set_output_layout(dep_output_layout);
737+
738+
LOG_NODE_REMOVAL(node->id());
739+
p.replace_all_usages(*node, dep);
740+
p.add_optimized_primitive_info(node->id());
741+
p.remove_all_connections(*node);
742+
p.remove_if_dangling(*node);
743+
}
744+
699745
for (auto n : p.get_processing_order()) {
700746
if (n->is_in_data_flow() && n->is_type<reorder>()) {
701747
auto preferred_impl = lo.get_preferred_impl_type(*n, n->get_input_layout(0).format);

src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp

+37
Original file line numberDiff line numberDiff line change
@@ -292,6 +292,43 @@ TEST(fully_connected_gpu, no_biases_int8) {
292292
ASSERT_EQ(-52.0f, output_ptr[3]);
293293
}
294294

295+
TEST(fully_connected_gpu, no_biases_fc_i32_reorder_f16) {
296+
const int32_t input_f = 2, input_b = 1, // size of the whole input buffer
297+
weight_b = 1, weight_f = 2; // size of the whole weights buffer
298+
299+
auto& engine = get_test_engine();
300+
301+
auto input_prim = engine.allocate_memory({ data_types::i32, format::bfyx, { input_b, input_f, 1, 1 } });
302+
auto weights_prim = engine.allocate_memory({ data_types::i32, format::bfyx, { weight_b, weight_f, 1, 1 } });
303+
304+
set_values<int32_t>(input_prim, { 1, 1 });
305+
set_values<int32_t>(weights_prim, { 1, 1 });
306+
307+
cldnn::topology topology{
308+
input_layout("input", input_prim->get_layout()),
309+
data("weights", weights_prim),
310+
fully_connected("fc_prim", input_info("input"), "weights"),
311+
reorder("reorder_to_f16", input_info("fc_prim"), { data_types::f16, format::bfyx, { input_b, weight_b, 1, 1 } }),
312+
activation("output", input_info("reorder_to_f16"), activation_func::floor)
313+
};
314+
315+
ExecutionConfig config = get_test_default_config(engine);
316+
config.set_property(ov::intel_gpu::optimize_data(true));
317+
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
318+
319+
cldnn::network network(engine, topology, config);
320+
321+
network.set_input_data("input", input_prim);
322+
323+
auto outputs = network.execute();
324+
ASSERT_EQ(outputs.size(), size_t(1));
325+
ASSERT_EQ(outputs.begin()->first, "output");
326+
327+
auto output_prim = outputs.begin()->second.get_memory();
328+
cldnn::mem_lock<ov::float16> output_ptr (output_prim, get_test_stream());
329+
ASSERT_EQ(2, output_ptr[0]);
330+
}
331+
295332
TEST(fully_connected_gpu, xb_f32_batch_1) {
296333
// Input : 3x1
297334
// Output : 4x1

0 commit comments

Comments
 (0)