Skip to content

Commit 2111e3e

Browse files
[GPU] Add remove reorder pattern i32 to f16
1 parent ec25616 commit 2111e3e

File tree

2 files changed

+86
-0
lines changed

2 files changed

+86
-0
lines changed

src/plugins/intel_gpu/src/graph/graph_optimizer/remove_redundant_reorders.cpp

+49
Original file line numberDiff line numberDiff line change
@@ -696,6 +696,55 @@ void remove_redundant_reorders::run(program& p) {
696696
p.remove_if_dangling(*node);
697697
}
698698

699+
// Remove reorders that only perform a data type conversion
700+
// Target pattern: F32 -> I32 -> F16 reorder
701+
// F32 -> I32 -> F32 reorder should be done before here
702+
itr = p.get_processing_order().begin();
703+
while (itr != p.get_processing_order().end()) {
704+
auto& node = *itr++;
705+
if (!node->is_type<reorder>())
706+
continue;
707+
708+
auto& deps = node->get_dependencies();
709+
if (deps.size() != 1)
710+
continue;
711+
712+
auto& dep = *deps[0].first;
713+
if (dep.is_type<reorder>())
714+
continue;
715+
716+
bool allowed_dep_input_type = true;
717+
auto dep_input_layouts = dep.get_input_layouts();
718+
for (auto& l : dep_input_layouts) {
719+
if (l.data_type != data_types::f32) {
720+
allowed_dep_input_type = false;
721+
continue;
722+
}
723+
}
724+
725+
auto dep_output_layout = dep.get_output_layout();
726+
auto node_output_layout = node->get_output_layout();
727+
if (!(allowed_dep_input_type &&
728+
dep_output_layout.data_type == data_types::i32 &&
729+
node_output_layout.data_type == data_types::f16))
730+
continue;
731+
732+
// allow only data_type conversion
733+
auto validate_layout = node_output_layout;
734+
validate_layout.data_type = dep_output_layout.data_type;
735+
if (validate_layout != dep_output_layout)
736+
continue;
737+
738+
dep_output_layout.data_type = node_output_layout.data_type;
739+
dep.set_output_layout(dep_output_layout);
740+
741+
LOG_NODE_REMOVAL(node->id());
742+
p.replace_all_usages(*node, dep);
743+
p.add_optimized_primitive_info(node->id());
744+
p.remove_all_connections(*node);
745+
p.remove_if_dangling(*node);
746+
}
747+
699748
for (auto n : p.get_processing_order()) {
700749
if (n->is_in_data_flow() && n->is_type<reorder>()) {
701750
auto preferred_impl = lo.get_preferred_impl_type(*n, n->get_input_layout(0).format);

src/plugins/intel_gpu/tests/unit/test_cases/fully_connected_gpu_test.cpp

+37
Original file line numberDiff line numberDiff line change
@@ -316,6 +316,43 @@ TEST(fully_connected_gpu, no_biases_int8) {
316316
ASSERT_EQ(-52.0f, output_ptr[3]);
317317
}
318318

319+
// Builds a small graph: i32 input -> fully_connected (i32 weights, no bias) ->
// reorder to f16 -> floor activation, executes it with optimize_data enabled,
// and checks the single "output" result.
// NOTE(review): presumably exercises the reorder-removal pattern added to
// remove_redundant_reorders in the same commit — confirm that the pass
// actually fires for this topology.
TEST(fully_connected_gpu, no_biases_fc_i32_reorder_f16) {
320+
const int32_t input_f = 2, input_b = 1, // size of the whole input buffer
321+
weight_b = 1, weight_f = 2; // size of the whole weights buffer
322+
323+
auto& engine = get_test_engine();
324+
325+
// Both the input and the weights are allocated as i32 bfyx buffers.
auto input_prim = engine.allocate_memory({ data_types::i32, format::bfyx, { input_b, input_f, 1, 1 } });
326+
auto weights_prim = engine.allocate_memory({ data_types::i32, format::bfyx, { weight_b, weight_f, 1, 1 } });
327+
328+
// Input and weights are both filled with { 1, 1 }.
set_values<int32_t>(input_prim, { 1, 1 });
329+
set_values<int32_t>(weights_prim, { 1, 1 });
330+
331+
cldnn::topology topology{
332+
input_layout("input", input_prim->get_layout()),
333+
data("weights", weights_prim),
334+
fully_connected("fc_prim", input_info("input"), "weights"),
335+
// Reorder requests an f16 bfyx output for the fully_connected result.
reorder("reorder_to_f16", input_info("fc_prim"), { data_types::f16, format::bfyx, { input_b, weight_b, 1, 1 } }),
336+
// The floor activation consumes the reorder and is the node named "output".
activation("output", input_info("reorder_to_f16"), activation_func::floor)
337+
};
338+
339+
// Start from the default test config, then enable optimize_data and
// allow_new_shape_infer.
ExecutionConfig config = get_test_default_config(engine);
340+
config.set_property(ov::intel_gpu::optimize_data(true));
341+
config.set_property(ov::intel_gpu::allow_new_shape_infer(true));
342+
343+
cldnn::network network(engine, topology, config);
344+
345+
network.set_input_data("input", input_prim);
346+
347+
auto outputs = network.execute();
// Exactly one output is expected, and it must be the "output" primitive.
ASSERT_EQ(outputs.size(), size_t(1));
348+
ASSERT_EQ(outputs.begin()->first, "output");
349+
350+
351+
auto output_prim = outputs.begin()->second.get_memory();
352+
// The result buffer is read as ov::float16.
cldnn::mem_lock<ov::float16> output_ptr (output_prim, get_test_stream());
353+
// Expected value 2; presumably 1*1 + 1*1 from the all-ones input and weights.
ASSERT_EQ(2, output_ptr[0]);
354+
}
355+
319356
TEST(fully_connected_gpu, xb_f32_batch_1) {
320357
// Input : 3x1
321358
// Output : 4x1

0 commit comments

Comments
 (0)