@@ -586,19 +586,44 @@ event::ptr primitive_inst::realloc_if_needed() {
586
586
user_insts.size (), " and " , user_insts_origin.size ());
587
587
}
588
588
for (auto user : user_insts) {
589
+ auto is_fused_prim_of_user = [&](primitive_id id) -> bool {
590
+ for (auto & p : user->get_node ().get_fused_primitives ()) {
591
+ if (p.has_outer_dep ()) {
592
+ const auto start_idx = p.outer_dep_start_idx ;
593
+ // exclude fused_node from total_num_deps
594
+ const auto end_idx = p.outer_dep_start_idx + p.total_num_deps -1 ;
595
+ for (size_t idx = start_idx; idx < end_idx; idx++) {
596
+ if (user->get_node ().get_dependency (idx).id () == id) {
597
+ return true ;
598
+ }
599
+ }
600
+ }
601
+ }
602
+ return false ;
603
+ };
589
604
// Since fake alignment is applicable for input tensor as well, make sure we allocate enough memory
590
605
// to prevent reading beyond the allocated memory bounds
591
- if (user->get_node ().is_type <fully_connected>() && user->is_dynamic () && user->_deps [0 ].first == this ) {
592
- GPU_DEBUG_TRACE_DETAIL << " Check fc user " << user->id () << " 's fake alignment-ed input size" << std::endl;
593
- user->update_shape ();
594
- user->update_shape_done_by_other = true ;
595
-
596
- auto fc_impl_params = *user->_impl_params ;
597
- auto fc_input_layout = user->get_node ().type ()->get_fake_aligned_params (fc_impl_params).input_layouts [0 ];
598
- if (fc_input_layout.bytes_count () > updated_layout.bytes_count ()) {
599
- GPU_DEBUG_TRACE_DETAIL << id () << " : increase output layout allocation size from " << actual_layout.to_short_string () << " -> "
600
- << fc_input_layout.to_short_string () << " to meet the input buffer alignment requirements for FC\n " ;
601
- updated_layout = fc_input_layout;
606
+ if (user->get_node ().is_type <fully_connected>() && user->is_dynamic ()) {
607
+ if (user->_deps [0 ].first == this
608
+ || (is_fused_prim_of_user (id ()) && user->update_shape_done_by_other )) {
609
+ GPU_DEBUG_TRACE_DETAIL << " Check fc user " << user->id () << " 's fake alignment-ed input size" << std::endl;
610
+ // Setting update_shape_done_by_other to false before running update_shape,
611
+ // since update_shape() was already called in realloc_if_needed of the current node's dep node
612
+ // but the current node's output layout has not been propagated to this user node yet.
613
+ user->update_shape_done_by_other = false ;
614
+ bool prev_shape_changed = user->shape_changed ();
615
+ user->update_shape ();
616
+ // Restore the shape_change status if the shape was changed in the previous update_shape() for this user node.
617
+ if (prev_shape_changed)
618
+ user->set_shape_change ();
619
+ user->update_shape_done_by_other = true ;
620
+ auto fc_impl_params = *user->_impl_params ;
621
+ auto fc_input_layout = user->get_node ().type ()->get_fake_aligned_params (fc_impl_params).input_layouts [0 ];
622
+ if (fc_input_layout.bytes_count () > updated_layout.bytes_count ()) {
623
+ GPU_DEBUG_TRACE_DETAIL << id () << " : increase output layout allocation size from " << actual_layout.to_short_string () << " -> "
624
+ << fc_input_layout.to_short_string () << " to meet the input buffer alignment requirements for FC\n " ;
625
+ updated_layout = fc_input_layout;
626
+ }
602
627
}
603
628
}
604
629
}
0 commit comments