Commit da29cc6

[GPU] Add NMS_Gather ops
1 parent 77c6ade commit da29cc6

12 files changed (+347, -2 lines)

src/plugins/intel_gpu/include/intel_gpu/primitives/non_max_suppression.hpp (+25)

@@ -156,4 +156,29 @@ struct non_max_suppression : public primitive_base<non_max_suppression> {
         ib >> make_data(&rotation, sizeof(rotation));
     }
 };
+
+struct non_max_suppression_gather : primitive_base<non_max_suppression_gather> {
+    CLDNN_DECLARE_PRIMITIVE(non_max_suppression_gather)
+
+    /// @brief Constructs non_max_suppression_gather primitive.
+    /// @param id This primitive id.
+    /// @param inputs Input primitives ids.
+    non_max_suppression_gather(const primitive_id& id,
+                               const std::vector<input_info>& inputs,
+                               const size_t num_outputs = 1)
+        : primitive_base(id, inputs, {padding()}, {optional_data_type()}, num_outputs) {}
+
+    size_t hash() const override {
+        size_t seed = primitive::hash();
+        return seed;
+    }
+
+    bool operator==(const primitive& rhs) const override {
+        if (!compare_common_params(rhs)) {
+            return false;
+        }
+
+        return true;
+    }
+};
 } // namespace cldnn
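
Note: the new primitive is deliberately thin. It records only its input ports and the requested number of outputs, so hash() and operator== fall back to the common primitive parameters. A minimal construction sketch (illustrative only; the "nms" and "nms_gather" ids are hypothetical, and in practice the plugin builds this primitive itself, see ops/non_max_suppression.cpp further down):

    // Hypothetical usage sketch, assuming the intel_gpu primitive headers are on the include path.
    #include "intel_gpu/primitives/non_max_suppression.hpp"

    cldnn::non_max_suppression_gather make_gather_example() {
        return cldnn::non_max_suppression_gather(
            "nms_gather",                      // id of the new primitive
            {cldnn::input_info("nms", 0),      // selected indices
             cldnn::input_info("nms", 1),      // selected scores
             cldnn::input_info("nms", 2)},     // valid outputs count
            3 /*num_outputs*/);
    }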

src/plugins/intel_gpu/src/graph/graph_optimizer/mark_runtime_skippable_nodes.cpp (+1)

@@ -4,6 +4,7 @@

 #include "pass_manager.h"
 #include "gather_inst.h"
+#include "non_max_suppression_inst.h"
 #include "permute_inst.h"
 #include "strided_slice_inst.h"
 #include "kv_cache_inst.h"

src/plugins/intel_gpu/src/graph/impls/cpu/non_max_suppression.cpp (+52)

@@ -440,6 +440,58 @@ attach_non_max_suppression_impl::attach_non_max_suppression_impl() {
 }

 } // namespace detail
+
+struct non_max_suppression_gather_impl : typed_primitive_impl<non_max_suppression_gather> {
+    using parent = typed_primitive_impl<non_max_suppression_gather>;
+
+    DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::cpu::non_max_suppression_gather_impl)
+
+    std::unique_ptr<primitive_impl> clone() const override {
+        return make_unique<non_max_suppression_gather_impl>(*this);
+    }
+
+    non_max_suppression_gather_impl() : parent("non_max_suppression_gather_impl") {}
+
+    event::ptr execute_impl(const std::vector<event::ptr>& events, typed_primitive_inst<non_max_suppression_gather>& instance) override {
+        auto& stream = instance.get_network().get_stream();
+
+        const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.get_node().is_in_shape_of_subgraph();
+
+        if (!pass_through_events) {
+            for (auto e : events) {
+                e->wait();
+            }
+        }
+
+        if (pass_through_events) {
+            if (events.size() > 1) {
+                return stream.group_events(events);
+            } else if (events.size() == 1) {
+                return events[0];
+            }
+        }
+
+        return stream.create_user_event(true);
+    }
+
+    static std::unique_ptr<primitive_impl> create(const non_max_suppression_gather_node&, const kernel_impl_params&) {
+        return make_unique<non_max_suppression_gather_impl>();
+    }
+    void init_kernels(const kernels_cache&, const kernel_impl_params&) override {}
+};
+
+namespace detail {
+
+attach_non_max_suppression_gather_impl::attach_non_max_suppression_gather_impl() {
+    implementation_map<non_max_suppression_gather>::add(impl_types::cpu, non_max_suppression_gather_impl::create, {
+        std::make_tuple(data_types::i32, format::bfyx),
+        std::make_tuple(data_types::f16, format::bfyx),
+        std::make_tuple(data_types::f32, format::bfyx),
+    });
+}
+
+} // namespace detail
+
 } // namespace cpu
 } // namespace cldnn
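
This CPU "implementation" performs no gathering itself: the node's outputs are later aliased to its inputs (see update_output_memory in graph/non_max_suppression.cpp below), so execute_impl only resolves events. For an out-of-order queue inside a shape_of subgraph the incoming events are passed through; otherwise they are waited on and a completed user event is returned. A distilled, standalone sketch of that event policy (hypothetical Event type, not the cldnn API):

    #include <memory>
    #include <vector>

    // Hypothetical stand-ins for cldnn events, for illustration only.
    struct Event {
        bool signalled = false;
        void wait() const { /* block until signalled in a real queue */ }
    };
    using EventPtr = std::shared_ptr<Event>;

    EventPtr resolve_dependencies(const std::vector<EventPtr>& deps, bool pass_through) {
        if (!pass_through) {
            // Default path: block on every dependency, then return an
            // already-signalled event so consumers never wait again.
            for (const auto& e : deps) e->wait();
            auto done = std::make_shared<Event>();
            done->signalled = true;
            return done;
        }
        // Pass-through path: forward the dependencies instead of waiting
        // (the real impl merges several events via stream.group_events()).
        if (!deps.empty()) return deps.front();
        auto done = std::make_shared<Event>();
        done->signalled = true;
        return done;
    }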

src/plugins/intel_gpu/src/graph/impls/cpu/register.cpp (+1)

@@ -16,6 +16,7 @@ void register_implementations() {
     REGISTER_CPU(proposal);
     REGISTER_CPU(read_value);
     REGISTER_CPU(non_max_suppression);
+    REGISTER_CPU(non_max_suppression_gather);
     REGISTER_CPU(shape_of);
     REGISTER_CPU(concatenation);
     REGISTER_CPU(gather);

src/plugins/intel_gpu/src/graph/impls/cpu/register.hpp (+1)

@@ -40,6 +40,7 @@ REGISTER_CPU(assign);
 REGISTER_CPU(proposal);
 REGISTER_CPU(read_value);
 REGISTER_CPU(non_max_suppression);
+REGISTER_CPU(non_max_suppression_gather);
 REGISTER_CPU(detection_output);
 REGISTER_CPU(shape_of);
 REGISTER_CPU(concatenation);

src/plugins/intel_gpu/src/graph/include/non_max_suppression_inst.h (+40)

@@ -186,4 +186,44 @@ class typed_primitive_inst<non_max_suppression> : public typed_primitive_inst_ba

 using non_max_suppression_inst = typed_primitive_inst<non_max_suppression>;

+template <>
+struct typed_program_node<non_max_suppression_gather> : typed_program_node_base<non_max_suppression_gather> {
+    using parent = typed_program_node_base<non_max_suppression_gather>;
+    using parent::parent;
+
+public:
+    typed_program_node(const std::shared_ptr<non_max_suppression_gather> prim, program& prog) : parent(prim, prog) {
+        can_be_optimized(true);
+        set_runtime_skippable(true);
+    }
+
+    bool generates_dynamic_output() const override {
+        return true;
+    }
+
+    std::vector<size_t> get_shape_infer_dependencies() const override { return {0, 1, 2}; }
+};
+
+using non_max_suppression_gather_node = typed_program_node<non_max_suppression_gather>;
+
+template <>
+class typed_primitive_inst<non_max_suppression_gather> : public typed_primitive_inst_base<non_max_suppression_gather> {
+public:
+    using parent = typed_primitive_inst_base<non_max_suppression_gather>;
+    using parent::parent;
+
+    static layout calc_output_layout(const non_max_suppression_gather_node& node, const kernel_impl_params& impl_param);
+    template <typename ShapeType>
+    static std::vector<layout> calc_output_layouts(const non_max_suppression_gather_node& node, const kernel_impl_params& impl_param);
+    static std::string to_string(const non_max_suppression_gather_node& node);
+
+    typed_primitive_inst(network& network, non_max_suppression_gather_node const& node);
+    void update_output_memory() override;
+
+private:
+    void on_execute() override;
+};
+
+using non_max_suppression_gather_inst = typed_primitive_inst<non_max_suppression_gather>;
+
 } // namespace cldnn

src/plugins/intel_gpu/src/graph/layout_optimizer.cpp (+2)

@@ -1568,6 +1568,8 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
             }
         }
     }
+    } else if (node.is_type<non_max_suppression_gather>()) {
+        return impl_types::cpu;
     } else if (node.is_type<reorder>()) {
         if (!_optimization_attributes.use_onednn_impls)
             return impl_types::ocl;

src/plugins/intel_gpu/src/graph/non_max_suppression.cpp (+79)

@@ -11,6 +11,10 @@
 #include "nms_shape_inference.hpp"

 namespace cldnn {
+
+// -----------------------------------------------
+// non_max_suppression
+// -----------------------------------------------
 GPU_DEFINE_PRIMITIVE_TYPE_ID(non_max_suppression)

 layout non_max_suppression_inst::calc_output_layout(non_max_suppression_node const& node, kernel_impl_params const& impl_param) {
@@ -81,4 +85,79 @@ std::string non_max_suppression_inst::to_string(non_max_suppression_node const&
     return description.str();
 }

+// -----------------------------------------------
+// non_max_suppression_gather
+// -----------------------------------------------
+GPU_DEFINE_PRIMITIVE_TYPE_ID(non_max_suppression_gather)
+
+layout non_max_suppression_gather_inst::calc_output_layout(non_max_suppression_gather_node const& node, kernel_impl_params const& impl_param) {
+    OPENVINO_THROW("Only calc_output_layouts should be used!");
+}
+
+template<typename ShapeType>
+std::vector<layout> non_max_suppression_gather_inst::calc_output_layouts(non_max_suppression_gather_node const& /*node*/,
+                                                                         const kernel_impl_params& impl_param) {
+    std::vector<layout> layouts;
+
+    auto desc = impl_param.typed_desc<non_max_suppression_gather>();
+    std::vector<ShapeType> output_shapes = { ShapeType{}, ShapeType{}, ShapeType{} };
+
+    auto& memory_deps = impl_param.memory_deps;
+    if (memory_deps.count(2)) {
+        auto third_output = memory_deps.at(2);
+        cldnn::mem_lock<int32_t, mem_lock_type::read> third_output_lock(third_output, impl_param.get_stream());
+        auto third_output_data = third_output_lock.data();
+
+        output_shapes[0] = ShapeType{third_output_data[0], 3};
+    } else {
+        output_shapes[0] = ShapeType{ov::Dimension::dynamic(), 3};
+    }
+    output_shapes[1] = output_shapes[0];
+    output_shapes[2] = ShapeType{1};
+
+    for (size_t i = 0; i < desc->num_outputs; ++i) {
+        layouts.push_back({output_shapes[i],
+                           impl_param.get_input_layout(i).data_type,
+                           format::get_default_format(output_shapes[i].size())});
+    }
+    return layouts;
+}
+
+template std::vector<layout> non_max_suppression_gather_inst::calc_output_layouts<ov::PartialShape>(non_max_suppression_gather_node const& node,
+                                                                                                    const kernel_impl_params& impl_param);
+
+std::string non_max_suppression_gather_inst::to_string(non_max_suppression_gather_node const& node) {
+    auto desc = node.get_primitive();
+    auto node_info = node.desc_to_json();
+
+    json_composite info;
+
+    node_info->add("non max suppression gather info", info);
+
+    std::stringstream description;
+    node_info->dump(description);
+    return description.str();
+}
+
+void non_max_suppression_gather_inst::on_execute() {
+    update_output_memory();
+}
+
+void non_max_suppression_gather_inst::update_output_memory() {
+    if (!can_be_optimized())
+        return;
+
+    for (size_t i = 0; i < inputs_memory_count(); i++) {
+        if (node->get_program().is_new_shape_infer() && input_memory_ptr(i) == nullptr)
+            return;
+
+        if (output_memory_ptr(i) != nullptr && _network.get_engine().is_the_same_buffer(output_memory(i), input_memory(i)))
+            return;
+
+        _outputs[i] = {_network.get_engine().reinterpret_buffer(input_memory(i), _impl_params->get_output_layout(i))};
+    }
+}
+
+non_max_suppression_gather_inst::typed_primitive_inst(network& network, non_max_suppression_gather_node const& node) : parent(network, node) {}
+
 } // namespace cldnn
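
The shape inference above reads the gather's third input (the NMS "valid outputs" count) when it is available as a memory dependency, and otherwise leaves the first dimension dynamic; for example, if NMS reports 5 valid boxes, the gather outputs get shapes {5, 3}, {5, 3} and {1}. A small self-contained sketch of that rule using ov::PartialShape (only the OpenVINO core headers are assumed; valid_outputs stands in for the value read from the locked third input):

    #include <cstdint>
    #include <vector>
    #include "openvino/core/partial_shape.hpp"

    // Illustrative only. valid_outputs < 0 models the case where the
    // NMS "valid outputs" value is not yet known at shape-inference time.
    std::vector<ov::PartialShape> nms_gather_shapes(int64_t valid_outputs) {
        ov::PartialShape selected = (valid_outputs >= 0)
            ? ov::PartialShape{valid_outputs, 3}
            : ov::PartialShape{ov::Dimension::dynamic(), 3};
        // outputs: selected indices, selected scores, valid-outputs count
        return {selected, selected, ov::PartialShape{1}};
    }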

src/plugins/intel_gpu/src/graph/program.cpp (+2)

@@ -1496,6 +1496,7 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
                       prim.type() != cldnn::broadcast::type_id() &&
                       prim.type() != cldnn::ctc_loss::type_id() &&
                       prim.type() != cldnn::non_max_suppression::type_id() &&
+                      prim.type() != cldnn::non_max_suppression_gather::type_id() &&
                       prim.type() != cldnn::roi_align::type_id() &&
                       prim.type() != cldnn::matrix_nms::type_id() &&
                       prim.type() != cldnn::adaptive_pooling::type_id() &&
@@ -1548,6 +1549,7 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
                       prim.type() != cldnn::quantize::type_id() &&
                       prim.type() != cldnn::ctc_loss::type_id() &&
                       prim.type() != cldnn::non_max_suppression::type_id() &&
+                      prim.type() != cldnn::non_max_suppression_gather::type_id() &&
                       prim.type() != cldnn::roi_align::type_id() &&
                       prim.type() != cldnn::matrix_nms::type_id() &&
                       prim.type() != cldnn::adaptive_pooling::type_id() &&

src/plugins/intel_gpu/src/kernel_selector/common_types.h (+1)

@@ -77,6 +77,7 @@ enum class KernelType {
    EXTRACT_IMAGE_PATCHES,
    LOOP,
    NON_MAX_SUPPRESSION,
+   NON_MAX_SUPPRESSION_GATHER,
    DETECTION_OUTPUT,
    EXPERIMENTAL_DETECTRON_DETECTION_OUTPUT,
    EXPERIMENTAL_DETECTRON_GENERATE_PROPOSALS_SINGLE_IMAGE,

src/plugins/intel_gpu/src/plugin/ops/non_max_suppression.cpp (+20, -2)

@@ -54,9 +54,9 @@ static void CreateNonMaxSuppressionIEInternalOp(ProgramBuilder& p, const std::sh
     auto boxesShape = op->get_input_partial_shape(0);
     size_t num_outputs = op->get_output_size();
     if (p.use_new_shape_infer()) {
-        auto nonMaxSuppressionLayerName = layer_type_name_ID(op);
+        auto NMSLayerName = layer_type_name_ID(op);
         auto prim = cldnn::non_max_suppression(
-            nonMaxSuppressionLayerName,
+            NMSLayerName,
             reordered_inputs[0],
             reordered_inputs[1],
             0,
@@ -78,6 +78,24 @@
         }

         p.add_primitive(*op, prim);
+
+        auto NMSGatherLayerName = layer_type_name_ID(op) + "_NMSGather";
+        std::vector<cldnn::input_info> nms_gather_inputs;
+        const std::vector<cldnn::input_info> nms_gather_input_list = {
+            cldnn::input_info(NMSLayerName, 0),
+            cldnn::input_info(NMSLayerName, 1),
+            cldnn::input_info(NMSLayerName, 2)
+        };
+        for (size_t i = 0; i < num_outputs; i++) {
+            nms_gather_inputs.push_back(nms_gather_input_list[i]);
+        }
+
+        auto nms_gather_prim = cldnn::non_max_suppression_gather(
+            NMSGatherLayerName,
+            nms_gather_inputs,
+            num_outputs);
+
+        p.add_primitive(*op, nms_gather_prim);
     } else {
         auto outputIndices = op->get_output_partial_shape(0)[0].get_length();
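
Net effect of this hunk: on the new shape-infer path every NonMaxSuppressionIEInternal op is lowered to a pair of primitives, the NMS itself plus a trailing NMS_Gather whose i-th input is NMS output port i (only the first num_outputs ports are wired). A hedged topology-level sketch of the resulting wiring (the ids and the hand-built topology are illustrative, not the plugin's exact code):

    // Illustrative wiring only, mirroring what CreateNonMaxSuppressionIEInternalOp emits.
    // Assumes the intel_gpu graph/primitive headers are available; "nms" is a
    // hypothetical id of an already-added non_max_suppression primitive.
    #include "intel_gpu/graph/topology.hpp"
    #include "intel_gpu/primitives/non_max_suppression.hpp"

    void wire_nms_gather(cldnn::topology& topology, size_t num_outputs) {
        std::vector<cldnn::input_info> gather_inputs;
        for (size_t i = 0; i < num_outputs; i++) {
            // Output port i of the NMS primitive feeds input i of the gather.
            gather_inputs.emplace_back("nms", static_cast<int32_t>(i));
        }
        topology.add(cldnn::non_max_suppression_gather("nms_NMSGather", gather_inputs, num_outputs));
    }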
83101
