Skip to content

Commit d081f19

Browse files
[GPU] Add NMS_Gather ops
1 parent 60ce288 commit d081f19

File tree

10 files changed

+249
-2
lines changed

10 files changed

+249
-2
lines changed

src/plugins/intel_gpu/include/intel_gpu/primitives/non_max_suppression.hpp

+24
Original file line numberDiff line numberDiff line change
@@ -156,4 +156,28 @@ struct non_max_suppression : public primitive_base<non_max_suppression> {
156156
ib >> make_data(&rotation, sizeof(rotation));
157157
}
158158
};
159+
160+
/// @brief non_max_suppression_gather primitive description.
/// @details The primitive has no attributes of its own: only the id and the
/// inputs are forwarded to primitive_base, so hashing and comparison rely on
/// the common primitive data alone.
struct non_max_suppression_gather : primitive_base<non_max_suppression_gather> {
    CLDNN_DECLARE_PRIMITIVE(non_max_suppression_gather)

    /// @brief Constructs non_max_suppression_gather primitive.
    /// @param id This primitive id.
    /// @param inputs Input primitives ids.
    non_max_suppression_gather(const primitive_id& id,
                               const std::vector<input_info>& inputs)
        : primitive_base(id, inputs, {padding()}, {optional_data_type()}) {}

    /// @brief Returns the base primitive hash; there are no extra fields to mix in.
    size_t hash() const override {
        return primitive::hash();
    }

    /// @brief Two primitives compare equal when their common parameters match.
    bool operator==(const primitive& rhs) const override {
        return compare_common_params(rhs);
    }
};
159183
} // namespace cldnn

src/plugins/intel_gpu/src/graph/impls/cpu/non_max_suppression.cpp

+106
Original file line numberDiff line numberDiff line change
@@ -442,6 +442,112 @@ attach_non_max_suppression_impl::attach_non_max_suppression_impl() {
442442
}
443443

444444
} // namespace detail
445+
446+
namespace {
447+
448+
/// @brief Copies the leading valid rows out of an NMS result buffer.
/// @details The memory is read as int32 rows of three values each; the number
/// of rows is the batch dimension of the memory layout. The first row whose
/// leading value is -1 terminates the valid data; if no such row exists, all
/// rows are considered valid.
/// @param stream Stream used to lock the memory for reading.
/// @param mem Memory holding the rows to scan.
/// @return Flat vector with three values per valid row, in original order.
std::vector<int32_t> get_nms_gather_input(stream& stream, memory::ptr mem) {
    constexpr size_t row_size = 3;
    const auto row_count = static_cast<size_t>(mem->get_layout().batch());

    mem_lock<int32_t, mem_lock_type::read> dep_mem_lock(mem, stream);
    const auto* rows = dep_mem_lock.data();

    // Count rows up to (not including) the first one that starts with -1.
    size_t valid_rows = 0;
    while (valid_rows < row_count && rows[valid_rows * row_size] != -1) {
        ++valid_rows;
    }

    // One allocation and a bulk copy instead of growing through push_back.
    // The return value is built before the lock is released.
    return std::vector<int32_t>(rows, rows + valid_rows * row_size);
}
475+
476+
void store_nms_gather_output(stream& stream, memory::ptr mem, std::vector<int32_t> valid_input) {
477+
auto valid_input_size = valid_input.size() / 3;
478+
479+
mem_lock<int32_t, mem_lock_type::write> lock(mem, stream);
480+
auto ptr = lock.data();
481+
482+
auto output_batch = static_cast<size_t>(mem->get_layout().batch());
483+
for (size_t si = 0; si < std::min(valid_input_size, output_batch); ++si) {
484+
auto offset = si * 3;
485+
ptr[offset + 0] = static_cast<int32_t>(valid_input[offset + 0]);
486+
ptr[offset + 1] = static_cast<int32_t>(valid_input[offset + 1]);
487+
ptr[offset + 2] = static_cast<int32_t>(valid_input[offset + 2]);
488+
}
489+
}
490+
491+
/// @brief Runs the gather step for one primitive instance.
/// @details Reads the valid rows from dependency 0 and stores them into the
/// instance's output memory, both on the network stream.
void run_nms_gather(non_max_suppression_gather_inst& instance) {
    auto& net_stream = instance.get_network().get_stream();

    const auto gathered_rows = get_nms_gather_input(net_stream, instance.dep_memory_ptr(0));
    store_nms_gather_output(net_stream, instance.output_memory_ptr(), gathered_rows);
}
497+
}
498+
/// @brief CPU implementation of the non_max_suppression_gather primitive.
/// @details The work is done on the host by run_nms_gather(); there are no
/// kernels to build, so init_kernels() is a no-op.
struct non_max_suppression_gather_impl : typed_primitive_impl<non_max_suppression_gather> {
    using parent = typed_primitive_impl<non_max_suppression_gather>;

    DECLARE_OBJECT_TYPE_SERIALIZATION(cldnn::cpu::non_max_suppression_gather_impl)

    std::unique_ptr<primitive_impl> clone() const override {
        return make_unique<non_max_suppression_gather_impl>(*this);
    }

    non_max_suppression_gather_impl() : parent("non_max_suppression_gather_impl") {}

    /// @brief Executes the gather on the host.
    /// @param events Events this execution depends on.
    /// @param instance Primitive instance providing the input/output memory.
    /// @return When events are passed through: the single input event, or a
    /// grouped event if there are several. Otherwise (including the
    /// pass-through case with no input events) a user event created with
    /// `true`.
    event::ptr execute_impl(const std::vector<event::ptr>& events, typed_primitive_inst<non_max_suppression_gather>& instance) override {
        auto& stream = instance.get_network().get_stream();

        // On an out-of-order queue inside a shape_of subgraph the input events
        // are not waited on here; they are handed back to the caller instead.
        const bool pass_through_events = (stream.get_queue_type() == QueueTypes::out_of_order) && instance.get_node().is_in_shape_of_subgraph();

        if (!pass_through_events) {
            // Iterate by reference: no need to copy each event handle.
            for (const auto& e : events) {
                e->wait();
            }
        }

        run_nms_gather(instance);

        if (pass_through_events) {
            if (events.size() > 1) {
                return stream.group_events(events);
            } else if (events.size() == 1) {
                return events[0];
            }
        }

        return stream.create_user_event(true);
    }

    /// @brief Factory used by the implementation map; the node and parameters are not needed.
    static std::unique_ptr<primitive_impl> create(const non_max_suppression_gather_node&, const kernel_impl_params&) {
        return make_unique<non_max_suppression_gather_impl>();
    }

    /// @brief Nothing to initialize: this implementation has no kernels.
    void init_kernels(const kernels_cache&, const kernel_impl_params&) override {}
};
538+
539+
namespace detail {
540+
541+
/// @brief Registers the CPU implementation of non_max_suppression_gather
/// for the bfyx format with i32, f16 and f32 data types.
attach_non_max_suppression_gather_impl::attach_non_max_suppression_gather_impl() {
    implementation_map<non_max_suppression_gather>::add(
        impl_types::cpu,
        non_max_suppression_gather_impl::create,
        {
            std::make_tuple(data_types::i32, format::bfyx),
            std::make_tuple(data_types::f16, format::bfyx),
            std::make_tuple(data_types::f32, format::bfyx),
        });
}
548+
549+
} // namespace detail
550+
445551
} // namespace cpu
446552
} // namespace cldnn
447553

src/plugins/intel_gpu/src/graph/impls/cpu/register.cpp

+1
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ void register_implementations() {
1616
REGISTER_CPU(proposal);
1717
REGISTER_CPU(read_value);
1818
REGISTER_CPU(non_max_suppression);
19+
REGISTER_CPU(non_max_suppression_gather);
1920
REGISTER_CPU(shape_of);
2021
REGISTER_CPU(concatenation);
2122
REGISTER_CPU(gather);

src/plugins/intel_gpu/src/graph/impls/cpu/register.hpp

+1
Original file line numberDiff line numberDiff line change
@@ -39,6 +39,7 @@ REGISTER_CPU(assign);
3939
REGISTER_CPU(proposal);
4040
REGISTER_CPU(read_value);
4141
REGISTER_CPU(non_max_suppression);
42+
REGISTER_CPU(non_max_suppression_gather);
4243
REGISTER_CPU(detection_output);
4344
REGISTER_CPU(shape_of);
4445
REGISTER_CPU(concatenation);

src/plugins/intel_gpu/src/graph/include/non_max_suppression_inst.h

+30
Original file line numberDiff line numberDiff line change
@@ -186,4 +186,34 @@ class typed_primitive_inst<non_max_suppression> : public typed_primitive_inst_ba
186186

187187
using non_max_suppression_inst = typed_primitive_inst<non_max_suppression>;
188188

189+
template <>
190+
struct typed_program_node<non_max_suppression_gather> : typed_program_node_base<non_max_suppression_gather> {
191+
using parent = typed_program_node_base<non_max_suppression_gather>;
192+
using parent::parent;
193+
194+
bool generates_dynamic_output() const override {
195+
return true;
196+
}
197+
198+
std::vector<size_t> get_shape_infer_dependencies() const override {
199+
return {0};
200+
}
201+
};
202+
203+
using non_max_suppression_gather_node = typed_program_node<non_max_suppression_gather>;
204+
205+
template <>
206+
class typed_primitive_inst<non_max_suppression_gather> : public typed_primitive_inst_base<non_max_suppression_gather> {
207+
public:
208+
using parent = typed_primitive_inst_base<non_max_suppression_gather>;
209+
using parent::parent;
210+
211+
static layout calc_output_layout(const non_max_suppression_gather_node& node, const kernel_impl_params& impl_param);
212+
template <typename ShapeType>
213+
static std::vector<layout> calc_output_layouts(const non_max_suppression_gather_node& node, const kernel_impl_params& impl_param);
214+
static std::string to_string(const non_max_suppression_gather_node& node);
215+
};
216+
217+
using non_max_suppression_gather_inst = typed_primitive_inst<non_max_suppression_gather>;
218+
189219
} // namespace cldnn

src/plugins/intel_gpu/src/graph/layout_optimizer.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -1583,6 +1583,8 @@ impl_types layout_optimizer::get_preferred_impl_type(program_node& node, format
15831583
}
15841584
}
15851585
}
1586+
} else if (node.is_type<non_max_suppression_gather>()) {
1587+
return impl_types::cpu;
15861588
} else if (node.is_type<reorder>()) {
15871589
if (!_optimization_attributes.use_onednn_impls)
15881590
return impl_types::ocl;

src/plugins/intel_gpu/src/graph/non_max_suppression.cpp

+72
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,10 @@
99
#include "nms_shape_inference.hpp"
1010

1111
namespace cldnn {
12+
13+
// -----------------------------------------------
14+
// non_max_suppression
15+
// -----------------------------------------------
1216
GPU_DEFINE_PRIMITIVE_TYPE_ID(non_max_suppression)
1317

1418
layout non_max_suppression_inst::calc_output_layout(non_max_suppression_node const& node, kernel_impl_params const& impl_param) {
@@ -83,4 +87,72 @@ std::string non_max_suppression_inst::to_string(non_max_suppression_node const&
8387
return description.str();
8488
}
8589

90+
// -----------------------------------------------
91+
// non_max_suppression_gather
92+
// -----------------------------------------------
93+
GPU_DEFINE_PRIMITIVE_TYPE_ID(non_max_suppression_gather)
94+
95+
/// @brief Not supported for this primitive: always throws.
/// @details Callers are expected to use calc_output_layouts() instead, as the
/// thrown message states; both parameters are ignored.
layout non_max_suppression_gather_inst::calc_output_layout(non_max_suppression_gather_node const& /*node*/,
                                                           kernel_impl_params const& /*impl_param*/) {
    OPENVINO_THROW("Only calc_output_layouts should be used!");
}
98+
99+
template<typename ShapeType>
100+
std::vector<layout> non_max_suppression_gather_inst::calc_output_layouts(non_max_suppression_gather_node const& node, const kernel_impl_params& impl_param) {
101+
std::vector<layout> layouts;
102+
103+
auto desc = impl_param.typed_desc<non_max_suppression_gather>();
104+
const auto input0_layout = node.get_dependency(0).get_output_layout(0); // impl_param.get_input_layout(0);
105+
const auto input1_layout = node.get_dependency(0).get_output_layout(1); // impl_param.get_input_layout(1);
106+
107+
std::vector<ShapeType> output_shapes = { ShapeType{}, ShapeType{}, ShapeType{} };
108+
109+
auto& memory_deps = impl_param.memory_deps;
110+
if (memory_deps.count(0)) {
111+
auto actual_output = memory_deps.at(0);
112+
cldnn::mem_lock<int32_t, mem_lock_type::read> actual_output_lock(actual_output, impl_param.get_stream());
113+
114+
auto output_ps = actual_output->get_layout().get_partial_shape();
115+
auto b = output_ps[0].get_length();
116+
auto f = output_ps[1].get_length(); // should be 3
117+
118+
// find valid data size
119+
auto output_data = actual_output_lock.data();
120+
int64_t actual_valid_num = b;
121+
for (int64_t i = 0; i < b ; i += 1) {
122+
if (output_data[i * f] == -1) {
123+
actual_valid_num = i;
124+
break;
125+
}
126+
}
127+
128+
output_shapes[0] = output_shapes[1] = ShapeType{actual_valid_num, f};
129+
output_shapes[2] = ShapeType{1};
130+
} else {
131+
output_shapes[0] = output_shapes[1] = ShapeType{ov::Dimension::dynamic(), 3};
132+
output_shapes[2] = ShapeType{1};
133+
}
134+
135+
for (size_t i = 0; i < desc->num_outputs; ++i) {
136+
auto dt = desc->output_data_types[i].value_or(data_types::i32);
137+
layouts.push_back({output_shapes[i], dt, format::get_default_format(output_shapes[i].size())});
138+
}
139+
return layouts;
140+
}
141+
142+
template std::vector<layout> non_max_suppression_gather_inst::calc_output_layouts<ov::PartialShape>(non_max_suppression_gather_node const& node,
143+
const kernel_impl_params& impl_param);
144+
145+
/// @brief Builds a textual description of the node.
/// @details Dumps the node's JSON description with an empty
/// "non max suppression gather info" section attached, since the primitive
/// has no parameters of its own to report.
/// @param node Node to describe.
/// @return The dumped description.
std::string non_max_suppression_gather_inst::to_string(non_max_suppression_gather_node const& node) {
    auto node_info = node.desc_to_json();

    json_composite info;
    node_info->add("non max suppression gather info", info);

    std::stringstream description;
    node_info->dump(description);
    return description.str();
}
157+
86158
} // namespace cldnn

src/plugins/intel_gpu/src/graph/program.cpp

+2
Original file line numberDiff line numberDiff line change
@@ -1494,6 +1494,7 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
14941494
prim.type() != cldnn::broadcast::type_id() &&
14951495
prim.type() != cldnn::ctc_loss::type_id() &&
14961496
prim.type() != cldnn::non_max_suppression::type_id() &&
1497+
prim.type() != cldnn::non_max_suppression_gather::type_id() &&
14971498
prim.type() != cldnn::roi_align::type_id() &&
14981499
prim.type() != cldnn::matrix_nms::type_id() &&
14991500
prim.type() != cldnn::adaptive_pooling::type_id() &&
@@ -1546,6 +1547,7 @@ void program::set_layout_optimizer_attributes(layout_optimizer& lo) {
15461547
prim.type() != cldnn::quantize::type_id() &&
15471548
prim.type() != cldnn::ctc_loss::type_id() &&
15481549
prim.type() != cldnn::non_max_suppression::type_id() &&
1550+
prim.type() != cldnn::non_max_suppression_gather::type_id() &&
15491551
prim.type() != cldnn::roi_align::type_id() &&
15501552
prim.type() != cldnn::matrix_nms::type_id() &&
15511553
prim.type() != cldnn::adaptive_pooling::type_id() &&

src/plugins/intel_gpu/src/kernel_selector/common_types.h

+1
Original file line numberDiff line numberDiff line change
@@ -76,6 +76,7 @@ enum class KernelType {
7676
EXTRACT_IMAGE_PATCHES,
7777
LOOP,
7878
NON_MAX_SUPPRESSION,
79+
NON_MAX_SUPPRESSION_GATHER,
7980
DETECTION_OUTPUT,
8081
EXPERIMENTAL_DETECTRON_DETECTION_OUTPUT,
8182
EXPERIMENTAL_DETECTRON_GENERATE_PROPOSALS_SINGLE_IMAGE,

src/plugins/intel_gpu/src/plugin/ops/non_max_suppression.cpp

+10-2
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,9 @@ static void CreateNonMaxSuppressionIEInternalOp(ProgramBuilder& p, const std::sh
5454
auto boxesShape = op->get_input_partial_shape(0);
5555
size_t num_outputs = op->get_output_size();
5656
if (p.use_new_shape_infer()) {
57-
auto nonMaxSuppressionLayerName = layer_type_name_ID(op);
57+
auto NMSLayerName = layer_type_name_ID(op);
5858
auto prim = cldnn::non_max_suppression(
59-
nonMaxSuppressionLayerName,
59+
NMSLayerName,
6060
reordered_inputs[0],
6161
reordered_inputs[1],
6262
0,
@@ -78,6 +78,14 @@ static void CreateNonMaxSuppressionIEInternalOp(ProgramBuilder& p, const std::sh
7878
}
7979

8080
p.add_primitive(*op, prim);
81+
82+
auto NMSGatherLayerName = layer_type_name_ID(op) + "_NMSGather";
83+
auto nms_gather_prim = cldnn::non_max_suppression_gather(
84+
NMSGatherLayerName,
85+
{NMSLayerName}
86+
);
87+
88+
p.add_primitive(*op, nms_gather_prim);
8189
} else {
8290
auto outputIndices = op->get_output_partial_shape(0)[0].get_length();
8391

0 commit comments

Comments
 (0)