Commit e2390f6 (parent: dbef32e)

[GPU] Add weight size limit for conv opt kernels
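
Both optimized convolution kernels now refuse oversized weights in their Validate() checks, so the kernel selector falls back to another implementation instead of spending a long time compiling the OpenCL program. Below is a minimal standalone sketch of the two guards, using hypothetical helper names and constants copied from the diff (in the commit itself the checks live inside each kernel's Validate() method):

    #include <cstddef>
    #include <cstdio>

    // Heuristic limits taken from the commit below.
    constexpr std::size_t kMaxGemmLikeFilterX = 64;    // bfyx_gemm_like: max filter width
    constexpr std::size_t kMaxOsv16FilterArea = 1024;  // bfyx_os_iyx_osv16: max filter x*y area

    // Hypothetical helpers: return true when the optimized kernel should be skipped.
    bool reject_for_gemm_like(std::size_t filter_x) {
        return filter_x > kMaxGemmLikeFilterX;
    }

    bool reject_for_os_iyx_osv16(std::size_t filter_x, std::size_t filter_y) {
        return filter_x * filter_y > kMaxOsv16FilterArea;
    }

    int main() {
        // The new unit test uses a 65x65 filter, which trips both guards:
        // 65 > 64 and 65 * 65 = 4225 > 1024.
        std::printf("gemm_like rejected: %d\n", reject_for_gemm_like(65));
        std::printf("os_iyx_osv16 rejected: %d\n", reject_for_os_iyx_osv16(65, 65));
        return 0;
    }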

3 files changed (+91, -0 lines)

src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_bfyx_gemm_like.cpp (+6)
@@ -110,6 +110,12 @@ bool ConvolutionKernel_bfyx_GEMMLike::Validate(const Params& p) const {
         return false;
     }
 
+    // Reject overly wide filters, which cause very long CL build times.
+    const size_t acceptable_filter_x_size = 64;  // limit chosen heuristically
+    if (params.filterSize.x > acceptable_filter_x_size) {
+        return false;
+    }
+
     return true;
 }
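Note that returning false from Validate() is not an error: the kernel selector simply drops this GEMM-like kernel from the candidate list for that convolution and chooses another implementation, which is the behavior the new unit test below relies on.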

src/plugins/intel_gpu/src/kernel_selector/kernels/convolution/convolution_kernel_bfyx_os_iyx_osv16.cpp (+8)
@@ -205,6 +205,14 @@ bool ConvolutionKernel_bfyx_os_iyx_osv16::Validate(const Params& p) const {
         return false;
     }
 
+    // Reject overly large filters, which cause very long CL build times.
+    const size_t acceptable_filter_size = 1024;  // limit chosen heuristically
+    const auto& params = static_cast<const convolution_params&>(p);
+    auto filter_size = params.filterSize.x * params.filterSize.y;
+    if (filter_size > acceptable_filter_size) {
+        return false;
+    }
+
     return true;
 }
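Unlike the GEMM-like kernel, which only limits the filter width, this kernel limits the x*y area, so for example a 33x33 filter (area 1089 > 1024) would also be rejected even though each dimension is below 64.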

src/plugins/intel_gpu/tests/unit/test_cases/convolution_gpu_test.cpp (+77)
@@ -1638,6 +1638,83 @@ TEST(convolution_f32_fw_gpu, basic_convolution) {
     }
 }
 
+TEST(convolution_f32_fw_gpu, convolution_big_size_weights) {
+    auto& engine = get_test_engine();
+
+    const std::vector<int> filter_size_data = {
+        65, 65,
+    };
+
+    const std::vector<std::string> impl_kernel_data = {
+        "convolution_gpu_ref__f32"
+    };
+
+    for (size_t m = 0; m < filter_size_data.size() / 2; m++) {
+        const int in_y = filter_size_data[m * 2];
+        const int in_x = filter_size_data[m * 2 + 1];
+
+        auto input = engine.allocate_memory({ data_types::f32, format::yxfb, { 1, 1, in_y, in_x } });
+        auto weights = engine.allocate_memory({ data_types::f32, format::bfyx, { 1, 1, in_y, in_x } });
+        auto biases = engine.allocate_memory({ data_types::f32, format::bfyx, { 1, 1, 1, 1 } });
+
+        tests::random_generator rg(GET_SUITE_NAME);
+        VVVVF<float> input_rnd = rg.generate_random_4d<float>(1, 1, in_y, in_x, -10, 10);
+        VF<float> input_rnd_vec = flatten_4d<float>(format::yxfb, input_rnd);
+        VVVVF<float> filter_rnd = rg.generate_random_4d<float>(1, 1, in_y, in_x, -10, 10);
+        VF<float> filter_rnd_vec = flatten_4d<float>(format::bfyx, filter_rnd);
+
+        set_values(biases, { 0.0f });
+        set_values(input, input_rnd_vec);
+        set_values(weights, filter_rnd_vec);
+
+        float output_sum = 0.f;
+        size_t idx = 0;
+        for (int i = 0; i < in_y; i++) {
+            for (int k = 0; k < in_x; k++) {
+                idx = i * in_x + k;
+                output_sum += input_rnd_vec[idx] * filter_rnd_vec[idx];
+            }
+        }
+
+        topology topology(
+            input_layout("input", input->get_layout()),
+            data("weights", weights),
+            data("biases", biases),
+            convolution("conv", input_info("input"), "weights", "biases", 1, {1, 1}, {1, 1}, {0, 0}, {0, 0}, false));
+
+        ExecutionConfig config = get_test_default_config(engine);
+        config.set_property(ov::intel_gpu::optimize_data(true));
+
+        network network(engine, topology, config);
+
+        auto impl_info = network.get_implementation_info("conv");
+        ASSERT_EQ(impl_info, impl_kernel_data[m]);
+
+        network.set_input_data("input", input);
+
+        auto outputs = network.execute();
+        ASSERT_EQ(outputs.size(), size_t(1));
+        ASSERT_EQ(outputs.begin()->first, "conv");
+
+        auto output_memory = outputs.at("conv").get_memory();
+        auto output_layout = output_memory->get_layout();
+        cldnn::mem_lock<float> output_ptr(output_memory, get_test_stream());
+
+        int y_size = output_layout.spatial(1);
+        int x_size = output_layout.spatial(0);
+        int f_size = output_layout.feature();
+        int b_size = output_layout.batch();
+
+        ASSERT_EQ(y_size, 1);
+        ASSERT_EQ(x_size, 1);
+        ASSERT_EQ(f_size, 1);
+        ASSERT_EQ(b_size, 1);
+
+        ASSERT_EQ(output_sum, output_ptr[0]);
+    }
+
+}
+
 TEST(convolution_f32_fw_gpu, basic_convolution_bfyx_weights_as_input_layout) {
     //Same params as convolution_f32_fw_gpu, basic_convolution but with bfyx optimized data and weights set as input_layout
     auto& engine = get_test_engine();
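With a 65x65 filter both new guards apply (65 > 64 for the GEMM-like kernel, 65 * 65 = 4225 > 1024 for os_iyx_osv16), so the test expects the plain reference implementation, convolution_gpu_ref__f32, to be selected. Assuming the usual GPU unit-test binary (the name ov_gpu_unit_tests is an assumption here), the case can be run in isolation with a gtest filter such as --gtest_filter=convolution_f32_fw_gpu.convolution_big_size_weights.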
