Skip to content

Commit 217d249

Browse files
committed
aarch64: matmul: addition of JIT int8 kernel
1 parent 46a9c22 commit 217d249

13 files changed

+2376
-60
lines changed

cmake/options.cmake

+52-48
Original file line numberDiff line numberDiff line change
@@ -1,23 +1,23 @@
1-
#===============================================================================
2-
# Copyright 2018-2025 Intel Corporation
1+
#== == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == =
2+
#Copyright 2018 - 2025 Intel Corporation
33
#
4-
# Licensed under the Apache License, Version 2.0 (the "License");
5-
# you may not use this file except in compliance with the License.
6-
# You may obtain a copy of the License at
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
#you may not use this file except in compliance with the License.
6+
#You may obtain a copy of the License at
77
#
8-
# http://www.apache.org/licenses/LICENSE-2.0
8+
# http://www.apache.org/licenses/LICENSE-2.0
99
#
10-
# Unless required by applicable law or agreed to in writing, software
11-
# distributed under the License is distributed on an "AS IS" BASIS,
12-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13-
# See the License for the specific language governing permissions and
14-
# limitations under the License.
15-
#===============================================================================
10+
#Unless required by applicable law or agreed to in writing, software
11+
#distributed under the License is distributed on an "AS IS" BASIS,
12+
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
#See the License for the specific language governing permissions and
14+
#limitations under the License.
15+
#== == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == =
1616

17-
# Manage different library options
18-
#===============================================================================
17+
#Manage different library options
18+
#== == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == =
1919

20-
if(options_cmake_included)
20+
if (options_cmake_included)
2121
return()
2222
endif()
2323
set(options_cmake_included true)
@@ -26,14 +26,18 @@ if(CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
2626
set(DNNL_IS_MAIN_PROJECT TRUE)
2727
endif()
2828

29-
# ========
30-
# Features
31-
# ========
29+
#== == == ==
30+
#Features
31+
#== == == ==
3232

3333
option(DNNL_VERBOSE
3434
"allows oneDNN be verbose whenever ONEDNN_VERBOSE
3535
environment variable set to 1" ON) # enabled by default
3636

37+
option(DNNL_AARCH64_MATMUL_SRC_QUANT
38+
"allows oneDNN to use dynamic quantisation for source(A) matrix when
39+
environment variable set to 1" OFF) # disabled by default
40+
3741
option(DNNL_ENABLE_CONCURRENT_EXEC
3842
"disables sharing a common scratchpad between primitives.
3943
This option must be turned ON if there is a possibility of executing
@@ -42,7 +46,7 @@ option(DNNL_ENABLE_CONCURRENT_EXEC
4246
OFF) # disabled by default
4347

4448
option(DNNL_ENABLE_PRIMITIVE_CACHE "enables primitive cache." ON)
45-
# enabled by default
49+
#enabled by default
4650

4751
option(DNNL_ENABLE_MAX_CPU_ISA
4852
"enables control of CPU ISA detected by oneDNN via DNNL_MAX_CPU_ISA
@@ -58,9 +62,9 @@ option(ONEDNN_ENABLE_GRAPH_DUMP "enables control of dumping graph artifacts via
5862
ONEDNN_GRAPH_DUMP environment variable. The option and feature are valid only
5963
when ONEDNN_BUILD_GRAPH is ON" OFF)
6064

61-
# =============================
62-
# Building properties and scope
63-
# =============================
65+
#== == == == == == == == == == == == == == =
66+
#Building properties and scope
67+
#== == == == == == == == == == == == == == =
6468

6569
set(DNNL_LIBRARY_TYPE "SHARED" CACHE STRING
6670
"specifies whether oneDNN library should be SHARED or STATIC")
@@ -169,9 +173,9 @@ set(DNNL_AMD_SYCL_KERNELS_TARGET_ARCH "" CACHE STRING
169173
stops to require specifying the target architecture. After removing the option
170174
the generic SYCL kernels will always be enabled for AMD vendor.")
171175

172-
# =============
173-
# Optimizations
174-
# =============
176+
#== == == == == == =
177+
#Optimizations
178+
#== == == == == == =
175179

176180
set(DNNL_ARCH_OPT_FLAGS "HostOpts" CACHE STRING
177181
"specifies compiler optimization flags (see below for more information).
@@ -230,13 +234,13 @@ set(ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_CPU_LLVM_CONFIG "AUTO" CACHE STRING
230234
set(ONEDNN_EXPERIMENTAL_GRAPH_COMPILER_CPU_JIT "builtin" CACHE STRING
231235
"the optional JIT backends for graph-compiler: llvm;c;builtin")
232236

233-
# ======================
234-
# Profiling capabilities
235-
# ======================
237+
#== == == == == == == == == == ==
238+
#Profiling capabilities
239+
#== == == == == == == == == == ==
236240

237-
# TODO: restore default to ON after the issue with linking C files by
238-
# Intel oneAPI DPC++ Compiler is fixed. Currently this compiler issues a warning
239-
# when linking object files built from C and C++ sources.
241+
# TODO: restore default to ON after the issue with linking C files by
242+
# Intel oneAPI DPC++ Compiler is fixed. Currently this compiler issues a warning
243+
#when linking object files built from C and C++ sources.
240244
option(DNNL_ENABLE_JIT_PROFILING
241245
"Enable registration of oneDNN kernels that are generated at
242246
runtime with VTune Profiler (on by default). Without the
@@ -250,9 +254,9 @@ option(DNNL_ENABLE_ITT_TASKS
250254
on those ITT tasks and show corresponding timeline information."
251255
ON)
252256

253-
# ===================
254-
# Engine capabilities
255-
# ===================
257+
#== == == == == == == == == =
258+
#Engine capabilities
259+
#== == == == == == == == == =
256260

257261
set(DNNL_CPU_RUNTIME "OMP" CACHE STRING
258262
"specifies the threading runtime for CPU engines;
@@ -305,8 +309,8 @@ set(OPENCLROOT "" CACHE STRING
305309
"path to Intel SDK for OpenCL applications.
306310
Use this option to specify custom location for OpenCL.")
307311

308-
# TODO: move logic to other cmake files?
309-
# Shortcuts for SYCL/DPC++
312+
# TODO: move logic to other cmake files?
313+
# Shortcuts for SYCL/DPC++
310314
if(DNNL_CPU_RUNTIME STREQUAL "DPCPP" OR DNNL_CPU_RUNTIME STREQUAL "SYCL")
311315
set(DNNL_CPU_SYCL true)
312316
else()
@@ -346,18 +350,18 @@ if(DNNL_SYCL_HIP AND NOT "${DNNL_AMD_SYCL_KERNELS_TARGET_ARCH}" STREQUAL "")
346350
set(DNNL_AMD_ENABLE_SYCL_KERNELS TRUE)
347351
endif()
348352

349-
# =============
350-
# Miscellaneous
351-
# =============
353+
#== == == == == == =
354+
#Miscellaneous
355+
#== == == == == == =
352356

353357
option(BENCHDNN_USE_RDPMC
354358
"enables rdpms counter to report precise cpu frequency in benchdnn.
355359
CAUTION: may not work on all cpus (hence disabled by default)"
356360
OFF) # disabled by default
357361

358-
# =========================
359-
# Developer and debug flags
360-
# =========================
362+
#== == == == == == == == == == == == =
363+
#Developer and debug flags
364+
#== == == == == == == == == == == == =
361365

362366
set(DNNL_USE_CLANG_SANITIZER "" CACHE STRING
363367
"instructs build system to use a Clang sanitizer. Possible values:
@@ -398,9 +402,9 @@ option(DNNL_DISABLE_GPU_REF_KERNELS
398402
"builds oneDNN with only optimized kernels for GPU compute
399403
primitives" OFF)
400404

401-
# =============================
402-
# External BLAS library options
403-
# =============================
405+
#== == == == == == == == == == == == == == =
406+
#External BLAS library options
407+
#== == == == == == == == == == == == == == =
404408

405409
set(DNNL_BLAS_VENDOR "NONE" CACHE STRING
406410
"Use an external BLAS library. Valid values:
@@ -416,9 +420,9 @@ set(DNNL_BLAS_VENDOR "NONE" CACHE STRING
416420
installation. This vendor is supported for performance analysis
417421
purposes only.")
418422

419-
# ==============================================
420-
# AArch64 optimizations with Arm Compute Library
421-
# ==============================================
423+
#== == == == == == == == == == == == == == == == == == == == == == ==
424+
#AArch64 optimizations with Arm Compute Library
425+
#== == == == == == == == == == == == == == == == == == == == == == ==
422426

423427
option(DNNL_AARCH64_USE_ACL "Enables use of AArch64 optimised functions
424428
from Arm Compute Library.

include/oneapi/dnnl/dnnl.hpp

+3
Original file line numberDiff line numberDiff line change
@@ -1615,6 +1615,9 @@ struct memory : public handle<dnnl_memory_t> {
16151615
BA16a32b4a = dnnl_BA16a32b4a,
16161616
BA16a48b4a = dnnl_BA16a48b4a,
16171617
BA16a64b4a = dnnl_BA16a64b4a,
1618+
BA24b8a = dnnl_BA24b8a,
1619+
aCB24c8b = dnnl_aCB24c8b,
1620+
abDC24d8c = dnnl_abDC24d8c,
16181621
decbA16a = dnnl_decbA16a,
16191622
decbA8a = dnnl_decbA8a,
16201623
defcbA16a = dnnl_defcbA16a,

include/oneapi/dnnl/dnnl_types.h

+3
Original file line numberDiff line numberDiff line change
@@ -715,6 +715,9 @@ typedef enum {
715715
dnnl_aBC16b32c,
716716
dnnl_AB16a16b,
717717
dnnl_AB16a32b,
718+
dnnl_BA24b8a,
719+
dnnl_aCB24c8b,
720+
dnnl_abDC24d8c,
718721
dnnl_ABcde16a16b2a,
719722
dnnl_aBCdef16b16c2b,
720723
dnnl_Acedb16a,

src/CMakeLists.txt

+16-12
Original file line numberDiff line numberDiff line change
@@ -1,18 +1,18 @@
1-
#===============================================================================
2-
# Copyright 2016-2025 Intel Corporation
1+
#== == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == =
2+
#Copyright 2016 - 2025 Intel Corporation
33
#
4-
# Licensed under the Apache License, Version 2.0 (the "License");
5-
# you may not use this file except in compliance with the License.
6-
# You may obtain a copy of the License at
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
#you may not use this file except in compliance with the License.
6+
#You may obtain a copy of the License at
77
#
8-
# http://www.apache.org/licenses/LICENSE-2.0
8+
# http://www.apache.org/licenses/LICENSE-2.0
99
#
10-
# Unless required by applicable law or agreed to in writing, software
11-
# distributed under the License is distributed on an "AS IS" BASIS,
12-
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13-
# See the License for the specific language governing permissions and
14-
# limitations under the License.
15-
#===============================================================================
10+
#Unless required by applicable law or agreed to in writing, software
11+
#distributed under the License is distributed on an "AS IS" BASIS,
12+
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
#See the License for the specific language governing permissions and
14+
#limitations under the License.
15+
#== == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == == =
1616

1717
file(GLOB HEADERS_ROOT
1818
${CMAKE_CURRENT_SOURCE_DIR}/../include/*.h
@@ -80,6 +80,10 @@ if(DNNL_EXPERIMENTAL_SPARSE)
8080
message(STATUS "Experimental functionality for sparse domain is enabled")
8181
endif()
8282

83+
if(DNNL_AARCH64_MATMUL_SRC_QUANT)
84+
message(STATUS "Functionality for dynamic quantisation for source(A) matrix in matmuls")
85+
endif()
86+
8387
if(DNNL_EXPERIMENTAL_UKERNEL)
8488
if(DNNL_TARGET_ARCH STREQUAL "ARCH_GENERIC")
8589
message(FATAL_ERROR "ukernel API does not support generic architecture.")

src/common/c_types_map.hpp

+3
Original file line numberDiff line numberDiff line change
@@ -372,6 +372,9 @@ const format_tag_t aCB16b16c = dnnl_aCB16b16c;
372372
const format_tag_t aCB16b32c = dnnl_aCB16b32c;
373373
const format_tag_t aCB16b48c = dnnl_aCB16b48c;
374374
const format_tag_t aCB16b64c = dnnl_aCB16b64c;
375+
const format_tag_t BA24b8a = dnnl_BA24b8a;
376+
const format_tag_t aCB24c8b = dnnl_aCB24c8b;
377+
const format_tag_t abDC24d8c = dnnl_abDC24d8c;
375378
const format_tag_t aCB16b16c2b = dnnl_aCB16b16c2b;
376379
const format_tag_t aCB16b32c2b = dnnl_aCB16b32c2b;
377380
const format_tag_t aCB16b48c2b = dnnl_aCB16b48c2b;

src/common/memory_desc_wrapper.cpp

+3
Original file line numberDiff line numberDiff line change
@@ -202,6 +202,9 @@ status_t memory_desc_wrapper::compute_blocking(
202202
C(BA16a32b, {1, 0}, {16, 32}, {0, 1});
203203
C(BA16a48b, {1, 0}, {16, 48}, {0, 1});
204204
C(BA16a64b, {1, 0}, {16, 64}, {0, 1});
205+
C(BA24b8a, {1, 0}, {24, 8}, {1, 0});
206+
C(aCB24c8b, {0, 2, 1}, {24, 8}, {2, 1});
207+
C(abDC24d8c, {0, 1, 3, 2}, {24, 8}, {3, 2});
205208
C(BA16a16b2a, {1, 0}, {16, 16, 2}, {0, 1, 0});
206209
C(BA16a32b2a, {1, 0}, {16, 32, 2}, {0, 1, 0});
207210
C(BA16a48b2a, {1, 0}, {16, 48, 2}, {0, 1, 0});

src/common/primitive_attr_quant.hpp

+3
Original file line numberDiff line numberDiff line change
@@ -277,6 +277,9 @@ struct zero_points_t : public c_compatible {
277277

278278
// arg-specific checks
279279
bool common(int arg) const { return get_mask(arg) == 0; }
280+
// Returns true when the zero-point mask stored for `arg` equals
// 3 << (ndims - 2), i.e. exactly the two bits at positions ndims - 2 and
// ndims - 1 are set.
// NOTE(review): presumably these bits select the two innermost dimensions
// of an ndims-dimensional tensor (the "oc"/"ic" in the name) - confirm
// against callers.
// NOTE(review): ndims < 2 would yield a negative shift count; callers
// presumably guarantee ndims >= 2 - confirm.
bool per_ocic(int arg, int ndims) const {
281+
// `<<` binds tighter than `==`, so this compares against (3 << (ndims - 2)).
return get_mask(arg) == 3 << (ndims - 2);
282+
}
280283
bool per_dim_1(int arg) const { return get_mask(arg) == 2; }
281284
bool has_default_values(int arg) const {
282285
return is_set(arg) == false && has_default_data_type(arg);

0 commit comments

Comments
 (0)