include, doc: update and fix brgemm texts and labels

dzarukin · dzarukin · commit bab7bf429140 · 2024-05-23T10:26:20.000-07:00
f
diff --git a/doc/Doxyfile.in b/doc/Doxyfile.in
@@ -1962,7 +1962,7 @@ INCLUDE_FILE_PATTERNS  =
 # recursively expanded use the := operator instead of the = operator.
 # This tag requires that the tag ENABLE_PREPROCESSING is set to YES.
 
-PREDEFINED             = DOXYGEN_SHOULD_SKIP_THIS DNNL_GPU_RUNTIME=DNNL_RUNTIME_OCL DNNL_WITH_SYCL DNNL_USE_SYCL_BUFFERS
+PREDEFINED             = DOXYGEN_SHOULD_SKIP_THIS DNNL_GPU_RUNTIME=DNNL_RUNTIME_OCL DNNL_WITH_SYCL DNNL_USE_SYCL_BUFFERS DNNL_EXPERIMENTAL_UKERNEL
 
 # If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then this
 # tag can be used to specify a list of macro names that should be expanded. The
diff --git a/doc/build/link.md b/doc/build/link.md
@@ -7,18 +7,19 @@ on how oneDNN was built.
 
 ## Header Files
 
-| File                                       | Description                       |
-|:-------------------------------------------|:----------------------------------|
-| ``include/oneapi/dnnl/dnnl.h``             | C header                          |
-| ``include/oneapi/dnnl/dnnl.hpp``           | C++ header                        |
-| ``include/oneapi/dnnl/dnnl_types.h``       | Auxiliary C header                |
-| ``include/oneapi/dnnl/dnnl_config.h``      | Auxiliary C header                |
-| ``include/oneapi/dnnl/dnnl_version.h``     | C header with version information |
-| ``include/oneapi/dnnl/dnnl_graph.h``       | C header for graph API            |
-| ``include/oneapi/dnnl/dnnl_graph.hpp``     | C++ header for graph API          |
-| ``include/oneapi/dnnl/dnnl_graph_types.h`` | Auxiliary C header for graph API  |
-| ``include/oneapi/dnnl/dnnl_ukernel.h``     | C header with ukernel API         |
-| ``include/oneapi/dnnl/dnnl_ukernel.hpp``   | C++ header with ukernel API       |
+| File                                         | Description                        |
+|:---------------------------------------------|:-----------------------------------|
+| ``include/oneapi/dnnl/dnnl.h``               | C header                           |
+| ``include/oneapi/dnnl/dnnl.hpp``             | C++ header                         |
+| ``include/oneapi/dnnl/dnnl_types.h``         | Auxiliary C header                 |
+| ``include/oneapi/dnnl/dnnl_config.h``        | Auxiliary C header                 |
+| ``include/oneapi/dnnl/dnnl_version.h``       | C header with version information  |
+| ``include/oneapi/dnnl/dnnl_graph.h``         | C header for graph API             |
+| ``include/oneapi/dnnl/dnnl_graph.hpp``       | C++ header for graph API           |
+| ``include/oneapi/dnnl/dnnl_graph_types.h``   | Auxiliary C header for graph API   |
+| ``include/oneapi/dnnl/dnnl_ukernel.h``       | C header for ukernel API           |
+| ``include/oneapi/dnnl/dnnl_ukernel.hpp``     | C++ header for ukernel API         |
+| ``include/oneapi/dnnl/dnnl_ukernel_types.h`` | Auxiliary C header for ukernel API |
 
 ## Libraries
 
diff --git a/doc/rst/index.rst b/doc/rst/index.rst
@@ -11,6 +11,7 @@ oneAPI Deep Neural Network Library Developer Guide and Reference
    dev_guide_examples
    performance_profiling_and_inspection
    advanced_topics
+   ukernels
    group_dnnl_api.rst
 
 oneAPI Deep Neural Network Library (oneDNN) is an open-source cross-platform performance library of basic building blocks for deep learning applications. The library is optimized for Intel Architecture Processors, Intel Processor Graphics and Xe Architecture graphics. Support for other architectures such as Arm* 64-bit Architecture (AArch64) and OpenPOWER* Power ISA (PPC64) is experimental.
diff --git a/doc/rst/ukernels.rst b/doc/rst/ukernels.rst
@@ -0,0 +1,10 @@
+Ukernels
+#####################
+
+.. toctree::
+   :maxdepth: 1
+
+   dev_guide_ukernel_basic_concepts.rst
+   dev_guide_ukernel_brgemm.rst
+   dev_guide_ukernel_transform.rst
+   page_cpu_brgemm_example_cpp.rst
diff --git a/doc/sphinx/conf.py b/doc/sphinx/conf.py
@@ -187,7 +187,7 @@ def setup(app):
 
 def fixFileNameRefs(app, env, docnames):
 
-    replacements = {"page_dev_guide": "dev_guide", "group_Dnnl":"group_dnnl"}
+    replacements = {"page_dev_guide":"dev_guide", "group_Dnnl":"group_dnnl", "brgemm_pack_B":"brgemm_pack_b"}
     targetDir = "rst"
 
     fileExtension = ".rst"
diff --git a/doc/ukernel/operations/brgemm.md b/doc/ukernel/operations/brgemm.md
@@ -1,4 +1,4 @@
-Batch-reduce General Matrix Multiplication {#dev_guide_ukernel_brgemm}
+Batch-Reduce General Matrix Multiplication {#dev_guide_ukernel_brgemm}
 =======================================
 
 >
@@ -8,33 +8,28 @@ Batch-reduce General Matrix Multiplication {#dev_guide_ukernel_brgemm}
 
 ## General
 
-The batch-reduce General Matrix Multiplication ukernel (BRGeMM) is an
-operation that allows to compute a batch of small matrix
-multiplication and accumulate their results in the same destination.
+The batch-reduce General Matrix Multiplication ukernel (BRGeMM) is an operation
+that computes a batch of small matrix multiplications and accumulates their
+results in the same destination.
 
-```math
-C = \sum_i A_i \cdot B_i
-```
+\f$C = \sum_i A_i \cdot B_i\f$
 
 with
 - \f$A_i\f$ a set of matrices of dimension \f$M \times K\f$
 - \f$B_i\f$ a set of matrices of dimension \f$K \times N\f$
-- C matrix of dimension \f$M \times N\f$.
+- \f$C\f$ a matrix of dimension \f$M \times N\f$.
 
-The BRGeMM ukernel also supports accumulation with values already
-present in \f$C\f$, as well as post-operation and down-conversion to
-another \f$D\f$ matrix:
+The BRGeMM ukernel also supports accumulation with values already present in
+\f$C\f$, as well as post-operation and down-conversion to another \f$D\f$
+matrix:
 
-```math
-D = \operatorname{convert}( \operatorname{post\_ops}(C + \sum_i A_i \cdot B_i, post_ops_args)).
-```
+\f$D = \operatorname{convert}( \operatorname{post\_ops}(C + \sum_i A_i \cdot B_i, post\_ops\_args))\f$
 
 ## Data Types
 
-In general, C represents an accumulation buffer. Hence when
-computations are carried in floating-point arithmetic, C shall be of
-type f32, and when computation is carried in integer arithmetic, C
-should be of type s32.
+In general, C represents an accumulation buffer. Hence, when computations are
+carried out in floating-point arithmetic, C shall be of type f32; when they are
+carried out in integer arithmetic, C shall be of type s32.
 
 The BRGeMM ukernel supports the following combinations of data-types.
 
@@ -47,21 +42,14 @@ The BRGeMM ukernel supports the following combinations of data-types.
 
 ## Data Representation
 
-Because of hardware restrictions, the BRGeMM ukernel requires specific
-data layout.
+Because of hardware restrictions, the BRGeMM ukernel requires a specific data
+layout.
 
-<!-- TODO: update with proper query documentation when updated --> The
-@ref dnnl::ukernel::brgemm_pack_B::need_pack() method can be called to determine
-if packing is necessary. If so,
+The @ref dnnl_brgemm_pack_B_need_pack function can be called to
+determine if packing is necessary. If so,
 [packB ukernel](@ref dev_guide_ukernel_transform) shall be created to do the
 actual packing.
 
-<!-- Which pack_type is required can be queried through #ref
-dnnl::ukernel::brgemm::get_pack_type(). Using the pack_type, user is
-responsible to pack the data appropriately before calling @ref
-brgemm::execute, either with custom code, or using the [transform
-ukernel](@ref dev_guide_ukernel_transform) -->
-
 ## Attributes
 
 The following ukernel attributes can be set through dedicated setters.
@@ -73,19 +61,19 @@ The following ukernel attributes can be set through dedicated setters.
 | Post-op   | [Binary](@ref dnnl::post_ops::append_binary)               | Applies a @ref dnnl_api_binary operation to the result    | General binary post-op restrictions |
 
 
-@note if zero-points are passed for A/B, fpmath_mode should be set for
-the computation to happen over floating-point format (so up-conversion
-to floating-point format would happen before computation). If
-computation in integer format is needed, BRGeMM ukernel should be
-configured without zero-point, and the user should prepare a
-compensation term that will be passed to the binary post-op.
+@note If zero-points are passed for A/B, fpmath_mode should be set for the
+computation to happen over floating-point format (so up-conversion to
+floating-point format would happen before computation). If computation in
+integer format is needed, BRGeMM ukernel should be configured without
+zero-point, and the user should prepare a compensation term that will be passed
+to the binary post-op.
 
 ## Implementation limitations
 
 BRGeMM ukernel has no known limitations.
 
 ## Examples
 
-[BRGeMM ukernel example](@ref brgemm_example_cpp)
+[BRGeMM ukernel example](@ref cpu_brgemm_example_cpp)
 
-@copydetails brgemm_example_cpp_short
+@copydetails cpu_brgemm_example_cpp
diff --git a/doc/ukernel/operations/transform.md b/doc/ukernel/operations/transform.md
@@ -7,9 +7,9 @@ Data transformation {#dev_guide_ukernel_transform}
 
 ## General
 
-The packB ukernel allows to pack BRGeMM B matrices in optimal layout
-before executing the [BRGeMM ukernel](@ref dev_guide_ukernel_brgemm).
-This is an out-of-place operation.
+The packB ukernel allows users to pack BRGeMM B matrices in an optimal layout
+before executing the [BRGeMM ukernel](@ref dev_guide_ukernel_brgemm). This is an
+out-of-place operation.
 
 ## Data Types
 
@@ -37,6 +37,6 @@ No attribute is supported for packB ukernel.
 
 ## Examples
 
-[BRGeMM ukernel example](@ref brgemm_example_cpp)
+[BRGeMM ukernel example](@ref cpu_brgemm_example_cpp)
 
-@copydetails brgemm_example_cpp_short
+@copydetails cpu_brgemm_example_cpp
diff --git a/examples/ukernels/cpu_brgemm.cpp b/examples/ukernels/cpu_brgemm.cpp
@@ -14,18 +14,14 @@
 * limitations under the License.
 *******************************************************************************/
 
-/// @example brgemm.cpp
-/// > Annotated version: @ref brgemm_example_cpp
-///
-/// @page brgemm_example_cpp_short
+/// @example cpu_brgemm.cpp
+/// > Annotated version: @ref cpu_brgemm_example_cpp
 ///
+/// @page cpu_brgemm_example_cpp BRGeMM ukernel example
 /// This C++ API example demonstrates how to create and execute a BRGeMM
 /// ukernel.
 ///
-/// @page brgemm_example_cpp Example of using BRGeMM ukernel to implement Matmul
-/// @copydetails brgemm_example_cpp_short
-///
-/// @include brgemm.cpp
+/// @include cpu_brgemm.cpp
 
 #include <algorithm>
 #include <cmath>
diff --git a/include/oneapi/dnnl/dnnl_ukernel.h b/include/oneapi/dnnl/dnnl_ukernel.h
@@ -124,8 +124,8 @@ dnnl_status_t DNNL_API dnnl_brgemm_execute(const_dnnl_brgemm_t brgemm,
 /// Executes a BRGeMM ukernel object with post operations.
 ///
 /// @param brgemm BRGeMM ukernel object.
-/// @param A_ptr Base pointer to a tensor A.
-/// @param B_ptr Base pointer to a tensor B.
+/// @param A Base pointer to a tensor A.
+/// @param B Base pointer to a tensor B.
 /// @param A_B_offsets Pointer to a set of tensor A and tensor B offsets for
 ///     each batch. A set must be contiguous in memory. A single batch should
 ///     supply offsets for both tensors A and B simultaneously. The number of
@@ -177,7 +177,7 @@ dnnl_status_t DNNL_API dnnl_brgemm_pack_B_need_pack(
         const_dnnl_brgemm_pack_B_t brgemm_pack_B, int *need_pack);
 
 /// Generates an executable part of BRGeMM ukernel packing B object.
-/// @param brgemm BRGeMM ukernel packing B object.
+/// @param brgemm_pack_B BRGeMM ukernel packing B object.
 /// @returns #dnnl_success on success and a status describing the error
 ///     otherwise.
 dnnl_status_t DNNL_API dnnl_brgemm_pack_B_generate(
diff --git a/include/oneapi/dnnl/dnnl_ukernel.hpp b/include/oneapi/dnnl/dnnl_ukernel.hpp
@@ -52,7 +52,8 @@ struct handle_traits<dnnl_brgemm_pack_B_t> {
 
 /// @} dnnl_api_utils
 
-/// @addtogroup dnnl_api_ukernel
+/// @addtogroup dnnl_api_ukernel Ukernels
+/// Collection of ukernels
 /// @{
 
 /// ukernel namespace
@@ -61,6 +62,7 @@ namespace ukernel {
 #ifdef DNNL_EXPERIMENTAL_UKERNEL
 
 /// @addtogroup dnnl_api_ukernel_brgemm BRGeMM ukernel
+/// BRGeMM ukernel routines
 /// @{
 
 struct brgemm : public handle<dnnl_brgemm_t> {
@@ -282,8 +284,8 @@ struct brgemm_pack_B : public handle<dnnl_brgemm_pack_B_t> {
 
     /// Executes a BRGeMM ukernel packing tensor B object.
     ///
-    /// @param in_ptr Pointer to an input buffer.
-    /// @param out_ptr Pointer to an output buffer.
+    /// @param in Pointer to an input buffer.
+    /// @param out Pointer to an output buffer.
     void execute(const void *in, void *out) const {
         dnnl_status_t status = dnnl_brgemm_pack_B_execute(get(), in, out);
         if (status != dnnl_success)