Skip to content

Commit 9233c5a

Browse files
committed
cpu: matmul: update reference impl for coo sparse matmul
1 parent 40dbd6d commit 9233c5a

File tree

3 files changed

+235
-52
lines changed

3 files changed

+235
-52
lines changed

src/common/memory_tracking.hpp

+1
Original file line numberDiff line numberDiff line change
@@ -257,6 +257,7 @@ enum {
257257
key_matmul_wei_trans,
258258
key_matmul_dst_trans,
259259
key_matmul_dst_cast_acc,
260+
key_matmul_sparse_tmp_ptr,
260261
key_pool_dst_bf16cvt,
261262
key_pool_dst_plain2blocked_cvt,
262263
key_pool_ind_plain2blocked_cvt,

src/cpu/matmul/ref_sparse_matmul.cpp

+147-32
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*******************************************************************************
2-
* Copyright 2023 Intel Corporation
2+
* Copyright 2023-2024 Intel Corporation
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -18,6 +18,8 @@
1818
#include "common/math_utils.hpp"
1919
#include "common/type_helpers.hpp"
2020

21+
#include "cpu/ref_io_helper.hpp"
22+
2123
#include "cpu/matmul/ref_sparse_matmul.hpp"
2224

2325
namespace dnnl {
@@ -27,7 +29,7 @@ namespace matmul {
2729

2830
status_t ref_sparse_matmul_t::execute(const exec_ctx_t &ctx) const {
2931
status_t status = status::success;
30-
auto dst = CTX_OUT_CLEAN_MEM(float *, DNNL_ARG_DST, status);
32+
auto dst = CTX_OUT_CLEAN_MEM(void *, DNNL_ARG_DST, status);
3133
CHECK(status);
3234

3335
const auto src_d = ctx.memory_mdw(DNNL_ARG_SRC, pd()->src_md());
@@ -38,48 +40,161 @@ status_t ref_sparse_matmul_t::execute(const exec_ctx_t &ctx) const {
3840
const dim_t N = dst_d.dims()[1];
3941
const dim_t K = src_d.dims()[1];
4042

41-
parallel_nd(M, N, [&](dim_t i, dim_t j) { dst[i * N + j] = 0.0f; });
43+
const data_type_t mm_dt = src_d.data_type();
44+
auto scratchpad = ctx.get_scratchpad_grantor();
45+
46+
parallel_nd(M, N, [&](dim_t i, dim_t j) {
47+
const dim_t dst_idx = i * N + j;
48+
io::store_float_value(dst_d.data_type(), 0.0f, dst, dst_idx);
49+
});
4250

4351
if (weights_d.is_sparse_desc()) {
44-
const auto src = CTX_IN_MEM(const float *, DNNL_ARG_SRC);
45-
const auto wei_values = CTX_IN_MEM(const float *, DNNL_ARG_WEIGHTS, 0);
46-
const auto wei_indices
47-
= CTX_IN_MEM(const int32_t *, DNNL_ARG_WEIGHTS, 1);
48-
const auto wei_pointers
49-
= CTX_IN_MEM(const int32_t *, DNNL_ARG_WEIGHTS, 2);
5052

53+
const auto src = CTX_IN_MEM(const void *, DNNL_ARG_SRC);
54+
const auto wei_values = CTX_IN_MEM(const void *, DNNL_ARG_WEIGHTS, 0);
55+
auto wei_buffer_1 = CTX_IN_MEM(const int32_t *, DNNL_ARG_WEIGHTS, 1);
56+
auto wei_buffer_2 = CTX_IN_MEM(const int32_t *, DNNL_ARG_WEIGHTS, 2);
57+
58+
// Both COO and CSR encoded data is operated on using CSR kernel for
59+
// matrix multiplication.
60+
// For COO encoding, data preparation includes using a temporary
61+
// buffer to convert the data to the CSR format.
62+
// Matrix multiplication is then carried out using the CSR encoded data.
63+
const int32_t *wei_indices;
64+
const int32_t *wei_pointers;
65+
66+
if (weights_d.encoding() == sparse_encoding::csr) {
67+
// For CSR encodings, pointer and indices assignment is
68+
// staightforward as,
69+
// index 1 - index buffer, index 2 - pointer buffer.
70+
wei_indices = wei_buffer_1;
71+
wei_pointers = wei_buffer_2;
72+
} else if (weights_d.encoding() == sparse_encoding::coo) {
73+
// For COO encodings, the two index buffers hold the row and column
74+
// indices respectively. For CSR conversion, the row indices are
75+
// compressed to generate the CSR pointers.
76+
wei_indices = wei_buffer_2;
77+
78+
int32_t *wei_row_pointers = scratchpad.template get<int32_t>(
79+
memory_tracking::names::key_matmul_sparse_tmp_ptr);
80+
81+
parallel_nd(K + 1, [&](dim_t k) {
82+
io::store_float_value(
83+
weights_d.metadata_type(0), 0, wei_row_pointers, k);
84+
});
85+
86+
cvt_coo_indices_to_csr_pointers(
87+
wei_buffer_1, wei_row_pointers, weights_d.nnz(), K);
88+
89+
wei_pointers = wei_row_pointers;
90+
}
91+
92+
run_csr_kernel(src, wei_values, wei_indices, wei_pointers, dst, M, N, K,
93+
mm_dt, src_d.is_sparse_desc());
94+
95+
} else if (src_d.is_sparse_desc()) {
96+
const auto weights = CTX_IN_MEM(const void *, DNNL_ARG_WEIGHTS);
97+
const auto src_values = CTX_IN_MEM(const void *, DNNL_ARG_SRC, 0);
98+
auto src_buffer_1 = CTX_IN_MEM(const int32_t *, DNNL_ARG_SRC, 1);
99+
auto src_buffer_2 = CTX_IN_MEM(const int32_t *, DNNL_ARG_SRC, 2);
100+
101+
// Both COO and CSR encoded data is operated on using CSR kernel for
102+
// matrix multiplication.
103+
// For COO encoding, data preparation includes using a temporary
104+
// buffer to convert the data to the CSR format.
105+
// Matrix multiplication is then carried out using the CSR encoded data.
106+
const int32_t *src_indices;
107+
const int32_t *src_pointers;
108+
109+
if (src_d.encoding() == sparse_encoding::csr) {
110+
// For CSR encodings, pointer and indices assignment is
111+
// straightforward as
112+
// index 1 - index buffer, index 2 - pointer buffer.
113+
src_indices = src_buffer_1;
114+
src_pointers = src_buffer_2;
115+
} else if (src_d.encoding() == sparse_encoding::coo) {
116+
// For COO encodings, the two index buffers hold the row and column
117+
// indices respectively. For CSR conversion, the row indices are
118+
// compressed to generate the CSR pointers.
119+
src_indices = src_buffer_2;
120+
121+
int32_t *src_row_pointers = scratchpad.template get<int32_t>(
122+
memory_tracking::names::key_matmul_sparse_tmp_ptr);
123+
124+
parallel_nd(M + 1, [&](dim_t m) {
125+
io::store_float_value(
126+
src_d.metadata_type(0), 0, src_row_pointers, m);
127+
});
128+
129+
cvt_coo_indices_to_csr_pointers(
130+
src_buffer_1, src_row_pointers, src_d.nnz(), M);
131+
src_pointers = src_row_pointers;
132+
}
133+
134+
run_csr_kernel(weights, src_values, src_indices, src_pointers, dst, M,
135+
N, K, mm_dt, src_d.is_sparse_desc());
136+
}
137+
return status::success;
138+
}
139+
140+
void ref_sparse_matmul_t::cvt_coo_indices_to_csr_pointers(
141+
const int32_t *indices, int32_t *pointers, const int nnz,
142+
const int nrows) const {
143+
parallel_nd(
144+
nnz, [&](dim_t i) { fetch_and_add(&pointers[indices[i] + 1], 1); });
145+
for (int i = 0; i < nrows; ++i) {
146+
pointers[i + 1] += pointers[i];
147+
}
148+
}
149+
150+
void ref_sparse_matmul_t::run_csr_kernel(const void *dmat, const void *values,
151+
const int32_t *indices, const int32_t *pointers, void *res,
152+
const dim_t M, const dim_t N, const dim_t K, const data_type_t mm_dt,
153+
bool is_src_sparse) const {
154+
155+
if (is_src_sparse) {
156+
// With a sparse source tensor, the matrix multiplication is carried out
157+
// for a sparse multiplier with parallelization over the sparse rows
158+
// of the multiplier matrix.
51159
parallel_nd(M, [&](dim_t m) {
52-
for (dim_t k = 0; k < K; k++) {
53-
const dim_t row_start = wei_pointers[k];
54-
const dim_t row_end = wei_pointers[k + 1];
55-
for (dim_t n = row_start; n < row_end; n++) {
56-
const dim_t src_idx = m * K + k;
57-
const dim_t dst_idx = m * N + wei_indices[n];
58-
dst[dst_idx] = dst[dst_idx] + src[src_idx] * wei_values[n];
160+
const dim_t row_start = pointers[m];
161+
const dim_t row_end = pointers[m + 1];
162+
163+
for (dim_t n = 0; n < N; n++) {
164+
const dim_t c_idx = m * N + n;
165+
float c_val = io::load_float_value(mm_dt, res, c_idx);
166+
167+
for (dim_t k = row_start; k < row_end; k++) {
168+
const dim_t b_idx = indices[k] * N + n;
169+
const float a_val = io::load_float_value(mm_dt, values, k);
170+
const float b_val
171+
= io::load_float_value(mm_dt, dmat, b_idx);
172+
c_val += a_val * b_val;
59173
}
174+
io::store_float_value(mm_dt, c_val, res, c_idx);
60175
}
61176
});
62-
} else if (src_d.is_sparse_desc()) {
63-
const auto weights = CTX_IN_MEM(const float *, DNNL_ARG_WEIGHTS);
64-
const auto src_values = CTX_IN_MEM(const float *, DNNL_ARG_SRC, 0);
65-
const auto src_indices = CTX_IN_MEM(const int32_t *, DNNL_ARG_SRC, 1);
66-
const auto src_pointers = CTX_IN_MEM(const int32_t *, DNNL_ARG_SRC, 2);
67-
177+
} else {
178+
// With a sparse weights tensor, the matrix multiplication is carried
179+
// out for a sparse multiplicand with parallelization over the dense
180+
// rows of the multiplier matrix.
68181
parallel_nd(M, [&](dim_t m) {
69-
const dim_t row_start = src_pointers[m];
70-
const dim_t row_end = src_pointers[m + 1];
71-
for (dim_t k = row_start; k < row_end; k++) {
72-
for (dim_t n = 0; n < N; n++) {
73-
const dim_t dst_idx = m * N + n;
74-
const dim_t wei_idx = src_indices[k] * N + n;
75-
dst[dst_idx]
76-
= dst[dst_idx] + src_values[k] * weights[wei_idx];
182+
for (dim_t k = 0; k < K; k++) {
183+
const dim_t row_start = pointers[k];
184+
const dim_t row_end = pointers[k + 1];
185+
for (dim_t n = row_start; n < row_end; n++) {
186+
const dim_t a_idx = m * K + k;
187+
const dim_t c_idx = m * N + indices[n];
188+
const float a_val
189+
= io::load_float_value(mm_dt, dmat, a_idx);
190+
const float b_val = io::load_float_value(mm_dt, values, n);
191+
float c_val = io::load_float_value(mm_dt, res, c_idx);
192+
c_val += a_val * b_val;
193+
io::store_float_value(mm_dt, c_val, res, c_idx);
77194
}
78195
}
79196
});
80197
}
81-
82-
return status::success;
83198
}
84199

85200
} // namespace matmul

src/cpu/matmul/ref_sparse_matmul.hpp

+87-20
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*******************************************************************************
2-
* Copyright 2023 Intel Corporation
2+
* Copyright 2023-2024 Intel Corporation
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -44,25 +44,62 @@ struct ref_sparse_matmul_t : public primitive_t {
4444
memory_desc_wrapper src_d(src_md());
4545
memory_desc_wrapper wei_d(weights_md(0));
4646

47-
const bool ok
48-
= utils::everyone_is(f32, src_type, wei_type, dst_type)
49-
&& utils::one_of(true, wei_d.is_sparse_desc(),
50-
src_d.is_sparse_desc())
51-
&& IMPLICATION(wei_d.is_sparse_desc(),
52-
wei_d.encoding() == sparse_encoding::csr)
53-
&& IMPLICATION(src_d.is_sparse_desc(),
54-
src_d.encoding() == sparse_encoding::csr)
55-
&& IMPLICATION(
56-
wei_d.is_sparse_desc(), !src_d.is_sparse_desc())
57-
&& IMPLICATION(src_d.is_sparse_desc(),
58-
utils::everyone_is(s32, src_d.metadata_type(0),
59-
src_d.metadata_type(1)))
60-
&& IMPLICATION(wei_d.is_sparse_desc(),
61-
utils::everyone_is(s32, wei_d.metadata_type(0),
62-
wei_d.metadata_type(1)))
63-
&& !with_bias() && attr()->has_default_values()
64-
&& set_default_formats() && formats_ok(src_d, wei_d);
65-
return ok ? status::success : status::unimplemented;
47+
VDISPATCH_MATMUL(wei_d.is_sparse_desc() || src_d.is_sparse_desc(),
48+
VERBOSE_UNSUPPORTED_SPARSE_CFG);
49+
VDISPATCH_MATMUL(wei_d.is_sparse_desc() ^ src_d.is_sparse_desc(),
50+
VERBOSE_UNSUPPORTED_SPARSE_CFG);
51+
52+
VDISPATCH_MATMUL(IMPLICATION(src_d.is_sparse_desc(),
53+
utils::one_of(src_d.encoding(),
54+
sparse_encoding::csr,
55+
sparse_encoding::coo)),
56+
VERBOSE_UNSUPPORTED_SPARSE_CFG);
57+
VDISPATCH_MATMUL(IMPLICATION(wei_d.is_sparse_desc(),
58+
utils::one_of(wei_d.encoding(),
59+
sparse_encoding::csr,
60+
sparse_encoding::coo)),
61+
VERBOSE_UNSUPPORTED_SPARSE_CFG);
62+
63+
VDISPATCH_MATMUL(
64+
utils::everyone_is(f16, src_type, wei_type, dst_type)
65+
|| utils::everyone_is(
66+
f32, src_type, wei_type, dst_type),
67+
VERBOSE_UNSUPPORTED_DT_CFG);
68+
69+
if (src_d.is_sparse_desc()) {
70+
sparse_mem_encoding = src_d.encoding();
71+
VDISPATCH_MATMUL(
72+
IMPLICATION(sparse_mem_encoding == sparse_encoding::coo,
73+
s32 == src_d.metadata_type(0)),
74+
VERBOSE_UNSUPPORTED_SPARSE_CFG);
75+
VDISPATCH_MATMUL(
76+
IMPLICATION(sparse_mem_encoding == sparse_encoding::csr,
77+
utils::everyone_is(s32, src_d.metadata_type(0),
78+
src_d.metadata_type(1))),
79+
VERBOSE_UNSUPPORTED_SPARSE_CFG);
80+
}
81+
if (wei_d.is_sparse_desc()) {
82+
sparse_mem_encoding = wei_d.encoding();
83+
VDISPATCH_MATMUL(
84+
IMPLICATION(sparse_mem_encoding == sparse_encoding::coo,
85+
s32 == wei_d.metadata_type(0)),
86+
VERBOSE_UNSUPPORTED_SPARSE_CFG);
87+
88+
VDISPATCH_MATMUL(
89+
IMPLICATION(sparse_mem_encoding == sparse_encoding::csr,
90+
utils::everyone_is(s32, wei_d.metadata_type(0),
91+
wei_d.metadata_type(1))),
92+
VERBOSE_UNSUPPORTED_SPARSE_CFG);
93+
}
94+
95+
VDISPATCH_MATMUL(!with_bias(), VERBOSE_UNSUPPORTED_BIAS_CFG);
96+
VDISPATCH_MATMUL(
97+
attr()->has_default_values(), VERBOSE_UNSUPPORTED_ATTR);
98+
VDISPATCH_MATMUL(set_default_formats(), VERBOSE_UNSUPPORTED_ATTR);
99+
VDISPATCH_MATMUL(formats_ok(src_d, wei_d), VERBOSE_UNSUPPORTED_TAG);
100+
101+
init_scratchpad();
102+
return status::success;
66103
}
67104

68105
bool formats_ok(const memory_desc_wrapper &src_d,
@@ -76,10 +113,40 @@ struct ref_sparse_matmul_t : public primitive_t {
76113
return src_d.matches_one_of_tag(format_tag::ab);
77114
return false;
78115
}
116+
117+
private:
118+
void init_scratchpad() {
119+
using namespace memory_tracking::names;
120+
const memory_desc_wrapper src_d(src_md());
121+
const memory_desc_wrapper wei_d(weights_md());
122+
123+
if (sparse_mem_encoding == sparse_encoding::coo) {
124+
auto scratchpad = scratchpad_registry().registrar();
125+
const auto ptr_size
126+
= src_d.dims()[(int)wei_d.is_sparse_desc()] + 1;
127+
scratchpad.template book<int32_t>(
128+
key_matmul_sparse_tmp_ptr, ptr_size);
129+
}
130+
}
131+
132+
sparse_encoding_t sparse_mem_encoding = sparse_encoding::undef;
79133
};
80134

81135
ref_sparse_matmul_t(const pd_t *apd) : primitive_t(apd) {}
82136

137+
// COO sparse encodings are converted to CSR format by
138+
// compressing the respective row indices into CSR pointers.
139+
void cvt_coo_indices_to_csr_pointers(const int32_t *indices,
140+
int32_t *pointers, const int nnz, const int nrows) const;
141+
142+
// Executes the matrix multiplication, C = A x B where one of the input
143+
// matrices is dense. Operation indices are determined depending on
144+
// whether the multiplier or multiplicand is dense
145+
void run_csr_kernel(const void *dmat, const void *values,
146+
const int32_t *indices, const int32_t *pointers, void *res,
147+
const dim_t M, const dim_t N, const dim_t K,
148+
const data_type_t mm_dt, bool is_src_sparse) const;
149+
83150
status_t execute(const exec_ctx_t &ctx) const override;
84151

85152
private:

0 commit comments

Comments
 (0)