[FORK][FIX] change comp_tile_len data type from int16_t to int

jianan-gu · jianan-gu · commit 2ead5d4fe599 · 2023-10-26T00:49:49.000-07:00
[FORK][FEATURE] cpu: add inner product with sparse packed weights
diff --git a/src/common/memory_desc_wrapper.hpp b/src/common/memory_desc_wrapper.hpp
@@ -235,7 +235,9 @@ struct memory_desc_wrapper : public c_compatible {
                 // assert(matches_tag(format_tag::OI16i64o4i)); - TODO: enable for sparse packed.
                 const size_t metadata = padded_dims()[0] * padded_dims()[1] / 64
                         * sizeof(uint64_t);
-                size_t comp_tile_data_size = ceil(static_cast<float>(padded_dims()[0] * padded_dims()[1]) / (64 * 64 * 32)) * 64;
+                using comp_tile_len_type = int;
+                size_t comp_tile_data_size = ceil(static_cast<float>(padded_dims()[0] * padded_dims()[1])
+                        / (64 * 64 * (64 / sizeof(comp_tile_len_type)))) * 64;
                 return comp_tile_data_size + (padded_dims()[0] * padded_dims()[1] * data_type_size())
                         + metadata + 1000;
                         // todo: [av] why 1000?
diff --git a/src/cpu/reorder/simple_sparse_reorder.hpp b/src/cpu/reorder/simple_sparse_reorder.hpp
@@ -134,17 +134,14 @@ struct simple_sparse_reorder_impl<SIMPLE_SPARSE_REORDER_TEMPL_CALL,
         size_t offset = padded_dims[0] * padded_dims[1];
 
         int total_blocks = offset / 4096;
-        int16_t *comp_tile_len_ptr = reinterpret_cast<int16_t *>(output);
+        using comp_tile_len_type = int;
+        comp_tile_len_type *comp_tile_len_ptr = reinterpret_cast<comp_tile_len_type *>(output);
         int comp_tile_len_index = 0;
         int cl_length = 0;
-        // TODO: why 2 / 64?
         // Wasting memory space due to allocation a buffer for the whole tensor?
-        int output_offset = ceil((float)total_blocks * 2 / 64.0);
-
-        size_t offset_2 = static_cast<size_t>(ceil((float)total_blocks * 2 / 64.0)) * 64;
-        uint64_t *bitmask_ptr = reinterpret_cast<uint64_t *>(output + offset + offset_2);
-
-        auto outp = &output[output_d.blk_off(0, 0, 0, 0) + output_offset * 64];
+        int output_offset = ceil((float)total_blocks * sizeof(comp_tile_len_type) / 64.0) * 64;
+        uint64_t *bitmask_ptr = reinterpret_cast<uint64_t *>(output + output_offset + offset);
+        auto outp = &output[output_d.blk_off(0, 0, 0, 0) + output_offset];
 
         // TODO: add threading.
         for (int O = 0; O < NB_OC; O++) {
@@ -184,7 +181,7 @@ struct simple_sparse_reorder_impl<SIMPLE_SPARSE_REORDER_TEMPL_CALL,
                         if (count % 64 == 0) { bitmask_idx++; }
                     }
                 }
-                int16_t cl = (int16_t)ceil(non_zeros / 64.0);
+                comp_tile_len_type cl = (comp_tile_len_type)ceil(non_zeros / 64.0);
                 comp_tile_len_index++;
                 cl_length = comp_tile_len_ptr[comp_tile_len_index - 1] + cl;
                 int unsed_bytes_in_cl = 64 - (non_zeros % 64);
diff --git a/src/cpu/x64/jit_brgemm_inner_product.cpp b/src/cpu/x64/jit_brgemm_inner_product.cpp
@@ -292,8 +292,9 @@ status_t brgemm_inner_product_fwd_t<isa>::execute_forward(
                 const dim_t wei_offset = (wei_cur_ocb
                         + wei_ic_stride * (icb + b * ic_blocks_per_batch)) / typesize_scale;
                 if (jbgp.weights_compressed) {
-                    const int16_t *compressed_tile_lengths_ptr
-                            = reinterpret_cast<const int16_t *>(weights);
+                    using comp_tile_len_type = int;
+                    const comp_tile_len_type *compressed_tile_lengths_ptr
+                            = reinterpret_cast<const comp_tile_len_type *>(weights);
                     int compressed_weights_offset = wei_offset / 4096;
 
                     auto dcomp_params = brgemm_decomp_kernel_params_t();
diff --git a/src/cpu/x64/jit_brgemm_inner_product_utils.cpp b/src/cpu/x64/jit_brgemm_inner_product_utils.cpp
@@ -1382,8 +1382,9 @@ status_t jit_brgemm_ip_conf_t::init_conf_base(cpu_isa_t isa,
         if (jbgp.weights_compressed) {
             jbgp.weights_compressed = true;
             int total_blocks = (jbgp.oc * jbgp.ic) / 4096;
+            using comp_tile_len_type = int;
             jbgp.weights_starting_offset
-                    = ceil((float)total_blocks * 2 / 64.0) * 64;
+                    = ceil((float)total_blocks * sizeof(comp_tile_len_type) / 64.0) * 64;
             jbgp.weight_comp_bitmask_off = jbgp.weights_starting_offset + jbgp.ic * jbgp.oc;
         }
     } else if (is_bf16) {