diff --git a/behavior_tests/src/query-api-mapping/do_test.py b/behavior_tests/src/query-api-mapping/do_test.py
index fea3ef5c7..712b99375 100644
--- a/behavior_tests/src/query-api-mapping/do_test.py
+++ b/behavior_tests/src/query-api-mapping/do_test.py
@@ -182,7 +182,7 @@ def migrate_test():
],
[],
[
- "dpct::sparse::spmm(*handle, transa, transb, alpha, a, b, beta, c, computetype);"
+ "dpct::sparse::spmm(handle->get_queue(), transa, transb, alpha, a, b, beta, c, computetype);"
],
],
[ # NCCL
diff --git a/features/config/TEMPLATE_cublasLt.xml b/features/config/TEMPLATE_cublasLt.xml
new file mode 100644
index 000000000..cb15eeed1
--- /dev/null
+++ b/features/config/TEMPLATE_cublasLt.xml
@@ -0,0 +1,13 @@
+
+
+
+ test
+
+
+
+
+
+
+
+
+
diff --git a/features/config/TEMPLATE_cusparse_after_112_cpu.xml b/features/config/TEMPLATE_cusparse_after_112_cpu.xml
new file mode 100644
index 000000000..5776a3d74
--- /dev/null
+++ b/features/config/TEMPLATE_cusparse_after_112_cpu.xml
@@ -0,0 +1,14 @@
+
+
+
+ test
+
+
+
+
+
+
+
+
+
+
diff --git a/features/feature_case/cublasLt/matmul.cu b/features/feature_case/cublasLt/matmul.cu
new file mode 100644
index 000000000..0cf382f7a
--- /dev/null
+++ b/features/feature_case/cublasLt/matmul.cu
@@ -0,0 +1,754 @@
+// ===------------ matmul.cu ----------------------------- *- CUDA -* ----=== //
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// ===--------------------------------------------------------------------=== //
+
+#include <cublasLt.h>
+#include <cstdint>
+#include <cstdio>
+
+const constexpr int COL_TURING = 0;
+const constexpr int COL_AMPERE = 1;
+
+// The original source of the two functions below was under the license below:
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+//
+// Repo: https://github.com/TimDettmers/bitsandbytes.git
+inline int checkCublasStatus(cublasStatus_t status) {
+ if (status != CUBLAS_STATUS_SUCCESS) {
+ printf("cuBLAS API failed with status %d\n", status);
+ //throw std::logic_error("cuBLAS API failed");
+ return 1;
+ }
+ return 0;
+}
+
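+// Template parameters (names as used in the body below):
+//   FORMATB    - B's layout: COL_TURING (CUBLASLT_ORDER_COL4_4R2_8C) or
+//                COL_AMPERE (CUBLASLT_ORDER_COL32_2R_4R4)
+//   DTYPE_OUT  - 32 selects an int32 C with scalar alpha/beta; any other
+//                value selects an int8 C
+//   SCALE_ROWS - nonzero applies a per-row alpha vector from device memory
+//                (CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO)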
+template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
+{
+ int has_error = 0;
+ cublasLtMatmulDesc_t matmulDesc = NULL;
+ cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
+ cublasOperation_t opT = CUBLAS_OP_T;
+ cublasLtPointerMode_t alphaVec = CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO;
+ cublasLtOrder_t col32 = CUBLASLT_ORDER_COL32;
+ cublasLtOrder_t col_turing = CUBLASLT_ORDER_COL4_4R2_8C;
+ cublasLtOrder_t col_ampere = CUBLASLT_ORDER_COL32_2R_4R4;
+
+ has_error |= checkCublasStatus(cublasLtMatrixLayoutCreate(&Adesc, CUDA_R_8I, m, k, lda));
+ has_error |= checkCublasStatus(cublasLtMatrixLayoutCreate(&Bdesc, CUDA_R_8I, n, k, ldb));
+
+ has_error |= checkCublasStatus(cublasLtMatrixLayoutSetAttribute(Adesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
+ if(FORMATB == COL_TURING)
+ has_error |= checkCublasStatus(cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &col_turing, sizeof(col_turing)));
+ else
+ has_error |= checkCublasStatus(cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &col_ampere, sizeof(col_ampere)));
+
+ if(DTYPE_OUT == 32)
+ {
+ has_error |= checkCublasStatus(cublasLtMatmulDescCreate(&matmulDesc, CUBLAS_COMPUTE_32I, CUDA_R_32I));
+ has_error |= checkCublasStatus(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opT, sizeof(opT)));
+ has_error |= checkCublasStatus(cublasLtMatrixLayoutCreate(&Cdesc, CUDA_R_32I, m, n, ldc));
+ has_error |= checkCublasStatus(cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
+ int alpha = 1, beta = 0;
+      has_error |= checkCublasStatus(cublasLtMatmul(ltHandle, matmulDesc, &alpha, A, Adesc, B, Bdesc, &beta, (int32_t*)C, Cdesc, (int32_t*)C, Cdesc, NULL, NULL, 0, 0));
+ }
+ else
+ {
+ has_error |= checkCublasStatus(cublasLtMatmulDescCreate(&matmulDesc, CUBLAS_COMPUTE_32I, CUDA_R_32F));
+ has_error |= checkCublasStatus(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opT, sizeof(opT)));
+ has_error |= checkCublasStatus(cublasLtMatrixLayoutCreate(&Cdesc, CUDA_R_8I, m, n, ldc));
+ has_error |= checkCublasStatus(cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
+ if(!SCALE_ROWS)
+ {
+ float alpha = 1.0f, beta = 0.0f;
+        has_error |= checkCublasStatus(cublasLtMatmul(ltHandle, matmulDesc, &alpha, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, NULL, NULL, 0, 0));
+ }
+ else
+ {
+ has_error |= checkCublasStatus(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &alphaVec, sizeof(alphaVec)));
+ has_error |= checkCublasStatus(cublasLtMatmul(ltHandle, matmulDesc, row_scale, A, Adesc, B, Bdesc, NULL, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, NULL, NULL, 0, 0));
+ }
+ }
+
+ cudaStreamSynchronize(0);
+
+ if (Cdesc) has_error |= checkCublasStatus(cublasLtMatrixLayoutDestroy(Cdesc));
+ if (Bdesc) has_error |= checkCublasStatus(cublasLtMatrixLayoutDestroy(Bdesc));
+ if (Adesc) has_error |= checkCublasStatus(cublasLtMatrixLayoutDestroy(Adesc));
+ if (matmulDesc) has_error |= checkCublasStatus(cublasLtMatmulDescDestroy(matmulDesc));
+    if (has_error == 1)
+      printf("error detected\n");
+
+ return has_error;
+}
+
+void transform(cublasLtHandle_t ltHandle, const void *in, int ld_in,
+ cublasLtMatrixLayout_t layout_in, void *out, int ld_out,
+ cublasLtMatrixLayout_t layout_out) {
+ cublasLtMatrixTransformDesc_t transform_desc = NULL;
+ cublasLtMatrixTransformDescCreate(&transform_desc, CUDA_R_32F);
+ float alpha = 1.0f, beta = 0.0f;
+ cublasLtMatrixTransform(ltHandle, transform_desc, &alpha, in, layout_in,
+ &beta, NULL, NULL, out, layout_out, 0);
+ cublasLtMatrixTransformDescDestroy(transform_desc);
+}
+
+// igemmlt
+bool test1() {
+ cublasLtHandle_t ltHandle;
+  cublasLtCreate(&ltHandle);
+ const constexpr int m = 4;
+ const constexpr int n = 2;
+ const constexpr int k = 3;
+ int lda = m;
+ int ldb = n;
+ int ldc = m;
+ void *Adev;
+ void *Bdev;
+ void *Cdev;
+ cudaMalloc(&Adev, m * k * sizeof(int8_t));
+ cudaMalloc(&Bdev, n * k * sizeof(int8_t));
+ cudaMalloc(&Cdev, m * n * sizeof(int32_t));
+
+ int8_t Ahost[m * k] = {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17};
+ int8_t Bhost[n * k] = {5, 4, -3, -2, 1, 0};
+
+ cudaMemcpy(Adev, Ahost, m * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+ cudaMemcpy(Bdev, Bhost, n * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+
+ cublasLtMatrixLayout_t Adesc_col_major = NULL, Bdesc_col_major = NULL,
+ Cdesc_col_major = NULL;
+ cublasLtMatrixLayoutCreate(&Adesc_col_major, CUDA_R_8I, m, k, lda);
+ cublasLtMatrixLayoutCreate(&Bdesc_col_major, CUDA_R_8I, n, k, ldb);
+ cublasLtMatrixLayoutCreate(&Cdesc_col_major, CUDA_R_32I, m, n, ldc);
+
+ // Convert A and B
+ cublasLtMatrixLayout_t Adesc_col32 = NULL, Bdesc_col4_4r2_8c = NULL,
+ Cdesc_col32 = NULL;
+ int8_t *A_col32, *B_col4_4r2_8c;
+ int32_t *C_col32;
+ cudaMalloc(&A_col32, m * 32 * sizeof(std::int8_t));
+ cudaMalloc(&B_col4_4r2_8c, ((n + 8 - 1) / 8) * 8 * 32 * sizeof(std::int8_t));
+ cudaMalloc(&C_col32, m * 32 * sizeof(std::int32_t));
+ cublasLtMatrixLayoutCreate(&Adesc_col32, CUDA_R_8I, m, k, m * 32);
+ cublasLtMatrixLayoutCreate(&Bdesc_col4_4r2_8c, CUDA_R_8I, k, n,
+ ((n + 8 - 1) / 8) * 8 * 32);
+ cublasLtMatrixLayoutCreate(&Cdesc_col32, CUDA_R_32I, m, n, m * 32);
+ cublasLtOrder_t col32 = CUBLASLT_ORDER_COL32;
+ cublasLtOrder_t col4_4r2_8c = CUBLASLT_ORDER_COL4_4R2_8C;
+ cublasLtMatrixLayoutSetAttribute(Adesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+ &col32, sizeof(col32));
+ cublasLtMatrixLayoutSetAttribute(Bdesc_col4_4r2_8c,
+ CUBLASLT_MATRIX_LAYOUT_ORDER, &col4_4r2_8c,
+ sizeof(col4_4r2_8c));
+ cublasLtMatrixLayoutSetAttribute(Cdesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+ &col32, sizeof(col32));
+
+ transform(ltHandle, Adev, lda, Adesc_col_major, A_col32, m * 32, Adesc_col32);
+ transform(ltHandle, Bdev, ldb, Bdesc_col_major, B_col4_4r2_8c, 8 * 32,
+ Bdesc_col4_4r2_8c);
+
+ // Matmul
+  igemmlt<COL_TURING, 32, 0>(ltHandle, m, n, k, A_col32, B_col4_4r2_8c,
+                             C_col32, nullptr, m * 32,
+                             ((n + 8 - 1) / 8) * 8 * 32, m * 32);
+
+ // Convert C
+ transform(ltHandle, C_col32, m * 32, Cdesc_col32, Cdev, ldc, Cdesc_col_major);
+ cudaStreamSynchronize(0);
+
+ // Check result
+ int32_t Chost[m * n];
+ cudaMemcpy(Chost, Cdev, m * n * sizeof(int32_t), cudaMemcpyDeviceToHost);
+
+ bool error = false;
+ int32_t C_ref[m * n] = {14, 17, 20, 23, 4, 6, 8, 10};
+ for (int i = 0; i < m * n; i++) {
+ if (Chost[i] != C_ref[i]) {
+ error = true;
+ break;
+ }
+ }
+ printf("c:\n");
+ for (int i = 0; i < m * n; i++)
+ printf("%d, ", Chost[i]);
+ printf("\n");
+
+ if (error) {
+ printf("error\n");
+ } else {
+ printf("success\n");
+ }
+
+ cublasLtDestroy(ltHandle);
+ cublasLtMatrixLayoutDestroy(Adesc_col32);
+ cublasLtMatrixLayoutDestroy(Bdesc_col4_4r2_8c);
+ cublasLtMatrixLayoutDestroy(Cdesc_col32);
+ cublasLtMatrixLayoutDestroy(Adesc_col_major);
+ cublasLtMatrixLayoutDestroy(Bdesc_col_major);
+ cublasLtMatrixLayoutDestroy(Cdesc_col_major);
+ cudaFree(Adev);
+ cudaFree(Bdev);
+ cudaFree(Cdev);
+
+ return !error;
+}
+
+// igemmlt
+bool test2() {
+ cublasLtHandle_t ltHandle;
+  cublasLtCreate(&ltHandle);
+ const constexpr int m = 4;
+ const constexpr int n = 2;
+ const constexpr int k = 3;
+ int lda = m;
+ int ldb = n;
+ int ldc = m;
+ void *Adev;
+ void *Bdev;
+ void *Cdev;
+ cudaMalloc(&Adev, m * k * sizeof(int8_t));
+ cudaMalloc(&Bdev, n * k * sizeof(int8_t));
+ cudaMalloc(&Cdev, m * n * sizeof(int8_t));
+
+ int8_t Ahost[m * k] = {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17};
+ int8_t Bhost[n * k] = {5, 4, -3, -2, 1, 0};
+
+ cudaMemcpy(Adev, Ahost, m * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+ cudaMemcpy(Bdev, Bhost, n * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+
+ cublasLtMatrixLayout_t Adesc_col_major = NULL, Bdesc_col_major = NULL,
+ Cdesc_col_major = NULL;
+ cublasLtMatrixLayoutCreate(&Adesc_col_major, CUDA_R_8I, m, k, lda);
+ cublasLtMatrixLayoutCreate(&Bdesc_col_major, CUDA_R_8I, n, k, ldb);
+ cublasLtMatrixLayoutCreate(&Cdesc_col_major, CUDA_R_8I, m, n, ldc);
+
+ // Convert A and B
+ cublasLtMatrixLayout_t Adesc_col32 = NULL, Bdesc_col4_4r2_8c = NULL,
+ Cdesc_col32 = NULL;
+ int8_t *A_col32, *B_col4_4r2_8c;
+ int8_t *C_col32;
+ cudaMalloc(&A_col32, m * 32 * sizeof(std::int8_t));
+ cudaMalloc(&B_col4_4r2_8c, ((n + 8 - 1) / 8) * 8 * 32 * sizeof(std::int8_t));
+ cudaMalloc(&C_col32, m * 32 * sizeof(std::int8_t));
+ cublasLtMatrixLayoutCreate(&Adesc_col32, CUDA_R_8I, m, k, m * 32);
+ cublasLtMatrixLayoutCreate(&Bdesc_col4_4r2_8c, CUDA_R_8I, k, n,
+ ((n + 8 - 1) / 8) * 8 * 32);
+ cublasLtMatrixLayoutCreate(&Cdesc_col32, CUDA_R_8I, m, n, m * 32);
+ cublasLtOrder_t col32 = CUBLASLT_ORDER_COL32;
+ cublasLtOrder_t col4_4r2_8c = CUBLASLT_ORDER_COL4_4R2_8C;
+ cublasLtMatrixLayoutSetAttribute(Adesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+ &col32, sizeof(col32));
+ cublasLtMatrixLayoutSetAttribute(Bdesc_col4_4r2_8c,
+ CUBLASLT_MATRIX_LAYOUT_ORDER, &col4_4r2_8c,
+ sizeof(col4_4r2_8c));
+ cublasLtMatrixLayoutSetAttribute(Cdesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+ &col32, sizeof(col32));
+
+ transform(ltHandle, Adev, lda, Adesc_col_major, A_col32, m * 32, Adesc_col32);
+ transform(ltHandle, Bdev, ldb, Bdesc_col_major, B_col4_4r2_8c, 8 * 32,
+ Bdesc_col4_4r2_8c);
+
+ // Matmul
+  igemmlt<COL_TURING, 8, 0>(ltHandle, m, n, k, A_col32, B_col4_4r2_8c,
+                            C_col32, nullptr, m * 32,
+                            ((n + 8 - 1) / 8) * 8 * 32, m * 32);
+
+ // Convert C
+ transform(ltHandle, C_col32, m * 32, Cdesc_col32, Cdev, ldc, Cdesc_col_major);
+ cudaStreamSynchronize(0);
+
+ // Check result
+ int8_t Chost[m * n];
+ cudaMemcpy(Chost, Cdev, m * n * sizeof(int8_t), cudaMemcpyDeviceToHost);
+
+ bool error = false;
+ int8_t C_ref[m * n] = {14, 17, 20, 23, 4, 6, 8, 10};
+ for (int i = 0; i < m * n; i++) {
+ if (Chost[i] != C_ref[i]) {
+ error = true;
+ break;
+ }
+ }
+ printf("c:\n");
+ for (int i = 0; i < m * n; i++)
+ printf("%d, ", Chost[i]);
+ printf("\n");
+
+ if (error) {
+ printf("error\n");
+ } else {
+ printf("success\n");
+ }
+
+ cublasLtDestroy(ltHandle);
+ cublasLtMatrixLayoutDestroy(Adesc_col32);
+ cublasLtMatrixLayoutDestroy(Bdesc_col4_4r2_8c);
+ cublasLtMatrixLayoutDestroy(Cdesc_col32);
+ cublasLtMatrixLayoutDestroy(Adesc_col_major);
+ cublasLtMatrixLayoutDestroy(Bdesc_col_major);
+ cublasLtMatrixLayoutDestroy(Cdesc_col_major);
+ cudaFree(Adev);
+ cudaFree(Bdev);
+ cudaFree(Cdev);
+
+ return !error;
+}
+
+// igemmlt
+bool test3() {
+ cublasLtHandle_t ltHandle;
+  cublasLtCreate(&ltHandle);
+ const constexpr int m = 4;
+ const constexpr int n = 2;
+ const constexpr int k = 3;
+ int lda = m;
+ int ldb = n;
+ int ldc = m;
+ void *Adev;
+ void *Bdev;
+ void *Cdev;
+ cudaMalloc(&Adev, m * k * sizeof(int8_t));
+ cudaMalloc(&Bdev, n * k * sizeof(int8_t));
+ cudaMalloc(&Cdev, m * n * sizeof(int8_t));
+
+ int8_t Ahost[m * k] = {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17};
+ int8_t Bhost[n * k] = {5, 4, -3, -2, 1, 0};
+
+ cudaMemcpy(Adev, Ahost, m * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+ cudaMemcpy(Bdev, Bhost, n * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+
+ cublasLtMatrixLayout_t Adesc_col_major = NULL, Bdesc_col_major = NULL,
+ Cdesc_col_major = NULL;
+ cublasLtMatrixLayoutCreate(&Adesc_col_major, CUDA_R_8I, m, k, lda);
+ cublasLtMatrixLayoutCreate(&Bdesc_col_major, CUDA_R_8I, n, k, ldb);
+ cublasLtMatrixLayoutCreate(&Cdesc_col_major, CUDA_R_8I, m, n, ldc);
+
+ // Convert A and B
+ cublasLtMatrixLayout_t Adesc_col32 = NULL, Bdesc_col4_4r2_8c = NULL,
+ Cdesc_col32 = NULL;
+ int8_t *A_col32, *B_col4_4r2_8c;
+ int8_t *C_col32;
+ cudaMalloc(&A_col32, m * 32 * sizeof(std::int8_t));
+ cudaMalloc(&B_col4_4r2_8c, ((n + 8 - 1) / 8) * 8 * 32 * sizeof(std::int8_t));
+ cudaMalloc(&C_col32, m * 32 * sizeof(std::int8_t));
+ cublasLtMatrixLayoutCreate(&Adesc_col32, CUDA_R_8I, m, k, m * 32);
+ cublasLtMatrixLayoutCreate(&Bdesc_col4_4r2_8c, CUDA_R_8I, k, n,
+ ((n + 8 - 1) / 8) * 8 * 32);
+ cublasLtMatrixLayoutCreate(&Cdesc_col32, CUDA_R_8I, m, n, m * 32);
+ cublasLtOrder_t col32 = CUBLASLT_ORDER_COL32;
+ cublasLtOrder_t col4_4r2_8c = CUBLASLT_ORDER_COL4_4R2_8C;
+ cublasLtMatrixLayoutSetAttribute(Adesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+ &col32, sizeof(col32));
+ cublasLtMatrixLayoutSetAttribute(Bdesc_col4_4r2_8c,
+ CUBLASLT_MATRIX_LAYOUT_ORDER, &col4_4r2_8c,
+ sizeof(col4_4r2_8c));
+ cublasLtMatrixLayoutSetAttribute(Cdesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+ &col32, sizeof(col32));
+
+ transform(ltHandle, Adev, lda, Adesc_col_major, A_col32, m * 32, Adesc_col32);
+ transform(ltHandle, Bdev, ldb, Bdesc_col_major, B_col4_4r2_8c, 8 * 32,
+ Bdesc_col4_4r2_8c);
+
+ float *alpha;
+ cudaMallocManaged(&alpha, 4 * sizeof(float));
+ alpha[0] = 0;
+ alpha[1] = 1;
+ alpha[2] = 2;
+ alpha[3] = 3;
+
+ // Matmul
+  igemmlt<COL_TURING, 8, 1>(ltHandle, m, n, k, A_col32, B_col4_4r2_8c,
+                            C_col32, alpha, m * 32,
+                            ((n + 8 - 1) / 8) * 8 * 32, m * 32);
+
+ // Convert C
+ transform(ltHandle, C_col32, m * 32, Cdesc_col32, Cdev, ldc, Cdesc_col_major);
+ cudaStreamSynchronize(0);
+
+ // Check result
+ int8_t Chost[m * n];
+ cudaMemcpy(Chost, Cdev, m * n * sizeof(int8_t), cudaMemcpyDeviceToHost);
+
+ bool error = false;
+ int8_t C_ref[m * n] = {0, 17, 40, 69, 0, 6, 16, 30};
+ for (int i = 0; i < m * n; i++) {
+ if (Chost[i] != C_ref[i]) {
+ error = true;
+ break;
+ }
+ }
+ printf("c:\n");
+ for (int i = 0; i < m * n; i++)
+ printf("%d, ", Chost[i]);
+ printf("\n");
+
+ if (error) {
+ printf("error\n");
+ } else {
+ printf("success\n");
+ }
+
+ cublasLtDestroy(ltHandle);
+ cublasLtMatrixLayoutDestroy(Adesc_col32);
+ cublasLtMatrixLayoutDestroy(Bdesc_col4_4r2_8c);
+ cublasLtMatrixLayoutDestroy(Cdesc_col32);
+ cublasLtMatrixLayoutDestroy(Adesc_col_major);
+ cublasLtMatrixLayoutDestroy(Bdesc_col_major);
+ cublasLtMatrixLayoutDestroy(Cdesc_col_major);
+ cudaFree(Adev);
+ cudaFree(Bdev);
+ cudaFree(Cdev);
+ cudaFree(alpha);
+
+ return !error;
+}
+
+// igemmlt
+bool test4() {
+ cublasLtHandle_t ltHandle;
+  cublasLtCreate(&ltHandle);
+ const constexpr int m = 4;
+ const constexpr int n = 2;
+ const constexpr int k = 3;
+ int lda = m;
+ int ldb = n;
+ int ldc = m;
+ void *Adev;
+ void *Bdev;
+ void *Cdev;
+ cudaMalloc(&Adev, m * k * sizeof(int8_t));
+ cudaMalloc(&Bdev, n * k * sizeof(int8_t));
+ cudaMalloc(&Cdev, m * n * sizeof(int32_t));
+
+ int8_t Ahost[m * k] = {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17};
+ int8_t Bhost[n * k] = {5, 4, -3, -2, 1, 0};
+
+ cudaMemcpy(Adev, Ahost, m * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+ cudaMemcpy(Bdev, Bhost, n * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+
+ cublasLtMatrixLayout_t Adesc_col_major = NULL, Bdesc_col_major = NULL,
+ Cdesc_col_major = NULL;
+ cublasLtMatrixLayoutCreate(&Adesc_col_major, CUDA_R_8I, m, k, lda);
+ cublasLtMatrixLayoutCreate(&Bdesc_col_major, CUDA_R_8I, n, k, ldb);
+ cublasLtMatrixLayoutCreate(&Cdesc_col_major, CUDA_R_32I, m, n, ldc);
+
+ // Convert A and B
+ cublasLtMatrixLayout_t Adesc_col32 = NULL, Bdesc_col32_2r_4r4 = NULL,
+ Cdesc_col32 = NULL;
+ int8_t *A_col32, *B_col32_2r_4r4;
+ int32_t *C_col32;
+ cudaMalloc(&A_col32, m * 32 * sizeof(std::int8_t));
+ cudaMalloc(&B_col32_2r_4r4,
+ ((n + 32 - 1) / 32) * 32 * 32 * sizeof(std::int8_t));
+ cudaMalloc(&C_col32, m * 32 * sizeof(std::int32_t));
+ cublasLtMatrixLayoutCreate(&Adesc_col32, CUDA_R_8I, m, k, m * 32);
+ cublasLtMatrixLayoutCreate(&Bdesc_col32_2r_4r4, CUDA_R_8I, k, n,
+ ((n + 32 - 1) / 32) * 32 * 32);
+ cublasLtMatrixLayoutCreate(&Cdesc_col32, CUDA_R_32I, m, n, m * 32);
+ cublasLtOrder_t col32 = CUBLASLT_ORDER_COL32;
+ cublasLtOrder_t col32_2r_4r4 = CUBLASLT_ORDER_COL32_2R_4R4;
+ cublasLtMatrixLayoutSetAttribute(Adesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+ &col32, sizeof(col32));
+ cublasLtMatrixLayoutSetAttribute(Bdesc_col32_2r_4r4,
+ CUBLASLT_MATRIX_LAYOUT_ORDER, &col32_2r_4r4,
+ sizeof(col32_2r_4r4));
+ cublasLtMatrixLayoutSetAttribute(Cdesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+ &col32, sizeof(col32));
+
+ transform(ltHandle, Adev, lda, Adesc_col_major, A_col32, m * 32, Adesc_col32);
+ transform(ltHandle, Bdev, ldb, Bdesc_col_major, B_col32_2r_4r4, 8 * 32,
+ Bdesc_col32_2r_4r4);
+
+ // Matmul
+  igemmlt<COL_AMPERE, 32, 0>(ltHandle, m, n, k, A_col32, B_col32_2r_4r4,
+                             C_col32, nullptr, m * 32,
+                             ((n + 8 - 1) / 8) * 8 * 32, m * 32);
+
+ // Convert C
+ transform(ltHandle, C_col32, m * 32, Cdesc_col32, Cdev, ldc, Cdesc_col_major);
+ cudaStreamSynchronize(0);
+
+ // Check result
+ int32_t Chost[m * n];
+ cudaMemcpy(Chost, Cdev, m * n * sizeof(int32_t), cudaMemcpyDeviceToHost);
+
+ bool error = false;
+ int32_t C_ref[m * n] = {14, 17, 20, 23, 4, 6, 8, 10};
+ for (int i = 0; i < m * n; i++) {
+ if (Chost[i] != C_ref[i]) {
+ error = true;
+ break;
+ }
+ }
+ printf("c:\n");
+ for (int i = 0; i < m * n; i++)
+ printf("%d, ", Chost[i]);
+ printf("\n");
+
+ if (error) {
+ printf("error\n");
+ } else {
+ printf("success\n");
+ }
+
+ cublasLtDestroy(ltHandle);
+ cublasLtMatrixLayoutDestroy(Adesc_col32);
+ cublasLtMatrixLayoutDestroy(Bdesc_col32_2r_4r4);
+ cublasLtMatrixLayoutDestroy(Cdesc_col32);
+ cublasLtMatrixLayoutDestroy(Adesc_col_major);
+ cublasLtMatrixLayoutDestroy(Bdesc_col_major);
+ cublasLtMatrixLayoutDestroy(Cdesc_col_major);
+ cudaFree(Adev);
+ cudaFree(Bdev);
+ cudaFree(Cdev);
+
+ return !error;
+}
+
+// igemmlt
+bool test5() {
+ cublasLtHandle_t ltHandle;
+  cublasLtCreate(&ltHandle);
+ const constexpr int m = 4;
+ const constexpr int n = 2;
+ const constexpr int k = 3;
+ int lda = m;
+ int ldb = n;
+ int ldc = m;
+ void *Adev;
+ void *Bdev;
+ void *Cdev;
+ cudaMalloc(&Adev, m * k * sizeof(int8_t));
+ cudaMalloc(&Bdev, n * k * sizeof(int8_t));
+ cudaMalloc(&Cdev, m * n * sizeof(int8_t));
+
+ int8_t Ahost[m * k] = {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17};
+ int8_t Bhost[n * k] = {5, 4, -3, -2, 1, 0};
+
+ cudaMemcpy(Adev, Ahost, m * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+ cudaMemcpy(Bdev, Bhost, n * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+
+ cublasLtMatrixLayout_t Adesc_col_major = NULL, Bdesc_col_major = NULL,
+ Cdesc_col_major = NULL;
+ cublasLtMatrixLayoutCreate(&Adesc_col_major, CUDA_R_8I, m, k, lda);
+ cublasLtMatrixLayoutCreate(&Bdesc_col_major, CUDA_R_8I, n, k, ldb);
+ cublasLtMatrixLayoutCreate(&Cdesc_col_major, CUDA_R_8I, m, n, ldc);
+
+ // Convert A and B
+ cublasLtMatrixLayout_t Adesc_col32 = NULL, Bdesc_col32_2r_4r4 = NULL,
+ Cdesc_col32 = NULL;
+ int8_t *A_col32, *B_col32_2r_4r4;
+ int8_t *C_col32;
+ cudaMalloc(&A_col32, m * 32 * sizeof(std::int8_t));
+ cudaMalloc(&B_col32_2r_4r4,
+ ((n + 32 - 1) / 32) * 32 * 32 * sizeof(std::int8_t));
+ cudaMalloc(&C_col32, m * 32 * sizeof(std::int8_t));
+ cublasLtMatrixLayoutCreate(&Adesc_col32, CUDA_R_8I, m, k, m * 32);
+ cublasLtMatrixLayoutCreate(&Bdesc_col32_2r_4r4, CUDA_R_8I, k, n,
+ ((n + 32 - 1) / 32) * 32 * 32);
+ cublasLtMatrixLayoutCreate(&Cdesc_col32, CUDA_R_8I, m, n, m * 32);
+ cublasLtOrder_t col32 = CUBLASLT_ORDER_COL32;
+ cublasLtOrder_t col32_2r_4r4 = CUBLASLT_ORDER_COL32_2R_4R4;
+ cublasLtMatrixLayoutSetAttribute(Adesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+ &col32, sizeof(col32));
+ cublasLtMatrixLayoutSetAttribute(Bdesc_col32_2r_4r4,
+ CUBLASLT_MATRIX_LAYOUT_ORDER, &col32_2r_4r4,
+ sizeof(col32_2r_4r4));
+ cublasLtMatrixLayoutSetAttribute(Cdesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+ &col32, sizeof(col32));
+
+ transform(ltHandle, Adev, lda, Adesc_col_major, A_col32, m * 32, Adesc_col32);
+ transform(ltHandle, Bdev, ldb, Bdesc_col_major, B_col32_2r_4r4, 8 * 32,
+ Bdesc_col32_2r_4r4);
+
+ // Matmul
+  igemmlt<COL_AMPERE, 8, 0>(ltHandle, m, n, k, A_col32, B_col32_2r_4r4,
+                            C_col32, nullptr, m * 32,
+                            ((n + 8 - 1) / 8) * 8 * 32, m * 32);
+
+ // Convert C
+ transform(ltHandle, C_col32, m * 32, Cdesc_col32, Cdev, ldc, Cdesc_col_major);
+ cudaStreamSynchronize(0);
+
+ // Check result
+ int8_t Chost[m * n];
+ cudaMemcpy(Chost, Cdev, m * n * sizeof(int8_t), cudaMemcpyDeviceToHost);
+
+ bool error = false;
+ int8_t C_ref[m * n] = {14, 17, 20, 23, 4, 6, 8, 10};
+ for (int i = 0; i < m * n; i++) {
+ if (Chost[i] != C_ref[i]) {
+ error = true;
+ break;
+ }
+ }
+ printf("c:\n");
+ for (int i = 0; i < m * n; i++)
+ printf("%d, ", Chost[i]);
+ printf("\n");
+
+ if (error) {
+ printf("error\n");
+ } else {
+ printf("success\n");
+ }
+
+ cublasLtDestroy(ltHandle);
+ cublasLtMatrixLayoutDestroy(Adesc_col32);
+ cublasLtMatrixLayoutDestroy(Bdesc_col32_2r_4r4);
+ cublasLtMatrixLayoutDestroy(Cdesc_col32);
+ cublasLtMatrixLayoutDestroy(Adesc_col_major);
+ cublasLtMatrixLayoutDestroy(Bdesc_col_major);
+ cublasLtMatrixLayoutDestroy(Cdesc_col_major);
+ cudaFree(Adev);
+ cudaFree(Bdev);
+ cudaFree(Cdev);
+
+ return !error;
+}
+
+// igemmlt
+bool test6() {
+ cublasLtHandle_t ltHandle;
+  cublasLtCreate(&ltHandle);
+ const constexpr int m = 4;
+ const constexpr int n = 2;
+ const constexpr int k = 3;
+ int lda = m;
+ int ldb = n;
+ int ldc = m;
+ void *Adev;
+ void *Bdev;
+ void *Cdev;
+ cudaMalloc(&Adev, m * k * sizeof(int8_t));
+ cudaMalloc(&Bdev, n * k * sizeof(int8_t));
+ cudaMalloc(&Cdev, m * n * sizeof(int8_t));
+
+ int8_t Ahost[m * k] = {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17};
+ int8_t Bhost[n * k] = {5, 4, -3, -2, 1, 0};
+
+ cudaMemcpy(Adev, Ahost, m * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+ cudaMemcpy(Bdev, Bhost, n * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+
+ cublasLtMatrixLayout_t Adesc_col_major = NULL, Bdesc_col_major = NULL,
+ Cdesc_col_major = NULL;
+ cublasLtMatrixLayoutCreate(&Adesc_col_major, CUDA_R_8I, m, k, lda);
+ cublasLtMatrixLayoutCreate(&Bdesc_col_major, CUDA_R_8I, n, k, ldb);
+ cublasLtMatrixLayoutCreate(&Cdesc_col_major, CUDA_R_8I, m, n, ldc);
+
+ // Convert A and B
+ cublasLtMatrixLayout_t Adesc_col32 = NULL, Bdesc_col32_2r_4r4 = NULL,
+ Cdesc_col32 = NULL;
+ int8_t *A_col32, *B_col32_2r_4r4;
+ int8_t *C_col32;
+ cudaMalloc(&A_col32, m * 32 * sizeof(std::int8_t));
+ cudaMalloc(&B_col32_2r_4r4,
+ ((n + 32 - 1) / 32) * 32 * 32 * sizeof(std::int8_t));
+ cudaMalloc(&C_col32, m * 32 * sizeof(std::int8_t));
+ cublasLtMatrixLayoutCreate(&Adesc_col32, CUDA_R_8I, m, k, m * 32);
+ cublasLtMatrixLayoutCreate(&Bdesc_col32_2r_4r4, CUDA_R_8I, k, n,
+ ((n + 32 - 1) / 32) * 32 * 32);
+ cublasLtMatrixLayoutCreate(&Cdesc_col32, CUDA_R_8I, m, n, m * 32);
+ cublasLtOrder_t col32 = CUBLASLT_ORDER_COL32;
+ cublasLtOrder_t col32_2r_4r4 = CUBLASLT_ORDER_COL32_2R_4R4;
+ cublasLtMatrixLayoutSetAttribute(Adesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+ &col32, sizeof(col32));
+ cublasLtMatrixLayoutSetAttribute(Bdesc_col32_2r_4r4,
+ CUBLASLT_MATRIX_LAYOUT_ORDER, &col32_2r_4r4,
+ sizeof(col32_2r_4r4));
+ cublasLtMatrixLayoutSetAttribute(Cdesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+ &col32, sizeof(col32));
+
+ transform(ltHandle, Adev, lda, Adesc_col_major, A_col32, m * 32, Adesc_col32);
+ transform(ltHandle, Bdev, ldb, Bdesc_col_major, B_col32_2r_4r4, 8 * 32,
+ Bdesc_col32_2r_4r4);
+
+ float *alpha;
+ cudaMallocManaged(&alpha, 4 * sizeof(float));
+ alpha[0] = 0;
+ alpha[1] = 1;
+ alpha[2] = 2;
+ alpha[3] = 3;
+
+ // Matmul
+  igemmlt<COL_AMPERE, 8, 1>(ltHandle, m, n, k, A_col32, B_col32_2r_4r4,
+                            C_col32, alpha, m * 32,
+                            ((n + 8 - 1) / 8) * 8 * 32, m * 32);
+
+ // Convert C
+ transform(ltHandle, C_col32, m * 32, Cdesc_col32, Cdev, ldc, Cdesc_col_major);
+ cudaStreamSynchronize(0);
+
+ // Check result
+ int8_t Chost[m * n];
+ cudaMemcpy(Chost, Cdev, m * n * sizeof(int8_t), cudaMemcpyDeviceToHost);
+
+ bool error = false;
+ int8_t C_ref[m * n] = {0, 17, 40, 69, 0, 6, 16, 30};
+ for (int i = 0; i < m * n; i++) {
+ if (Chost[i] != C_ref[i]) {
+ error = true;
+ break;
+ }
+ }
+ printf("c:\n");
+ for (int i = 0; i < m * n; i++)
+ printf("%d, ", Chost[i]);
+ printf("\n");
+
+ if (error) {
+ printf("error\n");
+ } else {
+ printf("success\n");
+ }
+
+ cublasLtDestroy(ltHandle);
+ cublasLtMatrixLayoutDestroy(Adesc_col32);
+ cublasLtMatrixLayoutDestroy(Bdesc_col32_2r_4r4);
+ cublasLtMatrixLayoutDestroy(Cdesc_col32);
+ cublasLtMatrixLayoutDestroy(Adesc_col_major);
+ cublasLtMatrixLayoutDestroy(Bdesc_col_major);
+ cublasLtMatrixLayoutDestroy(Cdesc_col_major);
+ cudaFree(Adev);
+ cudaFree(Bdev);
+ cudaFree(Cdev);
+ cudaFree(alpha);
+
+ return !error;
+}
+
+// clang-format off
+// A (4*3) B (2*3)
+// 6 10 14 5 -3 1
+// 7 11 15 4 -2 0
+// 8 12 16
+// 9 13 17
+//
+// alpha * A * op(B) = alpha * C = C
+// 0 6 10 14 5 4 0 14 4 0 0
+// 1 7 11 15 -3 -2 1 17 6 17 6
+// 2 8 12 16 1 0 2 20 8 40 16
+// 3 9 13 17 3 23 10 69 30
+//
+// alpha * A * op(B) = alpha * C = C
+// 1 6 10 14 5 4 1 14 4 14 4
+// 7 11 15 -3 -2 17 6 17 6
+// 8 12 16 1 0 20 8 20 8
+// 9 13 17 23 10 23 10
+// clang-format on
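+
+// A minimal host-side reference for the products above (an illustrative
+// sketch for readers; the tests do not call it): C = alpha * A * op(B) with
+// a col-major m x k A and an n x k B, so op(B) = B^T. row_alpha is the
+// per-row alpha vector used in test3/test6, or nullptr for alpha = 1.
+inline void ref_igemm(int m, int n, int k, const int8_t *A, const int8_t *B,
+                      int32_t *C, const float *row_alpha) {
+  for (int r = 0; r < m; r++)
+    for (int c = 0; c < n; c++) {
+      int32_t acc = 0;
+      for (int i = 0; i < k; i++)
+        acc += A[r + i * m] * B[c + i * n];
+      C[r + c * m] = row_alpha ? (int32_t)(row_alpha[r] * acc) : acc;
+    }
+}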
+
+int main() {
+ bool pass = true;
+ pass = test1() && pass;
+ pass = test2() && pass;
+ pass = test3() && pass;
+ pass = test4() && pass;
+ pass = test5() && pass;
+ pass = test6() && pass;
+ return pass ? 0 : 1;
+}
diff --git a/features/feature_case/cublasLt/transform.cu b/features/feature_case/cublasLt/transform.cu
new file mode 100644
index 000000000..3a1205add
--- /dev/null
+++ b/features/feature_case/cublasLt/transform.cu
@@ -0,0 +1,600 @@
+// ===------------ transform.cu -------------------------- *- CUDA -* ----=== //
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// ===--------------------------------------------------------------------=== //
+
+#include "cublasLt.h"
+#include <cstdio>
+#include <cstring>
+
+void transform(cublasLtHandle_t ltHandle, void *in, int ld_in,
+ cublasLtOrder_t order_in, void *out, int ld_out,
+ cublasLtOrder_t order_out, int dim1, int dim2) {
+ cublasLtMatrixLayout_t in_desc = NULL, out_desc = NULL;
+ cublasLtMatrixTransformDesc_t transform_desc = NULL;
+
+ cublasLtMatrixLayoutCreate(&in_desc, CUDA_R_8I, dim1, dim2, ld_in);
+ cublasLtMatrixLayoutCreate(&out_desc, CUDA_R_8I, dim1, dim2, ld_out);
+
+ cublasLtMatrixLayoutSetAttribute(in_desc, CUBLASLT_MATRIX_LAYOUT_ORDER,
+ &order_in, sizeof(order_in));
+ cublasLtMatrixLayoutSetAttribute(out_desc, CUBLASLT_MATRIX_LAYOUT_ORDER,
+ &order_out, sizeof(order_out));
+
+ cublasLtMatrixTransformDescCreate(&transform_desc, CUDA_R_32F);
+
+ float alpha = 1.0f, beta = 0.0f;
+ cublasLtMatrixTransform(ltHandle, transform_desc, &alpha, in, in_desc, &beta,
+ NULL, NULL, out, out_desc, 0);
+
+ cublasLtMatrixLayoutDestroy(in_desc);
+ cublasLtMatrixLayoutDestroy(out_desc);
+ cublasLtMatrixTransformDescDestroy(transform_desc);
+}
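+
+// Offset of element (r, c) in CUBLASLT_ORDER_COL32 for an m-row matrix with
+// ld = 32 * m -- an illustrative sketch inferred from the reference data in
+// test_COL32 below, not part of the cuBLASLt API:
+//   offset = (c / 32) * (32 * m) + r * 32 + (c % 32)
+inline int col32_offset(int r, int c, int m) {
+  return (c / 32) * (32 * m) + r * 32 + (c % 32);
+}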
+
+bool test_ROW() {
+ const constexpr int m = 2;
+ const constexpr int n = 33;
+ const constexpr int in_ld = 4;
+ void *in_dev;
+ cudaMalloc(&in_dev, n * in_ld * sizeof(int8_t));
+
+ int8_t in_host[n * in_ld];
+ int8_t value = 0;
+ for (int i = 0; i < n * in_ld; i++) {
+ if (i % 4 < 2) {
+ in_host[i] = value;
+ value++;
+ } else
+ in_host[i] = 99;
+ }
+ int8_t ref_2nd[n * in_ld];
+ std::memcpy(ref_2nd, in_host, n * in_ld * sizeof(int8_t));
+
+ cudaMemcpy(in_dev, in_host, n * in_ld * sizeof(int8_t),
+ cudaMemcpyHostToDevice);
+
+ cublasLtHandle_t ltHandle;
+  cublasLtCreate(&ltHandle);
+
+ void *out_dev;
+ const constexpr int out_ld = 36;
+ cudaMalloc(&out_dev, out_ld * m * sizeof(int8_t));
+ cudaMemset(out_dev, 0, out_ld * m * sizeof(int8_t));
+ transform(ltHandle, in_dev, in_ld, CUBLASLT_ORDER_COL, out_dev, out_ld,
+ CUBLASLT_ORDER_ROW, m, n);
+
+ int8_t out_host[out_ld * m];
+ cudaMemcpy(out_host, out_dev, out_ld * m * sizeof(int8_t),
+ cudaMemcpyDeviceToHost);
+
+ bool pass_1st = true;
+ int8_t ref_1st[out_ld * m] =
+ {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 0, 0, 0,
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 0, 0, 0};
+ for (int i = 0; i < out_ld * m; i++) {
+ if (i % out_ld < n) {
+ if (out_host[i] != ref_1st[i]) {
+ pass_1st = false;
+ break;
+ }
+ }
+ }
+
+ for (int i = 0; i < out_ld * m; i++) {
+ printf("%d, ", out_host[i]);
+ }
+ printf("\n");
+ if (pass_1st) {
+ printf("ROW 1st pass\n");
+ } else {
+ printf("ROW 1st fail\n");
+ }
+
+ cudaMemset(in_dev, 0, n * in_ld * sizeof(int8_t));
+ std::memset(in_host, 0, n * in_ld * sizeof(int8_t));
+ transform(ltHandle, out_dev, out_ld, CUBLASLT_ORDER_ROW, in_dev, in_ld,
+ CUBLASLT_ORDER_COL, m, n);
+ cudaMemcpy(in_host, in_dev, n * in_ld * sizeof(int8_t),
+ cudaMemcpyDeviceToHost);
+
+ bool pass_2nd = true;
+ for (int i = 0; i < n * in_ld; i++) {
+ if (i % in_ld < m) {
+ if (in_host[i] != ref_2nd[i]) {
+ pass_2nd = false;
+ break;
+ }
+ }
+ }
+
+ for (int i = 0; i < n * in_ld; i++) {
+ printf("%d, ", in_host[i]);
+ }
+ printf("\n");
+ if (pass_2nd) {
+ printf("ROW 2nd pass\n");
+ } else {
+ printf("ROW 2nd fail\n");
+ }
+
+ cublasLtDestroy(ltHandle);
+
+ return pass_1st && pass_2nd;
+}
+
+bool test_COL32() {
+ const constexpr int m = 2;
+ const constexpr int n = 33;
+ const constexpr int in_ld = 4;
+ void *in_dev;
+ cudaMalloc(&in_dev, n * in_ld * sizeof(int8_t));
+
+ int8_t in_host[n * in_ld];
+ int8_t value = 0;
+ for (int i = 0; i < n * in_ld; i++) {
+ if (i % 4 < 2) {
+ in_host[i] = value;
+ value++;
+ } else
+ in_host[i] = 99;
+ }
+ int8_t ref_2nd[n * in_ld];
+ std::memcpy(ref_2nd, in_host, n * in_ld * sizeof(int8_t));
+
+ cudaMemcpy(in_dev, in_host, n * in_ld * sizeof(int8_t),
+ cudaMemcpyHostToDevice);
+
+ cublasLtHandle_t ltHandle;
+  cublasLtCreate(&ltHandle);
+
+ void *out_dev;
+ const constexpr int out_ld = 64;
+ cudaMalloc(&out_dev, out_ld * m * sizeof(int8_t));
+ cudaMemset(out_dev, 0, out_ld * m * sizeof(int8_t));
+ transform(ltHandle, in_dev, in_ld, CUBLASLT_ORDER_COL, out_dev, out_ld,
+ CUBLASLT_ORDER_COL32, m, n);
+
+ int8_t out_host[out_ld * m];
+ cudaMemcpy(out_host, out_dev, out_ld * m * sizeof(int8_t),
+ cudaMemcpyDeviceToHost);
+
+ bool pass_1st = true;
+ int8_t ref_1st[out_ld * m] =
+ {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
+ 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 65, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ for (int i = 0; i < out_ld * m; i++) {
+ if (i % out_ld < n) {
+ if (out_host[i] != ref_1st[i]) {
+ pass_1st = false;
+ break;
+ }
+ }
+ }
+
+ for (int i = 0; i < out_ld * m; i++) {
+ printf("%d, ", out_host[i]);
+ }
+ printf("\n");
+ if (pass_1st) {
+ printf("COL32 1st pass\n");
+ } else {
+ printf("COL32 1st fail\n");
+ }
+
+ cudaMemset(in_dev, 0, n * in_ld * sizeof(int8_t));
+ std::memset(in_host, 0, n * in_ld * sizeof(int8_t));
+ transform(ltHandle, out_dev, out_ld, CUBLASLT_ORDER_COL32, in_dev, in_ld,
+ CUBLASLT_ORDER_COL, m, n);
+ cudaMemcpy(in_host, in_dev, n * in_ld * sizeof(int8_t),
+ cudaMemcpyDeviceToHost);
+
+ bool pass_2nd = true;
+ for (int i = 0; i < n * in_ld; i++) {
+ if (i % in_ld < m) {
+ if (in_host[i] != ref_2nd[i]) {
+ pass_2nd = false;
+ break;
+ }
+ }
+ }
+
+ for (int i = 0; i < n * in_ld; i++) {
+ printf("%d, ", in_host[i]);
+ }
+ printf("\n");
+ if (pass_2nd) {
+ printf("COL32 2nd pass\n");
+ } else {
+ printf("COL32 2nd fail\n");
+ }
+
+ cublasLtDestroy(ltHandle);
+
+ return pass_1st && pass_2nd;
+}
+
+bool test_COL4_4R2_8C() {
+ const constexpr int m = 2;
+ const constexpr int n = 33;
+ const constexpr int in_ld = 4;
+ void *in_dev;
+ cudaMalloc(&in_dev, n * in_ld * sizeof(int8_t));
+
+ int8_t in_host[n * in_ld];
+ int8_t value = 0;
+ for (int i = 0; i < n * in_ld; i++) {
+ if (i % 4 < 2) {
+ in_host[i] = value;
+ value++;
+ } else
+ in_host[i] = 99;
+ }
+ int8_t ref_2nd[n * in_ld];
+ std::memcpy(ref_2nd, in_host, n * in_ld * sizeof(int8_t));
+
+ cudaMemcpy(in_dev, in_host, n * in_ld * sizeof(int8_t),
+ cudaMemcpyHostToDevice);
+
+ cublasLtHandle_t ltHandle;
+  cublasLtCreate(&ltHandle);
+
+ void *out_dev;
+ const constexpr int out_ld = (32 * 8) * 2;
+ cudaMalloc(&out_dev, out_ld * m * sizeof(int8_t));
+ cudaMemset(out_dev, 0, out_ld * m * sizeof(int8_t));
+ transform(ltHandle, in_dev, in_ld, CUBLASLT_ORDER_COL, out_dev, out_ld,
+ CUBLASLT_ORDER_COL4_4R2_8C, m, n);
+
+ int8_t out_host[out_ld * m];
+ cudaMemcpy(out_host, out_dev, out_ld * m * sizeof(int8_t),
+ cudaMemcpyDeviceToHost);
+
+ bool pass_1st = true;
+ int8_t ref_1st[out_ld * m] =
+ {0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 16, 18, 20, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 24, 26, 28, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 32, 34, 36, 38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 40, 42, 44, 46, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 48, 50, 52, 54, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 56, 58, 60, 62, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 17, 19, 21, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 25, 27, 29, 31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 33, 35, 37, 39, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 41, 43, 45, 47, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 49, 51, 53, 55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 57, 59, 61, 63, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 65, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ for (int i = 0; i < out_ld * m; i++) {
+ if (i % out_ld < n) {
+ if (out_host[i] != ref_1st[i]) {
+ pass_1st = false;
+ break;
+ }
+ }
+ }
+
+ for (int i = 0; i < out_ld * m; i++) {
+ printf("%d, ", out_host[i]);
+ }
+ printf("\n");
+ if (pass_1st) {
+ printf("COL4_4R2_8C 1st pass\n");
+ } else {
+ printf("COL4_4R2_8C 1st fail\n");
+ }
+
+ cudaMemset(in_dev, 0, n * in_ld * sizeof(int8_t));
+ std::memset(in_host, 0, n * in_ld * sizeof(int8_t));
+ transform(ltHandle, out_dev, out_ld, CUBLASLT_ORDER_COL4_4R2_8C, in_dev,
+ in_ld, CUBLASLT_ORDER_COL, m, n);
+ cudaMemcpy(in_host, in_dev, n * in_ld * sizeof(int8_t),
+ cudaMemcpyDeviceToHost);
+
+ bool pass_2nd = true;
+ for (int i = 0; i < n * in_ld; i++) {
+ if (i % in_ld < m) {
+ if (in_host[i] != ref_2nd[i]) {
+ pass_2nd = false;
+ break;
+ }
+ }
+ }
+
+ for (int i = 0; i < n * in_ld; i++) {
+ printf("%d, ", in_host[i]);
+ }
+ printf("\n");
+ if (pass_2nd) {
+ printf("COL4_4R2_8C 2nd pass\n");
+ } else {
+ printf("COL4_4R2_8C 2nd fail\n");
+ }
+
+ cublasLtDestroy(ltHandle);
+
+ return pass_1st && pass_2nd;
+}
+
+bool test_COL32_2R_4R4() {
+ const constexpr int m = 2;
+ const constexpr int n = 33;
+ const constexpr int in_ld = 4;
+ void *in_dev;
+ cudaMalloc(&in_dev, n * in_ld * sizeof(int8_t));
+
+ int8_t in_host[n * in_ld];
+ int8_t value = 0;
+ for (int i = 0; i < n * in_ld; i++) {
+ if (i % 4 < 2) {
+ in_host[i] = value;
+ value++;
+ } else
+ in_host[i] = 99;
+ }
+ int8_t ref_2nd[n * in_ld];
+ std::memcpy(ref_2nd, in_host, n * in_ld * sizeof(int8_t));
+
+ cudaMemcpy(in_dev, in_host, n * in_ld * sizeof(int8_t),
+ cudaMemcpyHostToDevice);
+
+ cublasLtHandle_t ltHandle;
+  cublasLtCreate(&ltHandle);
+
+ void *out_dev;
+ const constexpr int out_ld = (32 * 32) * 2;
+ cudaMalloc(&out_dev, out_ld * m * sizeof(int8_t));
+ cudaMemset(out_dev, 0, out_ld * m * sizeof(int8_t));
+ transform(ltHandle, in_dev, in_ld, CUBLASLT_ORDER_COL, out_dev, out_ld,
+ CUBLASLT_ORDER_COL32_2R_4R4, m, n);
+
+ int8_t out_host[out_ld * m];
+ cudaMemcpy(out_host, out_dev, out_ld * m * sizeof(int8_t),
+ cudaMemcpyDeviceToHost);
+
+ bool pass_1st = true;
+ int8_t ref_1st[out_ld * m] =
+ {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 65, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ for (int i = 0; i < out_ld * m; i++) {
+ if (i % out_ld < n) {
+ if (out_host[i] != ref_1st[i]) {
+ pass_1st = false;
+ break;
+ }
+ }
+ }
+
+ for (int i = 0; i < out_ld * m; i++) {
+ printf("%d, ", out_host[i]);
+ }
+ printf("\n");
+ if (pass_1st) {
+ printf("COL32_2R_4R4 1st pass\n");
+ } else {
+ printf("COL32_2R_4R4 1st fail\n");
+ }
+
+ cudaMemset(in_dev, 0, n * in_ld * sizeof(int8_t));
+ std::memset(in_host, 0, n * in_ld * sizeof(int8_t));
+ transform(ltHandle, out_dev, out_ld, CUBLASLT_ORDER_COL32_2R_4R4, in_dev,
+ in_ld, CUBLASLT_ORDER_COL, m, n);
+ cudaMemcpy(in_host, in_dev, n * in_ld * sizeof(int8_t),
+ cudaMemcpyDeviceToHost);
+
+ bool pass_2nd = true;
+ for (int i = 0; i < n * in_ld; i++) {
+ if (i % in_ld < m) {
+ if (in_host[i] != ref_2nd[i]) {
+ pass_2nd = false;
+ break;
+ }
+ }
+ }
+
+ for (int i = 0; i < n * in_ld; i++) {
+ printf("%d, ", in_host[i]);
+ }
+ printf("\n");
+ if (pass_2nd) {
+ printf("COL32_2R_4R4 2nd pass\n");
+ } else {
+ printf("COL32_2R_4R4 2nd fail\n");
+ }
+
+ cublasLtDestroy(ltHandle);
+
+ return pass_1st && pass_2nd;
+}
+
+// Input col_major matrix:
+// 2 rows * 33 columns, ld is 4
+int main() {
+ bool pass = true;
+ pass = test_ROW() && pass;
+ pass = test_COL32() && pass;
+ pass = test_COL4_4R2_8C() && pass;
+ pass = test_COL32_2R_4R4() && pass;
+ return pass ? 0 : 1;
+}
diff --git a/features/feature_case/cusparse/cusparse_10.cu b/features/feature_case/cusparse/cusparse_10.cu
new file mode 100644
index 000000000..10b5abd61
--- /dev/null
+++ b/features/feature_case/cusparse/cusparse_10.cu
@@ -0,0 +1,266 @@
+// ===------- cusparse_10.cu ------------------------------- *- CUDA -* ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// ===----------------------------------------------------------------------===//
+
+#include "cusparse.h"
+
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+
+template <class d_data_t>
+struct Data {
+ float *h_data;
+ d_data_t *d_data;
+ int element_num;
+ Data(int element_num) : element_num(element_num) {
+ h_data = (float *)malloc(sizeof(float) * element_num);
+ memset(h_data, 0, sizeof(float) * element_num);
+ cudaMalloc(&d_data, sizeof(d_data_t) * element_num);
+ cudaMemset(d_data, 0, sizeof(d_data_t) * element_num);
+ }
+ Data(float *input_data, int element_num) : element_num(element_num) {
+ h_data = (float *)malloc(sizeof(float) * element_num);
+ cudaMalloc(&d_data, sizeof(d_data_t) * element_num);
+ cudaMemset(d_data, 0, sizeof(d_data_t) * element_num);
+ memcpy(h_data, input_data, sizeof(float) * element_num);
+ }
+ ~Data() {
+ free(h_data);
+ cudaFree(d_data);
+ }
+ void H2D() {
+ d_data_t *h_temp = (d_data_t *)malloc(sizeof(d_data_t) * element_num);
+ memset(h_temp, 0, sizeof(d_data_t) * element_num);
+ from_float_convert(h_data, h_temp);
+ cudaMemcpy(d_data, h_temp, sizeof(d_data_t) * element_num,
+ cudaMemcpyHostToDevice);
+ free(h_temp);
+ }
+ void D2H() {
+ d_data_t *h_temp = (d_data_t *)malloc(sizeof(d_data_t) * element_num);
+ memset(h_temp, 0, sizeof(d_data_t) * element_num);
+ cudaMemcpy(h_temp, d_data, sizeof(d_data_t) * element_num,
+ cudaMemcpyDeviceToHost);
+ to_float_convert(h_temp, h_data);
+ free(h_temp);
+ }
+
+private:
+ inline void from_float_convert(float *in, d_data_t *out) {
+ for (int i = 0; i < element_num; i++)
+ out[i] = in[i];
+ }
+ inline void to_float_convert(d_data_t *in, float *out) {
+ for (int i = 0; i < element_num; i++)
+ out[i] = in[i];
+ }
+};
+template <>
+inline void Data<float2>::from_float_convert(float *in, float2 *out) {
+ for (int i = 0; i < element_num; i++)
+ out[i].x = in[i];
+}
+template <>
+inline void Data<double2>::from_float_convert(float *in, double2 *out) {
+ for (int i = 0; i < element_num; i++)
+ out[i].x = in[i];
+}
+
+template <>
+inline void Data<float2>::to_float_convert(float2 *in, float *out) {
+ for (int i = 0; i < element_num; i++)
+ out[i] = in[i].x;
+}
+template <>
+inline void Data<double2>::to_float_convert(double2 *in, float *out) {
+ for (int i = 0; i < element_num; i++)
+ out[i] = in[i].x;
+}
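+
+// Note on the complex specializations above: H2D() stages through a zeroed
+// d_data_t buffer, so only the real part is populated and the imaginary part
+// stays 0; D2H() reads the real part back.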
+
+bool compare_result(float *expect, float *result, int element_num) {
+ for (int i = 0; i < element_num; i++) {
+ if (std::abs(result[i] - expect[i]) >= 0.05) {
+ return false;
+ }
+ }
+ return true;
+}
+
+bool compare_result(float *expect, float *result, std::vector<int> indices) {
+ for (int i = 0; i < indices.size(); i++) {
+ if (std::abs(result[indices[i]] - expect[indices[i]]) >= 0.05) {
+ return false;
+ }
+ }
+ return true;
+}
+
+bool test_passed = true;
+
+// A
+// 1 4 0 0 0
+// 0 2 3 0 0
+// 5 0 0 7 8
+// 0 0 9 0 6
+void test_cusparseSpMM_COO() {
+ std::vector<float> a_val_vec = {1, 4, 2, 3, 5, 7, 8, 9, 6};
+ Data<float> a_s_val(a_val_vec.data(), 9);
+ Data<double> a_d_val(a_val_vec.data(), 9);
+ Data<float2> a_c_val(a_val_vec.data(), 9);
+ Data<double2> a_z_val(a_val_vec.data(), 9);
+ std::vector<float> a_row_ptr_vec = {1, 1, 2, 2, 3, 3, 3, 4, 4};
+ Data<int> a_row_ptr(a_row_ptr_vec.data(), 9);
+ std::vector<float> a_col_ind_vec = {1, 2, 2, 3, 1, 4, 5, 3, 5};
+ Data<int> a_col_ind(a_col_ind_vec.data(), 9);
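+ // Row/column indices are one-based, matching CUSPARSE_INDEX_BASE_ONE in
+ // the cusparseCreateCoo() calls below.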
+
+ std::vector<float> b_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ Data<float> b_s(b_vec.data(), 10);
+ Data<double> b_d(b_vec.data(), 10);
+ Data<float2> b_c(b_vec.data(), 10);
+ Data<double2> b_z(b_vec.data(), 10);
+
+ Data<float> c_s(8);
+ Data<double> c_d(8);
+ Data<float2> c_c(8);
+ Data<double2> c_z(8);
+
+ float alpha = 10;
+ Data<float> alpha_s(&alpha, 1);
+ Data<double> alpha_d(&alpha, 1);
+ Data<float2> alpha_c(&alpha, 1);
+ Data<double2> alpha_z(&alpha, 1);
+
+ float beta = 0;
+ Data<float> beta_s(&beta, 1);
+ Data<double> beta_d(&beta, 1);
+ Data<float2> beta_c(&beta, 1);
+ Data<double2> beta_z(&beta, 1);
+
+ cusparseHandle_t handle;
+ cusparseCreate(&handle);
+
+ cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_DEVICE);
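+ // Device pointer mode: alpha and beta are read from device memory, hence
+ // the Data wrappers and the H2D copies below.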
+
+ a_s_val.H2D();
+ a_d_val.H2D();
+ a_c_val.H2D();
+ a_z_val.H2D();
+ a_row_ptr.H2D();
+ a_col_ind.H2D();
+ b_s.H2D();
+ b_d.H2D();
+ b_c.H2D();
+ b_z.H2D();
+ alpha_s.H2D();
+ alpha_d.H2D();
+ alpha_c.H2D();
+ alpha_z.H2D();
+ beta_s.H2D();
+ beta_d.H2D();
+ beta_c.H2D();
+ beta_z.H2D();
+
+ cusparseSpMatDescr_t a_descr_s;
+ cusparseSpMatDescr_t a_descr_d;
+ cusparseSpMatDescr_t a_descr_c;
+ cusparseSpMatDescr_t a_descr_z;
+ cusparseCreateCoo(&a_descr_s, 4, 5, 9, a_row_ptr.d_data, a_col_ind.d_data, a_s_val.d_data, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ONE, CUDA_R_32F);
+ cusparseCreateCoo(&a_descr_d, 4, 5, 9, a_row_ptr.d_data, a_col_ind.d_data, a_d_val.d_data, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ONE, CUDA_R_64F);
+ cusparseCreateCoo(&a_descr_c, 4, 5, 9, a_row_ptr.d_data, a_col_ind.d_data, a_c_val.d_data, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ONE, CUDA_C_32F);
+ cusparseCreateCoo(&a_descr_z, 4, 5, 9, a_row_ptr.d_data, a_col_ind.d_data, a_z_val.d_data, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ONE, CUDA_C_64F);
+
+ cusparseDnMatDescr_t b_descr_s;
+ cusparseDnMatDescr_t b_descr_d;
+ cusparseDnMatDescr_t b_descr_c;
+ cusparseDnMatDescr_t b_descr_z;
+ cusparseCreateDnMat(&b_descr_s, 5, 2, 5, b_s.d_data, CUDA_R_32F, CUSPARSE_ORDER_COL);
+ cusparseCreateDnMat(&b_descr_d, 5, 2, 5, b_d.d_data, CUDA_R_64F, CUSPARSE_ORDER_COL);
+ cusparseCreateDnMat(&b_descr_c, 5, 2, 5, b_c.d_data, CUDA_C_32F, CUSPARSE_ORDER_COL);
+ cusparseCreateDnMat(&b_descr_z, 5, 2, 5, b_z.d_data, CUDA_C_64F, CUSPARSE_ORDER_COL);
+
+ cusparseDnMatDescr_t c_descr_s;
+ cusparseDnMatDescr_t c_descr_d;
+ cusparseDnMatDescr_t c_descr_c;
+ cusparseDnMatDescr_t c_descr_z;
+ cusparseCreateDnMat(&c_descr_s, 4, 2, 4, c_s.d_data, CUDA_R_32F, CUSPARSE_ORDER_COL);
+ cusparseCreateDnMat(&c_descr_d, 4, 2, 4, c_d.d_data, CUDA_R_64F, CUSPARSE_ORDER_COL);
+ cusparseCreateDnMat(&c_descr_c, 4, 2, 4, c_c.d_data, CUDA_C_32F, CUSPARSE_ORDER_COL);
+ cusparseCreateDnMat(&c_descr_z, 4, 2, 4, c_z.d_data, CUDA_C_64F, CUSPARSE_ORDER_COL);
+
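+ // Generic SpMM workflow, once per data type: query the workspace size,
+ // allocate it, run the optional preprocess step, then execute SpMM.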
+ size_t ws_size_s;
+ size_t ws_size_d;
+ size_t ws_size_c;
+ size_t ws_size_z;
+ cusparseSpMM_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_s.d_data, a_descr_s, b_descr_s, beta_s.d_data, c_descr_s, CUDA_R_32F, CUSPARSE_SPMM_ALG_DEFAULT, &ws_size_s);
+ cusparseSpMM_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_d.d_data, a_descr_d, b_descr_d, beta_d.d_data, c_descr_d, CUDA_R_64F, CUSPARSE_SPMM_ALG_DEFAULT, &ws_size_d);
+ cusparseSpMM_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_c.d_data, a_descr_c, b_descr_c, beta_c.d_data, c_descr_c, CUDA_C_32F, CUSPARSE_SPMM_ALG_DEFAULT, &ws_size_c);
+ cusparseSpMM_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_z.d_data, a_descr_z, b_descr_z, beta_z.d_data, c_descr_z, CUDA_C_64F, CUSPARSE_SPMM_ALG_DEFAULT, &ws_size_z);
+
+ void *ws_s = nullptr;
+ void *ws_d = nullptr;
+ void *ws_c = nullptr;
+ void *ws_z = nullptr;
+ cudaMalloc(&ws_s, ws_size_s);
+ cudaMalloc(&ws_d, ws_size_d);
+ cudaMalloc(&ws_c, ws_size_c);
+ cudaMalloc(&ws_z, ws_size_z);
+
+ cusparseSpMM_preprocess(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_s.d_data, a_descr_s, b_descr_s, beta_s.d_data, c_descr_s, CUDA_R_32F, CUSPARSE_SPMM_ALG_DEFAULT, ws_s);
+ cusparseSpMM_preprocess(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_d.d_data, a_descr_d, b_descr_d, beta_d.d_data, c_descr_d, CUDA_R_64F, CUSPARSE_SPMM_ALG_DEFAULT, ws_d);
+ cusparseSpMM_preprocess(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_c.d_data, a_descr_c, b_descr_c, beta_c.d_data, c_descr_c, CUDA_C_32F, CUSPARSE_SPMM_ALG_DEFAULT, ws_c);
+ cusparseSpMM_preprocess(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_z.d_data, a_descr_z, b_descr_z, beta_z.d_data, c_descr_z, CUDA_C_64F, CUSPARSE_SPMM_ALG_DEFAULT, ws_z);
+ cusparseSpMM(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_s.d_data, a_descr_s, b_descr_s, beta_s.d_data, c_descr_s, CUDA_R_32F, CUSPARSE_SPMM_ALG_DEFAULT, ws_s);
+ cusparseSpMM(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_d.d_data, a_descr_d, b_descr_d, beta_d.d_data, c_descr_d, CUDA_R_64F, CUSPARSE_SPMM_ALG_DEFAULT, ws_d);
+ cusparseSpMM(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_c.d_data, a_descr_c, b_descr_c, beta_c.d_data, c_descr_c, CUDA_C_32F, CUSPARSE_SPMM_ALG_DEFAULT, ws_c);
+ cusparseSpMM(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_z.d_data, a_descr_z, b_descr_z, beta_z.d_data, c_descr_z, CUDA_C_64F, CUSPARSE_SPMM_ALG_DEFAULT, ws_z);
+
+ c_s.D2H();
+ c_d.D2H();
+ c_c.D2H();
+ c_z.D2H();
+
+ cudaStreamSynchronize(0);
+
+ cudaFree(ws_s);
+ cudaFree(ws_d);
+ cudaFree(ws_c);
+ cudaFree(ws_z);
+ cusparseDestroySpMat(a_descr_s);
+ cusparseDestroySpMat(a_descr_d);
+ cusparseDestroySpMat(a_descr_c);
+ cusparseDestroySpMat(a_descr_z);
+ cusparseDestroyDnMat(b_descr_s);
+ cusparseDestroyDnMat(b_descr_d);
+ cusparseDestroyDnMat(b_descr_c);
+ cusparseDestroyDnMat(b_descr_z);
+ cusparseDestroyDnMat(c_descr_s);
+ cusparseDestroyDnMat(c_descr_d);
+ cusparseDestroyDnMat(c_descr_c);
+ cusparseDestroyDnMat(c_descr_z);
+ cusparseDestroy(handle);
+
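+ // C = alpha * A * B with col-major B (5x2, ld=5) and C (4x2, ld=4), e.g.
+ // c[0] = 10 * (1*1 + 4*2) = 90 and c[1] = 10 * (2*2 + 3*3) = 130.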
+ float expect_c[8] = {90, 130, 730, 570, 340, 380, 1730, 1320};
+ if (compare_result(expect_c, c_s.h_data, 8) &&
+ compare_result(expect_c, c_d.h_data, 8) &&
+ compare_result(expect_c, c_c.h_data, 8) &&
+ compare_result(expect_c, c_z.h_data, 8))
+ printf("SpMM_COO pass\n");
+ else {
+ printf("SpMM_COO fail\n");
+ test_passed = false;
+ }
+}
+
+int main() {
+ test_cusparseSpMM_COO();
+
+ if (test_passed)
+ return 0;
+ return -1;
+}
diff --git a/features/feature_case/device_info/device_info.cu b/features/feature_case/device_info/device_info.cu
index d8e5abaf7..cd5a6f8fc 100644
--- a/features/feature_case/device_info/device_info.cu
+++ b/features/feature_case/device_info/device_info.cu
@@ -28,6 +28,7 @@ void test1() {
auto maxTexture1D = properties.maxTexture1D;
auto maxTexture2D = properties.maxTexture2D;
auto maxTexture3D = properties.maxTexture3D;
+ auto regsPerBlock = properties.regsPerBlock;
std::cout << "Device ID: " << id << std::endl;
std::cout << "Device UUID: ";
@@ -43,6 +44,7 @@ void test1() {
std::cout << "Device maxTexture3D: " << maxTexture3D[0] << " "
<< properties.maxTexture3D[1] << " " << maxTexture3D[2]
<< std::endl;
+ std::cout << "Device regsPerBlock: " << regsPerBlock << std::endl;
}
int main() {
diff --git a/features/feature_case/math/math-int.cu b/features/feature_case/math/math-int.cu
new file mode 100644
index 000000000..5c09e092b
--- /dev/null
+++ b/features/feature_case/math/math-int.cu
@@ -0,0 +1,196 @@
+// ===--------------- math-int.cu ---------- *- CUDA -* -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//
+// ===---------------------------------------------------------------------===//
+
+#include <iostream>
+#include <vector>
+
+using namespace std;
+
+int passed = 0;
+int failed = 0;
+
+void check(bool IsPassed) {
+ if (IsPassed) {
+ cout << " ---- passed" << endl;
+ passed++;
+ } else {
+ cout << " ---- failed" << endl;
+ failed++;
+ }
+}
+
+template <class T> void printFunc(const string &FuncName, const T &Input) {
+ cout << FuncName << "(" << Input << ") ";
+}
+
+template <class T1, class T2>
+void printFunc(const string &FuncName, const pair<T1, T2> &Input) {
+ cout << FuncName << "(" << Input.first << ", " << Input.second << ")";
+}
+
+template <class T> void checkResult(const T &Expect, const T &Result) {
+ cout << " = " << Result << " (expect " << Expect << ")";
+ check(Result == Expect);
+}
+
+// Integer Mathematical Functions
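+// llmax/llmin, ullmax/ullmin and umax/umin are device-side integer math
+// functions, so each is wrapped in a single-thread kernel and driven by a
+// host helper that checks the managed-memory result after
+// cudaDeviceSynchronize().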
+
+__global__ void _llmax(long long *const Result, long long Input1,
+ long long Input2) {
+ *Result = llmax(Input1, Input2);
+}
+
+void testLlmaxCases(
+ const vector<pair<pair<long long, long long>, long long>> &TestCases) {
+ long long *Result;
+ cudaMallocManaged(&Result, sizeof(*Result));
+ for (const auto &TestCase : TestCases) {
+ _llmax<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+ cudaDeviceSynchronize();
+ printFunc("llmax", TestCase.first);
+ checkResult(TestCase.second, *Result);
+ }
+}
+
+__global__ void _llmin(long long *const Result, long long Input1,
+ long long Input2) {
+ *Result = llmin(Input1, Input2);
+}
+
+void testLlminCases(
+ const vector<pair<pair<long long, long long>, long long>> &TestCases) {
+ long long *Result;
+ cudaMallocManaged(&Result, sizeof(*Result));
+ for (const auto &TestCase : TestCases) {
+ _llmin<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+ cudaDeviceSynchronize();
+ printFunc("llmin", TestCase.first);
+ checkResult(TestCase.second, *Result);
+ }
+}
+
+__global__ void _ullmax(unsigned long long *const Result,
+ unsigned long long Input1, unsigned long long Input2) {
+ *Result = ullmax(Input1, Input2);
+}
+
+void testUllmaxCases(
+ const vector<pair<pair<unsigned long long, unsigned long long>,
+ unsigned long long>> &TestCases) {
+ unsigned long long *Result;
+ cudaMallocManaged(&Result, sizeof(*Result));
+ for (const auto &TestCase : TestCases) {
+ _ullmax<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+ cudaDeviceSynchronize();
+ printFunc("ullmax", TestCase.first);
+ checkResult(TestCase.second, *Result);
+ }
+}
+
+__global__ void _ullmin(unsigned long long *const Result,
+ unsigned long long Input1, unsigned long long Input2) {
+ *Result = ullmin(Input1, Input2);
+}
+
+void testUllminCases(
+ const vector<pair<pair<unsigned long long, unsigned long long>,
+ unsigned long long>> &TestCases) {
+ unsigned long long *Result;
+ cudaMallocManaged(&Result, sizeof(*Result));
+ for (const auto &TestCase : TestCases) {
+ _ullmin<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+ cudaDeviceSynchronize();
+ printFunc("ullmin", TestCase.first);
+ checkResult(TestCase.second, *Result);
+ }
+}
+
+__global__ void _umax(unsigned *const Result, unsigned Input1,
+ unsigned Input2) {
+ *Result = umax(Input1, Input2);
+}
+
+void testUmaxCases(
+ const vector<pair<pair<unsigned, unsigned>, unsigned>> &TestCases) {
+ unsigned *Result;
+ cudaMallocManaged(&Result, sizeof(*Result));
+ for (const auto &TestCase : TestCases) {
+ _umax<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+ cudaDeviceSynchronize();
+ printFunc("umax", TestCase.first);
+ checkResult(TestCase.second, *Result);
+ }
+}
+
+__global__ void _umin(unsigned *const Result, unsigned Input1,
+ unsigned Input2) {
+ *Result = umin(Input1, Input2);
+}
+
+void testUminCases(
+ const vector<pair<pair<unsigned, unsigned>, unsigned>> &TestCases) {
+ unsigned *Result;
+ cudaMallocManaged(&Result, sizeof(*Result));
+ for (const auto &TestCase : TestCases) {
+ _umin<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+ cudaDeviceSynchronize();
+ printFunc("umin", TestCase.first);
+ checkResult(TestCase.second, *Result);
+ }
+}
+
+int main() {
+ testLlmaxCases({
+ {{1, 2}, 2},
+ {{-1, -2}, -1},
+ {{1, -2}, 1},
+ {{-1, 2}, 2},
+ {{45212221678, 221332142421}, 221332142421},
+ });
+ testLlminCases({
+ {{1, 2}, 1},
+ {{-1, -2}, -2},
+ {{1, -2}, -2},
+ {{-1, 2}, -1},
+ {{45212221678, 221332142421}, 45212221678},
+ });
+ testUllmaxCases({
+ {{1, 2}, 2},
+ {{18446744073709551615ULL, 18446744073709551614ULL}, 18446744073709551615ULL},
+ {{1, 18446744073709551614ULL}, 18446744073709551614ULL},
+ {{18446744073709551615ULL, 2}, 18446744073709551615ULL},
+ {{45212221678, 221332142421}, 221332142421},
+ });
+ testUllminCases({
+ {{1, 2}, 1},
+ {{18446744073709551615ULL, 18446744073709551614ULL}, 18446744073709551614ULL},
+ {{1, 18446744073709551614ULL}, 1},
+ {{18446744073709551615ULL, 2}, 2},
+ {{45212221678, 221332142421}, 45212221678},
+ });
+ testUmaxCases({
+ {{1, 2}, 2},
+ {{4294967295, 4294967294}, 4294967295},
+ {{1, 4294967294}, 4294967294},
+ {{4294967295, 2}, 4294967295},
+ {{2262548718, 2288810325}, 2288810325},
+ });
+ testUminCases({
+ {{1, 2}, 1},
+ {{4294967295, 4294967294}, 4294967294},
+ {{1, 4294967294}, 1},
+ {{4294967295, 2}, 2},
+ {{2262548718, 2288810325}, 2262548718},
+ });
+ cout << "passed " << passed << "/" << passed + failed << " cases!" << endl;
+ if (failed) {
+ cout << "failed!" << endl;
+ }
+ return failed;
+}
diff --git a/features/features.xml b/features/features.xml
index 971570894..ff7838f2e 100644
--- a/features/features.xml
+++ b/features/features.xml
@@ -147,6 +147,7 @@
+
@@ -282,6 +283,7 @@
+
@@ -318,10 +320,10 @@
-
-
-
-
+
+
+
+
@@ -340,5 +342,7 @@
+
+
diff --git a/features/test_feature.py b/features/test_feature.py
index 2ef527bf3..bbd6e08c9 100644
--- a/features/test_feature.py
+++ b/features/test_feature.py
@@ -36,10 +36,10 @@
'math-ext-bf16-conv', 'math-ext-double', 'math-ext-float', 'math-ext-half', 'math-ext-half-after11', 'math-ext-half-conv', 'math-ext-half2', 'math-ext-half2-after11', 'math-ext-simd', 'cudnn-activation',
'cudnn-fill', 'cudnn-lrn', 'cudnn-memory', 'cudnn-pooling', 'cudnn-reorder', 'cudnn-scale', 'cudnn-softmax',
'cudnn-sum', 'math-funnelshift', 'thrust-sort_by_key', 'thrust-find', 'thrust-inner_product', 'thrust-reduce_by_key',
- 'math-bf16-conv', 'math-emu-bf16-conv-double', 'math-ext-bf16-conv-double', 'math-half-conv',
+ 'math-bf16-conv', 'math-emu-bf16-conv-double', 'math-ext-bf16-conv-double', 'math-half-conv', 'math-int',
'math-bfloat16', 'libcu_atomic', 'test_shared_memory', 'cudnn-reduction', 'cudnn-binary', 'cudnn-bnp1', 'cudnn-bnp2', 'cudnn-bnp3',
'cudnn-normp1', 'cudnn-normp2', 'cudnn-normp3', 'cudnn-convp1', 'cudnn-convp2', 'cudnn-convp3', 'cudnn-convp4', 'cudnn-convp5', 'cudnn-convp6', 'cudnn-convp7',
- 'cudnn_mutilple_files', "cusparse_1", "cusparse_2", "cusparse_3", "cusparse_4", "cusparse_5", "cusparse_6", "cusparse_7", "cusparse_8", "cusparse_9",
+ 'cudnn_mutilple_files', "cusparse_1", "cusparse_2", "cusparse_3", "cusparse_4", "cusparse_5", "cusparse_6", "cusparse_7", "cusparse_8", "cusparse_9", "cusparse_10",
'cudnn-GetErrorString', 'cub_device_histgram', 'peer_access', 'driver_err_handle',
'cudnn-types', 'cudnn-version', 'cudnn-dropout', 'const_opt',
'constant_attr', 'sync_warp_p2', 'occupancy_calculation',
@@ -60,7 +60,7 @@
'thrust_swap_ranges', 'thrust_uninitialized_fill_n', 'thrust_equal', 'system_atomic', 'thrust_detail_types',
'operator_eq', 'operator_neq', 'operator_lege', 'thrust_system', 'thrust_reverse_copy',
'thrust_device_new_delete', 'thrust_temporary_buffer', 'thrust_malloc_free', 'codepin', 'thrust_unique_count',
- 'thrust_advance_trans_op_itr', 'cuda_stream_query']
+ 'thrust_advance_trans_op_itr', 'cuda_stream_query', "matmul", "transform"]
occupancy_calculation_exper = ['occupancy_calculation']
@@ -166,7 +166,7 @@ def build_test():
'cudnn-binary', 'cudnn-bnp1', 'cudnn-bnp2', 'cudnn-bnp3', 'cudnn-normp1', 'cudnn-normp2', 'cudnn-normp3',
'cudnn-convp1', 'cudnn-convp2', 'cudnn-convp3', 'cudnn-convp4', 'cudnn-convp5', 'cudnn-convp6', 'cudnn-rnn',
'cudnn-GetErrorString', 'cudnn-convp7',
- 'cudnn-types', 'cudnn-version', 'cudnn-dropout'
+ 'cudnn-types', 'cudnn-version', 'cudnn-dropout', 'matmul'
]
no_fast_math_tests = ['math-emu-half-after11', 'math-emu-half2-after11', 'math-ext-half-after11', 'math-ext-half2-after11',
diff --git a/help_function/help_function.xml b/help_function/help_function.xml
index d35b308c9..fc047e53e 100644
--- a/help_function/help_function.xml
+++ b/help_function/help_function.xml
@@ -87,7 +87,7 @@
-
+
@@ -215,5 +215,6 @@
+
diff --git a/help_function/src/blas_gemm_utils_interface.cpp b/help_function/src/blas_gemm_utils_interface.cpp
new file mode 100644
index 000000000..551021b31
--- /dev/null
+++ b/help_function/src/blas_gemm_utils_interface.cpp
@@ -0,0 +1,152 @@
+// ===------ blas_gemm_utils_interface.cpp ----------------- *- C++ -* ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// ===--------------------------------------------------------------------=== //
+
+#include <sycl/sycl.hpp>
+#include <dpct/dpct.hpp>
+#include <dpct/blas_gemm_utils.hpp>
+#include <cstdint>
+
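+// Interface-only smoke test: main() never calls the foo*() functions, so the
+// file just has to compile against the dpct::blas_gemm::experimental API.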
+void foo1 () {
+ dpct::blas_gemm::experimental::descriptor_ptr ltHandle;
+ ltHandle = new dpct::blas_gemm::experimental::descriptor();
+ delete (ltHandle);
+
+ dpct::blas_gemm::experimental::matrix_layout_ptr matLayout;
+ dpct::library_data_t type;
+ uint64_t rows;
+ uint64_t cols;
+ int64_t ld;
+ matLayout =
+ new dpct::blas_gemm::experimental::matrix_layout_t(type, rows, cols, ld);
+
+ dpct::blas_gemm::experimental::matrix_layout_t::attribute attr1;
+ void *buf1;
+ size_t sizeInBytes1;
+ size_t *sizeWritten1;
+ matLayout->get_attribute(attr1, buf1);
+ matLayout->set_attribute(attr1, buf1);
+ delete (matLayout);
+
+ dpct::blas_gemm::experimental::matmul_desc_ptr matmulDesc;
+ dpct::compute_type computeType;
+ dpct::library_data_t scaleType;
+ matmulDesc =
+ new dpct::blas_gemm::experimental::matmul_desc_t(computeType, scaleType);
+
+ dpct::blas_gemm::experimental::matmul_desc_t::attribute attr2;
+ void *buf2;
+ size_t sizeInBytes2;
+ size_t *sizeWritten2;
+ matmulDesc->get_attribute(attr2, buf2);
+ matmulDesc->set_attribute(attr2, buf2);
+ delete (matmulDesc);
+
+ int matmulPreference;
+ void *buf3;
+ size_t sizeInBytes3;
+ size_t *sizeWritten3;
+
+ dpct::blas_gemm::experimental::matrix_layout_ptr Adesc;
+ dpct::blas_gemm::experimental::matrix_layout_ptr Bdesc;
+ dpct::blas_gemm::experimental::matrix_layout_ptr Cdesc;
+ dpct::blas_gemm::experimental::matrix_layout_ptr Ddesc;
+
+ int requestedAlgoCount = 1;
+ int heuristicResultsArray;
+ int returnAlgoCount;
+ returnAlgoCount = 1;
+}
+
+void foo2() {
+ dpct::blas_gemm::experimental::descriptor_ptr lightHandle;
+ dpct::blas_gemm::experimental::matmul_desc_ptr computeDesc;
+ const void *alpha;
+ const void *A;
+ dpct::blas_gemm::experimental::matrix_layout_ptr Adesc;
+ const void *B;
+ dpct::blas_gemm::experimental::matrix_layout_ptr Bdesc;
+ const void *beta;
+ const void *C;
+ dpct::blas_gemm::experimental::matrix_layout_ptr Cdesc;
+ void *D;
+ dpct::blas_gemm::experimental::matrix_layout_ptr Ddesc;
+ const int *algo;
+ void *workspace;
+ size_t workspaceSizeInBytes;
+ dpct::queue_ptr stream;
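+ // algo/workspace mirror the cublasLtMatmul argument list but are not part
+ // of the dpct matmul interface, so they are intentionally unused here.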
+ dpct::blas_gemm::experimental::matmul(lightHandle, computeDesc, alpha, A,
+ Adesc, B, Bdesc, beta, C, Cdesc, D,
+ Ddesc, stream);
+}
+
+void foo3() {
+ dpct::blas_gemm::experimental::order_t a;
+ a = dpct::blas_gemm::experimental::order_t::col;
+ a = dpct::blas_gemm::experimental::order_t::row;
+ a = dpct::blas_gemm::experimental::order_t::col32;
+ a = dpct::blas_gemm::experimental::order_t::col4_4r2_8c;
+ a = dpct::blas_gemm::experimental::order_t::col32_2r_4r4;
+
+ dpct::blas_gemm::experimental::pointer_mode_t b;
+ b = dpct::blas_gemm::experimental::pointer_mode_t::host;
+ b = dpct::blas_gemm::experimental::pointer_mode_t::device;
+ b = dpct::blas_gemm::experimental::pointer_mode_t::device_vector;
+ b = dpct::blas_gemm::experimental::pointer_mode_t::
+ alpha_device_vector_beta_zero;
+ b = dpct::blas_gemm::experimental::pointer_mode_t::
+ alpha_device_vector_beta_host;
+
+ dpct::blas_gemm::experimental::matrix_layout_t::attribute c;
+ c = dpct::blas_gemm::experimental::matrix_layout_t::attribute::type;
+ c = dpct::blas_gemm::experimental::matrix_layout_t::attribute::order;
+ c = dpct::blas_gemm::experimental::matrix_layout_t::attribute::rows;
+ c = dpct::blas_gemm::experimental::matrix_layout_t::attribute::cols;
+ c = dpct::blas_gemm::experimental::matrix_layout_t::attribute::ld;
+
+ dpct::blas_gemm::experimental::matmul_desc_t::attribute d;
+ d = dpct::blas_gemm::experimental::matmul_desc_t::attribute::compute_type;
+ d = dpct::blas_gemm::experimental::matmul_desc_t::attribute::scale_type;
+ d = dpct::blas_gemm::experimental::matmul_desc_t::attribute::pointer_mode;
+ d = dpct::blas_gemm::experimental::matmul_desc_t::attribute::trans_a;
+ d = dpct::blas_gemm::experimental::matmul_desc_t::attribute::trans_b;
+ d = dpct::blas_gemm::experimental::matmul_desc_t::attribute::trans_c;
+ d = dpct::blas_gemm::experimental::matmul_desc_t::attribute::epilogue;
+}
+
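+// foo4 covers the matrix-transform path: create a transform descriptor, set
+// and read back its trans_a attribute, then call matrix_transform().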
+void foo4() {
+ dpct::blas_gemm::experimental::transform_desc_ptr transformDesc;
+ dpct::library_data_t scaleType;
+ transformDesc =
+ new dpct::blas_gemm::experimental::transform_desc_t(scaleType);
+ oneapi::mkl::transpose opT = oneapi::mkl::transpose::trans;
+ size_t sizeWritten;
+ transformDesc->set_attribute(
+ dpct::blas_gemm::experimental::transform_desc_t::attribute::trans_a,
+ &opT);
+ transformDesc->get_attribute(
+ dpct::blas_gemm::experimental::transform_desc_t::attribute::trans_a,
+ &opT);
+ delete (transformDesc);
+
+ dpct::blas_gemm::experimental::descriptor_ptr lightHandle;
+ const void *alpha;
+ const void *A;
+ dpct::blas_gemm::experimental::matrix_layout_ptr Adesc;
+ const void *beta;
+ const void *B;
+ dpct::blas_gemm::experimental::matrix_layout_ptr Bdesc;
+ void *C;
+ dpct::blas_gemm::experimental::matrix_layout_ptr Cdesc;
+ dpct::queue_ptr stream;
+ dpct::blas_gemm::experimental::matrix_transform(
+ transformDesc, alpha, A, Adesc, beta, B, Bdesc, C, Cdesc, stream);
+}
+
+int main() {
+ return 0;
+}
diff --git a/help_function/src/onedpl_test_reduce.cpp b/help_function/src/onedpl_test_reduce.cpp
index f12b5ff5e..2cf06eea5 100644
--- a/help_function/src/onedpl_test_reduce.cpp
+++ b/help_function/src/onedpl_test_reduce.cpp
@@ -18,7 +18,7 @@
#include
#include
-
+#include <limits>
@@ -378,6 +378,72 @@ int main() {
}
+ // Testing calls to dpct::argmin and dpct::argmax functors with unique and equivalent values
+ {
+ auto queue = dpct::get_default_queue();
+ test_name = "oneapi::dpl::reduce with dpct::argmin functor - All values are unique";
+ std::size_t n = 10;
+ sycl::buffer<dpct::key_value_pair<int, int>> input(n);
+ {
+ auto host_acc = input.get_host_access();
+ for (std::size_t i = 0; i < n; ++i)
+ host_acc[i].key = i + 10, host_acc[i].value = i + 20;
+ // inject min and max
+ host_acc[4].key = 9;
+ host_acc[4].value = 8;
+ host_acc[6].key = 101;
+ host_acc[6].value = 99;
+ }
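+ // Base values are 20..29, so the injected entries (value 8 at index 4,
+ // value 99 at index 6) are strict min/max winners.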
+ auto argmin_res = oneapi::dpl::reduce(dpl::execution::make_device_policy(queue),
+ oneapi::dpl::begin(input),
+ oneapi::dpl::end(input),
+ dpct::key_value_pair<int, int>(std::numeric_limits<int>::max(),
+ std::numeric_limits<int>::max()),
+ dpct::argmin());
+
+ failed_tests += ASSERT_EQUAL(test_name, argmin_res.key, 9);
+ failed_tests += ASSERT_EQUAL(test_name, argmin_res.value, 8);
+
+ test_name = "oneapi::dpl::reduce with dpct::argmax functor - All values are unique";
+ auto argmax_res = oneapi::dpl::reduce(dpl::execution::make_device_policy(queue),
+ oneapi::dpl::begin(input),
+ oneapi::dpl::end(input),
+ dpct::key_value_pair<int, int>(std::numeric_limits<int>::min(),
+ std::numeric_limits<int>::min()),
+ dpct::argmax());
+
+ failed_tests += ASSERT_EQUAL(test_name, argmax_res.key, 101);
+ failed_tests += ASSERT_EQUAL(test_name, argmax_res.value, 99);
+
+ test_name = "oneapi::dpl::reduce with dpct::argmin functor - All values are the same";
+ {
+ auto host_acc = input.get_host_access();
+ for (std::size_t i = 0; i < n; ++i)
+ host_acc[i].key = i + 30, host_acc[i].value = 2;
+ }
+ // Expect the key_value_pair with the lower key to be returned when value compares equal
+ argmin_res = oneapi::dpl::reduce(dpl::execution::make_device_policy(queue),
+ oneapi::dpl::begin(input),
+ oneapi::dpl::end(input),
+ dpct::key_value_pair<int, int>(std::numeric_limits<int>::max(),
+ std::numeric_limits<int>::max()),
+ dpct::argmin());
+
+ failed_tests += ASSERT_EQUAL(test_name, argmin_res.key, 30);
+ failed_tests += ASSERT_EQUAL(test_name, argmin_res.value, 2);
+
+ argmax_res = oneapi::dpl::reduce(dpl::execution::make_device_policy(queue),
+ oneapi::dpl::begin(input),
+ oneapi::dpl::end(input),
+ dpct::key_value_pair<int, int>(std::numeric_limits<int>::min(),
+ std::numeric_limits<int>::min()),
+ dpct::argmax());
+
+ // Expect the key_value_pair with the lower key to be returned when value compares equal
+ failed_tests += ASSERT_EQUAL(test_name, argmax_res.key, 30);
+ failed_tests += ASSERT_EQUAL(test_name, argmax_res.value, 2);
+ }
+
std::cout << std::endl << failed_tests << " failing test(s) detected." << std::endl;
if (failed_tests == 0) {
return 0;
diff --git a/help_function/src/sparse_utils_1.cpp b/help_function/src/sparse_utils_1.cpp
index 7279dac85..a9debf19c 100644
--- a/help_function/src/sparse_utils_1.cpp
+++ b/help_function/src/sparse_utils_1.cpp
@@ -71,6 +71,10 @@ bool test1() {
if (ib != oneapi::mkl::index_base::one)
return false;
+ dpct::sparse::matrix_format mf;
+ mf = dpct::sparse::matrix_format::csr;
+ mf = dpct::sparse::matrix_format::coo;
+
return true;
}
diff --git a/help_function/test_help.py b/help_function/test_help.py
index b6c585270..52588464a 100644
--- a/help_function/test_help.py
+++ b/help_function/test_help.py
@@ -45,7 +45,7 @@ def build_test():
"dnnl_utils_batch_normalization_2", "dnnl_utils_batch_normalization_3", "dnnl_utils_convolution_1",
"dnnl_utils_convolution_2", "dnnl_utils_convolution_3", "dnnl_utils_convolution_4", "dnnl_utils_convolution_5",
"dnnl_utils_normalization_1", "dnnl_utils_normalization_2", "dnnl_utils_normalization_3", "dnnl_utils_rnn",
- "dnnl_utils_version", "dnnl_utils_dropout"]
+ "dnnl_utils_version", "dnnl_utils_dropout", "blas_gemm_utils_interface"]
fft_cases = ["fft_utils_engine_buffer", "fft_utils_engine_usm", "fft_workspace_interface", "fft_set_workspace"]
lapack_cases = ["lapack_utils_buffer", "lapack_utils_usm"]
rng_cases = ["rng_generator", "rng_generator_vec_size_1", "rng_host"]