diff --git a/behavior_tests/src/query-api-mapping/do_test.py b/behavior_tests/src/query-api-mapping/do_test.py
index fea3ef5c7..712b99375 100644
--- a/behavior_tests/src/query-api-mapping/do_test.py
+++ b/behavior_tests/src/query-api-mapping/do_test.py
@@ -182,7 +182,7 @@ def migrate_test():
],
[],
[
- "dpct::sparse::spmm(*handle, transa, transb, alpha, a, b, beta, c, computetype);"
+ "dpct::sparse::spmm(handle->get_queue(), transa, transb, alpha, a, b, beta, c, computetype);"
],
],
[ # NCCL
diff --git a/features/config/TEMPLATE_cublasLt.xml b/features/config/TEMPLATE_cublasLt.xml
new file mode 100644
index 000000000..cb15eeed1
--- /dev/null
+++ b/features/config/TEMPLATE_cublasLt.xml
@@ -0,0 +1,13 @@
+
+
+
+ test
+
+
+
+
+
+
+
+
+
diff --git a/features/config/TEMPLATE_cusparse_after_112_cpu.xml b/features/config/TEMPLATE_cusparse_after_112_cpu.xml
new file mode 100644
index 000000000..5776a3d74
--- /dev/null
+++ b/features/config/TEMPLATE_cusparse_after_112_cpu.xml
@@ -0,0 +1,14 @@
+
+
+
+ test
+
+
+
+
+
+
+
+
+
+
diff --git a/features/feature_case/cublasLt/matmul.cu b/features/feature_case/cublasLt/matmul.cu
new file mode 100644
index 000000000..0cf382f7a
--- /dev/null
+++ b/features/feature_case/cublasLt/matmul.cu
@@ -0,0 +1,754 @@
+// ===------------ matmul.cu ----------------------------- *- CUDA -* ----=== //
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// ===--------------------------------------------------------------------=== //
+
+#include <cublasLt.h>
+#include <cstdint>
+#include <cstdio>
+
+const constexpr int COL_TURING = 0;
+const constexpr int COL_AMPERE = 1;
+
+// The original source of the two functions below was under the license below:
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+//
+// Repo: https://github.com/TimDettmers/bitsandbytes.git
+inline int checkCublasStatus(cublasStatus_t status) {
+ if (status != CUBLAS_STATUS_SUCCESS) {
+ printf("cuBLAS API failed with status %d\n", status);
+ //throw std::logic_error("cuBLAS API failed");
+ return 1;
+ }
+ return 0;
+}
+
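+// Template parameters (names as used in the body below):
+//   FORMATB    - B's layout: COL_TURING (CUBLASLT_ORDER_COL4_4R2_8C) or
+//                COL_AMPERE (CUBLASLT_ORDER_COL32_2R_4R4)
+//   DTYPE_OUT  - 32 selects an int32 C with scalar alpha/beta; any other
+//                value selects an int8 C
+//   SCALE_ROWS - nonzero applies a per-row alpha vector from device memory
+//                (CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO)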
+template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS> int igemmlt(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
+{
+ int has_error = 0;
+ cublasLtMatmulDesc_t matmulDesc = NULL;
+ cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
+ cublasOperation_t opT = CUBLAS_OP_T;
+ cublasLtPointerMode_t alphaVec = CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO;
+ cublasLtOrder_t col32 = CUBLASLT_ORDER_COL32;
+ cublasLtOrder_t col_turing = CUBLASLT_ORDER_COL4_4R2_8C;
+ cublasLtOrder_t col_ampere = CUBLASLT_ORDER_COL32_2R_4R4;
+
+ has_error |= checkCublasStatus(cublasLtMatrixLayoutCreate(&Adesc, CUDA_R_8I, m, k, lda));
+ has_error |= checkCublasStatus(cublasLtMatrixLayoutCreate(&Bdesc, CUDA_R_8I, n, k, ldb));
+
+ has_error |= checkCublasStatus(cublasLtMatrixLayoutSetAttribute(Adesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
+ if(FORMATB == COL_TURING)
+ has_error |= checkCublasStatus(cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &col_turing, sizeof(col_turing)));
+ else
+ has_error |= checkCublasStatus(cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &col_ampere, sizeof(col_ampere)));
+
+ if(DTYPE_OUT == 32)
+ {
+ has_error |= checkCublasStatus(cublasLtMatmulDescCreate(&matmulDesc, CUBLAS_COMPUTE_32I, CUDA_R_32I));
+ has_error |= checkCublasStatus(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opT, sizeof(opT)));
+ has_error |= checkCublasStatus(cublasLtMatrixLayoutCreate(&Cdesc, CUDA_R_32I, m, n, ldc));
+ has_error |= checkCublasStatus(cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
+ int alpha = 1, beta = 0;
+      has_error |= checkCublasStatus(cublasLtMatmul(ltHandle, matmulDesc, &alpha, A, Adesc, B, Bdesc, &beta, (int32_t*)C, Cdesc, (int32_t*)C, Cdesc, NULL, NULL, 0, 0));
+ }
+ else
+ {
+ has_error |= checkCublasStatus(cublasLtMatmulDescCreate(&matmulDesc, CUBLAS_COMPUTE_32I, CUDA_R_32F));
+ has_error |= checkCublasStatus(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opT, sizeof(opT)));
+ has_error |= checkCublasStatus(cublasLtMatrixLayoutCreate(&Cdesc, CUDA_R_8I, m, n, ldc));
+ has_error |= checkCublasStatus(cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
+ if(!SCALE_ROWS)
+ {
+ float alpha = 1.0f, beta = 0.0f;
+        has_error |= checkCublasStatus(cublasLtMatmul(ltHandle, matmulDesc, &alpha, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, NULL, NULL, 0, 0));
+ }
+ else
+ {
+ has_error |= checkCublasStatus(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &alphaVec, sizeof(alphaVec)));
+ has_error |= checkCublasStatus(cublasLtMatmul(ltHandle, matmulDesc, row_scale, A, Adesc, B, Bdesc, NULL, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, NULL, NULL, 0, 0));
+ }
+ }
+
+ cudaStreamSynchronize(0);
+
+ if (Cdesc) has_error |= checkCublasStatus(cublasLtMatrixLayoutDestroy(Cdesc));
+ if (Bdesc) has_error |= checkCublasStatus(cublasLtMatrixLayoutDestroy(Bdesc));
+ if (Adesc) has_error |= checkCublasStatus(cublasLtMatrixLayoutDestroy(Adesc));
+ if (matmulDesc) has_error |= checkCublasStatus(cublasLtMatmulDescDestroy(matmulDesc));
+    if (has_error == 1)
+      printf("error detected\n");
+
+ return has_error;
+}
+
+void transform(cublasLtHandle_t ltHandle, const void *in, int ld_in,
+ cublasLtMatrixLayout_t layout_in, void *out, int ld_out,
+ cublasLtMatrixLayout_t layout_out) {
+ cublasLtMatrixTransformDesc_t transform_desc = NULL;
+ cublasLtMatrixTransformDescCreate(&transform_desc, CUDA_R_32F);
+ float alpha = 1.0f, beta = 0.0f;
+ cublasLtMatrixTransform(ltHandle, transform_desc, &alpha, in, layout_in,
+ &beta, NULL, NULL, out, layout_out, 0);
+ cublasLtMatrixTransformDescDestroy(transform_desc);
+}
+
+// igemmlt
+bool test1() {
+ cublasLtHandle_t ltHandle;
+  cublasLtCreate(&ltHandle);
+ const constexpr int m = 4;
+ const constexpr int n = 2;
+ const constexpr int k = 3;
+ int lda = m;
+ int ldb = n;
+ int ldc = m;
+ void *Adev;
+ void *Bdev;
+ void *Cdev;
+ cudaMalloc(&Adev, m * k * sizeof(int8_t));
+ cudaMalloc(&Bdev, n * k * sizeof(int8_t));
+ cudaMalloc(&Cdev, m * n * sizeof(int32_t));
+
+ int8_t Ahost[m * k] = {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17};
+ int8_t Bhost[n * k] = {5, 4, -3, -2, 1, 0};
+
+ cudaMemcpy(Adev, Ahost, m * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+ cudaMemcpy(Bdev, Bhost, n * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+
+ cublasLtMatrixLayout_t Adesc_col_major = NULL, Bdesc_col_major = NULL,
+ Cdesc_col_major = NULL;
+ cublasLtMatrixLayoutCreate(&Adesc_col_major, CUDA_R_8I, m, k, lda);
+ cublasLtMatrixLayoutCreate(&Bdesc_col_major, CUDA_R_8I, n, k, ldb);
+ cublasLtMatrixLayoutCreate(&Cdesc_col_major, CUDA_R_32I, m, n, ldc);
+
+ // Convert A and B
+ cublasLtMatrixLayout_t Adesc_col32 = NULL, Bdesc_col4_4r2_8c = NULL,
+ Cdesc_col32 = NULL;
+ int8_t *A_col32, *B_col4_4r2_8c;
+ int32_t *C_col32;
+ cudaMalloc(&A_col32, m * 32 * sizeof(std::int8_t));
+ cudaMalloc(&B_col4_4r2_8c, ((n + 8 - 1) / 8) * 8 * 32 * sizeof(std::int8_t));
+ cudaMalloc(&C_col32, m * 32 * sizeof(std::int32_t));
+ cublasLtMatrixLayoutCreate(&Adesc_col32, CUDA_R_8I, m, k, m * 32);
+ cublasLtMatrixLayoutCreate(&Bdesc_col4_4r2_8c, CUDA_R_8I, k, n,
+ ((n + 8 - 1) / 8) * 8 * 32);
+ cublasLtMatrixLayoutCreate(&Cdesc_col32, CUDA_R_32I, m, n, m * 32);
+ cublasLtOrder_t col32 = CUBLASLT_ORDER_COL32;
+ cublasLtOrder_t col4_4r2_8c = CUBLASLT_ORDER_COL4_4R2_8C;
+ cublasLtMatrixLayoutSetAttribute(Adesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+ &col32, sizeof(col32));
+ cublasLtMatrixLayoutSetAttribute(Bdesc_col4_4r2_8c,
+ CUBLASLT_MATRIX_LAYOUT_ORDER, &col4_4r2_8c,
+ sizeof(col4_4r2_8c));
+ cublasLtMatrixLayoutSetAttribute(Cdesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+ &col32, sizeof(col32));
+
+ transform(ltHandle, Adev, lda, Adesc_col_major, A_col32, m * 32, Adesc_col32);
+ transform(ltHandle, Bdev, ldb, Bdesc_col_major, B_col4_4r2_8c, 8 * 32,
+ Bdesc_col4_4r2_8c);
+
+ // Matmul
+  igemmlt<COL_TURING, 32, 0>(ltHandle, m, n, k, A_col32, B_col4_4r2_8c,
+                             C_col32, nullptr, m * 32,
+                             ((n + 8 - 1) / 8) * 8 * 32, m * 32);
+
+ // Convert C
+ transform(ltHandle, C_col32, m * 32, Cdesc_col32, Cdev, ldc, Cdesc_col_major);
+ cudaStreamSynchronize(0);
+
+ // Check result
+ int32_t Chost[m * n];
+ cudaMemcpy(Chost, Cdev, m * n * sizeof(int32_t), cudaMemcpyDeviceToHost);
+
+ bool error = false;
+ int32_t C_ref[m * n] = {14, 17, 20, 23, 4, 6, 8, 10};
+ for (int i = 0; i < m * n; i++) {
+ if (Chost[i] != C_ref[i]) {
+ error = true;
+ break;
+ }
+ }
+ printf("c:\n");
+ for (int i = 0; i < m * n; i++)
+ printf("%d, ", Chost[i]);
+ printf("\n");
+
+ if (error) {
+ printf("error\n");
+ } else {
+ printf("success\n");
+ }
+
+ cublasLtDestroy(ltHandle);
+ cublasLtMatrixLayoutDestroy(Adesc_col32);
+ cublasLtMatrixLayoutDestroy(Bdesc_col4_4r2_8c);
+ cublasLtMatrixLayoutDestroy(Cdesc_col32);
+ cublasLtMatrixLayoutDestroy(Adesc_col_major);
+ cublasLtMatrixLayoutDestroy(Bdesc_col_major);
+ cublasLtMatrixLayoutDestroy(Cdesc_col_major);
+ cudaFree(Adev);
+ cudaFree(Bdev);
+ cudaFree(Cdev);
+
+ return !error;
+}
+
+// igemmlt
+bool test2() {
+ cublasLtHandle_t ltHandle;
+  cublasLtCreate(&ltHandle);
+ const constexpr int m = 4;
+ const constexpr int n = 2;
+ const constexpr int k = 3;
+ int lda = m;
+ int ldb = n;
+ int ldc = m;
+ void *Adev;
+ void *Bdev;
+ void *Cdev;
+ cudaMalloc(&Adev, m * k * sizeof(int8_t));
+ cudaMalloc(&Bdev, n * k * sizeof(int8_t));
+ cudaMalloc(&Cdev, m * n * sizeof(int8_t));
+
+ int8_t Ahost[m * k] = {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17};
+ int8_t Bhost[n * k] = {5, 4, -3, -2, 1, 0};
+
+ cudaMemcpy(Adev, Ahost, m * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+ cudaMemcpy(Bdev, Bhost, n * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+
+ cublasLtMatrixLayout_t Adesc_col_major = NULL, Bdesc_col_major = NULL,
+ Cdesc_col_major = NULL;
+ cublasLtMatrixLayoutCreate(&Adesc_col_major, CUDA_R_8I, m, k, lda);
+ cublasLtMatrixLayoutCreate(&Bdesc_col_major, CUDA_R_8I, n, k, ldb);
+ cublasLtMatrixLayoutCreate(&Cdesc_col_major, CUDA_R_8I, m, n, ldc);
+
+ // Convert A and B
+ cublasLtMatrixLayout_t Adesc_col32 = NULL, Bdesc_col4_4r2_8c = NULL,
+ Cdesc_col32 = NULL;
+ int8_t *A_col32, *B_col4_4r2_8c;
+ int8_t *C_col32;
+ cudaMalloc(&A_col32, m * 32 * sizeof(std::int8_t));
+ cudaMalloc(&B_col4_4r2_8c, ((n + 8 - 1) / 8) * 8 * 32 * sizeof(std::int8_t));
+ cudaMalloc(&C_col32, m * 32 * sizeof(std::int8_t));
+ cublasLtMatrixLayoutCreate(&Adesc_col32, CUDA_R_8I, m, k, m * 32);
+ cublasLtMatrixLayoutCreate(&Bdesc_col4_4r2_8c, CUDA_R_8I, k, n,
+ ((n + 8 - 1) / 8) * 8 * 32);
+ cublasLtMatrixLayoutCreate(&Cdesc_col32, CUDA_R_8I, m, n, m * 32);
+ cublasLtOrder_t col32 = CUBLASLT_ORDER_COL32;
+ cublasLtOrder_t col4_4r2_8c = CUBLASLT_ORDER_COL4_4R2_8C;
+ cublasLtMatrixLayoutSetAttribute(Adesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+ &col32, sizeof(col32));
+ cublasLtMatrixLayoutSetAttribute(Bdesc_col4_4r2_8c,
+ CUBLASLT_MATRIX_LAYOUT_ORDER, &col4_4r2_8c,
+ sizeof(col4_4r2_8c));
+ cublasLtMatrixLayoutSetAttribute(Cdesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+ &col32, sizeof(col32));
+
+ transform(ltHandle, Adev, lda, Adesc_col_major, A_col32, m * 32, Adesc_col32);
+ transform(ltHandle, Bdev, ldb, Bdesc_col_major, B_col4_4r2_8c, 8 * 32,
+ Bdesc_col4_4r2_8c);
+
+ // Matmul
+  igemmlt<COL_TURING, 8, 0>(ltHandle, m, n, k, A_col32, B_col4_4r2_8c,
+                            C_col32, nullptr, m * 32,
+                            ((n + 8 - 1) / 8) * 8 * 32, m * 32);
+
+ // Convert C
+ transform(ltHandle, C_col32, m * 32, Cdesc_col32, Cdev, ldc, Cdesc_col_major);
+ cudaStreamSynchronize(0);
+
+ // Check result
+ int8_t Chost[m * n];
+ cudaMemcpy(Chost, Cdev, m * n * sizeof(int8_t), cudaMemcpyDeviceToHost);
+
+ bool error = false;
+ int8_t C_ref[m * n] = {14, 17, 20, 23, 4, 6, 8, 10};
+ for (int i = 0; i < m * n; i++) {
+ if (Chost[i] != C_ref[i]) {
+ error = true;
+ break;
+ }
+ }
+ printf("c:\n");
+ for (int i = 0; i < m * n; i++)
+ printf("%d, ", Chost[i]);
+ printf("\n");
+
+ if (error) {
+ printf("error\n");
+ } else {
+ printf("success\n");
+ }
+
+ cublasLtDestroy(ltHandle);
+ cublasLtMatrixLayoutDestroy(Adesc_col32);
+ cublasLtMatrixLayoutDestroy(Bdesc_col4_4r2_8c);
+ cublasLtMatrixLayoutDestroy(Cdesc_col32);
+ cublasLtMatrixLayoutDestroy(Adesc_col_major);
+ cublasLtMatrixLayoutDestroy(Bdesc_col_major);
+ cublasLtMatrixLayoutDestroy(Cdesc_col_major);
+ cudaFree(Adev);
+ cudaFree(Bdev);
+ cudaFree(Cdev);
+
+ return !error;
+}
+
+// igemmlt
+bool test3() {
+ cublasLtHandle_t ltHandle;
+  cublasLtCreate(&ltHandle);
+ const constexpr int m = 4;
+ const constexpr int n = 2;
+ const constexpr int k = 3;
+ int lda = m;
+ int ldb = n;
+ int ldc = m;
+ void *Adev;
+ void *Bdev;
+ void *Cdev;
+ cudaMalloc(&Adev, m * k * sizeof(int8_t));
+ cudaMalloc(&Bdev, n * k * sizeof(int8_t));
+ cudaMalloc(&Cdev, m * n * sizeof(int8_t));
+
+ int8_t Ahost[m * k] = {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17};
+ int8_t Bhost[n * k] = {5, 4, -3, -2, 1, 0};
+
+ cudaMemcpy(Adev, Ahost, m * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+ cudaMemcpy(Bdev, Bhost, n * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+
+ cublasLtMatrixLayout_t Adesc_col_major = NULL, Bdesc_col_major = NULL,
+ Cdesc_col_major = NULL;
+ cublasLtMatrixLayoutCreate(&Adesc_col_major, CUDA_R_8I, m, k, lda);
+ cublasLtMatrixLayoutCreate(&Bdesc_col_major, CUDA_R_8I, n, k, ldb);
+ cublasLtMatrixLayoutCreate(&Cdesc_col_major, CUDA_R_8I, m, n, ldc);
+
+ // Convert A and B
+ cublasLtMatrixLayout_t Adesc_col32 = NULL, Bdesc_col4_4r2_8c = NULL,
+ Cdesc_col32 = NULL;
+ int8_t *A_col32, *B_col4_4r2_8c;
+ int8_t *C_col32;
+ cudaMalloc(&A_col32, m * 32 * sizeof(std::int8_t));
+ cudaMalloc(&B_col4_4r2_8c, ((n + 8 - 1) / 8) * 8 * 32 * sizeof(std::int8_t));
+ cudaMalloc(&C_col32, m * 32 * sizeof(std::int8_t));
+ cublasLtMatrixLayoutCreate(&Adesc_col32, CUDA_R_8I, m, k, m * 32);
+ cublasLtMatrixLayoutCreate(&Bdesc_col4_4r2_8c, CUDA_R_8I, k, n,
+ ((n + 8 - 1) / 8) * 8 * 32);
+ cublasLtMatrixLayoutCreate(&Cdesc_col32, CUDA_R_8I, m, n, m * 32);
+ cublasLtOrder_t col32 = CUBLASLT_ORDER_COL32;
+ cublasLtOrder_t col4_4r2_8c = CUBLASLT_ORDER_COL4_4R2_8C;
+ cublasLtMatrixLayoutSetAttribute(Adesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+ &col32, sizeof(col32));
+ cublasLtMatrixLayoutSetAttribute(Bdesc_col4_4r2_8c,
+ CUBLASLT_MATRIX_LAYOUT_ORDER, &col4_4r2_8c,
+ sizeof(col4_4r2_8c));
+ cublasLtMatrixLayoutSetAttribute(Cdesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+ &col32, sizeof(col32));
+
+ transform(ltHandle, Adev, lda, Adesc_col_major, A_col32, m * 32, Adesc_col32);
+ transform(ltHandle, Bdev, ldb, Bdesc_col_major, B_col4_4r2_8c, 8 * 32,
+ Bdesc_col4_4r2_8c);
+
+ float *alpha;
+ cudaMallocManaged(&alpha, 4 * sizeof(float));
+ alpha[0] = 0;
+ alpha[1] = 1;
+ alpha[2] = 2;
+ alpha[3] = 3;
+
+ // Matmul
+  igemmlt<COL_TURING, 8, 1>(ltHandle, m, n, k, A_col32, B_col4_4r2_8c,
+                            C_col32, alpha, m * 32,
+                            ((n + 8 - 1) / 8) * 8 * 32, m * 32);
+
+ // Convert C
+ transform(ltHandle, C_col32, m * 32, Cdesc_col32, Cdev, ldc, Cdesc_col_major);
+ cudaStreamSynchronize(0);
+
+ // Check result
+ int8_t Chost[m * n];
+ cudaMemcpy(Chost, Cdev, m * n * sizeof(int8_t), cudaMemcpyDeviceToHost);
+
+ bool error = false;
+ int8_t C_ref[m * n] = {0, 17, 40, 69, 0, 6, 16, 30};
+ for (int i = 0; i < m * n; i++) {
+ if (Chost[i] != C_ref[i]) {
+ error = true;
+ break;
+ }
+ }
+ printf("c:\n");
+ for (int i = 0; i < m * n; i++)
+ printf("%d, ", Chost[i]);
+ printf("\n");
+
+ if (error) {
+ printf("error\n");
+ } else {
+ printf("success\n");
+ }
+
+ cublasLtDestroy(ltHandle);
+ cublasLtMatrixLayoutDestroy(Adesc_col32);
+ cublasLtMatrixLayoutDestroy(Bdesc_col4_4r2_8c);
+ cublasLtMatrixLayoutDestroy(Cdesc_col32);
+ cublasLtMatrixLayoutDestroy(Adesc_col_major);
+ cublasLtMatrixLayoutDestroy(Bdesc_col_major);
+ cublasLtMatrixLayoutDestroy(Cdesc_col_major);
+ cudaFree(Adev);
+ cudaFree(Bdev);
+ cudaFree(Cdev);
+ cudaFree(alpha);
+
+ return !error;
+}
+
+// igemmlt
+bool test4() {
+ cublasLtHandle_t ltHandle;
+  cublasLtCreate(&ltHandle);
+ const constexpr int m = 4;
+ const constexpr int n = 2;
+ const constexpr int k = 3;
+ int lda = m;
+ int ldb = n;
+ int ldc = m;
+ void *Adev;
+ void *Bdev;
+ void *Cdev;
+ cudaMalloc(&Adev, m * k * sizeof(int8_t));
+ cudaMalloc(&Bdev, n * k * sizeof(int8_t));
+ cudaMalloc(&Cdev, m * n * sizeof(int32_t));
+
+ int8_t Ahost[m * k] = {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17};
+ int8_t Bhost[n * k] = {5, 4, -3, -2, 1, 0};
+
+ cudaMemcpy(Adev, Ahost, m * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+ cudaMemcpy(Bdev, Bhost, n * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+
+ cublasLtMatrixLayout_t Adesc_col_major = NULL, Bdesc_col_major = NULL,
+ Cdesc_col_major = NULL;
+ cublasLtMatrixLayoutCreate(&Adesc_col_major, CUDA_R_8I, m, k, lda);
+ cublasLtMatrixLayoutCreate(&Bdesc_col_major, CUDA_R_8I, n, k, ldb);
+ cublasLtMatrixLayoutCreate(&Cdesc_col_major, CUDA_R_32I, m, n, ldc);
+
+ // Convert A and B
+ cublasLtMatrixLayout_t Adesc_col32 = NULL, Bdesc_col32_2r_4r4 = NULL,
+ Cdesc_col32 = NULL;
+ int8_t *A_col32, *B_col32_2r_4r4;
+ int32_t *C_col32;
+ cudaMalloc(&A_col32, m * 32 * sizeof(std::int8_t));
+ cudaMalloc(&B_col32_2r_4r4,
+ ((n + 32 - 1) / 32) * 32 * 32 * sizeof(std::int8_t));
+ cudaMalloc(&C_col32, m * 32 * sizeof(std::int32_t));
+ cublasLtMatrixLayoutCreate(&Adesc_col32, CUDA_R_8I, m, k, m * 32);
+ cublasLtMatrixLayoutCreate(&Bdesc_col32_2r_4r4, CUDA_R_8I, k, n,
+ ((n + 32 - 1) / 32) * 32 * 32);
+ cublasLtMatrixLayoutCreate(&Cdesc_col32, CUDA_R_32I, m, n, m * 32);
+ cublasLtOrder_t col32 = CUBLASLT_ORDER_COL32;
+ cublasLtOrder_t col32_2r_4r4 = CUBLASLT_ORDER_COL32_2R_4R4;
+ cublasLtMatrixLayoutSetAttribute(Adesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+ &col32, sizeof(col32));
+ cublasLtMatrixLayoutSetAttribute(Bdesc_col32_2r_4r4,
+ CUBLASLT_MATRIX_LAYOUT_ORDER, &col32_2r_4r4,
+ sizeof(col32_2r_4r4));
+ cublasLtMatrixLayoutSetAttribute(Cdesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+ &col32, sizeof(col32));
+
+ transform(ltHandle, Adev, lda, Adesc_col_major, A_col32, m * 32, Adesc_col32);
+ transform(ltHandle, Bdev, ldb, Bdesc_col_major, B_col32_2r_4r4, 8 * 32,
+ Bdesc_col32_2r_4r4);
+
+ // Matmul
+  igemmlt<COL_AMPERE, 32, 0>(ltHandle, m, n, k, A_col32, B_col32_2r_4r4,
+                             C_col32, nullptr, m * 32,
+                             ((n + 8 - 1) / 8) * 8 * 32, m * 32);
+
+ // Convert C
+ transform(ltHandle, C_col32, m * 32, Cdesc_col32, Cdev, ldc, Cdesc_col_major);
+ cudaStreamSynchronize(0);
+
+ // Check result
+ int32_t Chost[m * n];
+ cudaMemcpy(Chost, Cdev, m * n * sizeof(int32_t), cudaMemcpyDeviceToHost);
+
+ bool error = false;
+ int32_t C_ref[m * n] = {14, 17, 20, 23, 4, 6, 8, 10};
+ for (int i = 0; i < m * n; i++) {
+ if (Chost[i] != C_ref[i]) {
+ error = true;
+ break;
+ }
+ }
+ printf("c:\n");
+ for (int i = 0; i < m * n; i++)
+ printf("%d, ", Chost[i]);
+ printf("\n");
+
+ if (error) {
+ printf("error\n");
+ } else {
+ printf("success\n");
+ }
+
+ cublasLtDestroy(ltHandle);
+ cublasLtMatrixLayoutDestroy(Adesc_col32);
+ cublasLtMatrixLayoutDestroy(Bdesc_col32_2r_4r4);
+ cublasLtMatrixLayoutDestroy(Cdesc_col32);
+ cublasLtMatrixLayoutDestroy(Adesc_col_major);
+ cublasLtMatrixLayoutDestroy(Bdesc_col_major);
+ cublasLtMatrixLayoutDestroy(Cdesc_col_major);
+ cudaFree(Adev);
+ cudaFree(Bdev);
+ cudaFree(Cdev);
+
+ return !error;
+}
+
+// igemmlt
+bool test5() {
+ cublasLtHandle_t ltHandle;
+  cublasLtCreate(&ltHandle);
+ const constexpr int m = 4;
+ const constexpr int n = 2;
+ const constexpr int k = 3;
+ int lda = m;
+ int ldb = n;
+ int ldc = m;
+ void *Adev;
+ void *Bdev;
+ void *Cdev;
+ cudaMalloc(&Adev, m * k * sizeof(int8_t));
+ cudaMalloc(&Bdev, n * k * sizeof(int8_t));
+ cudaMalloc(&Cdev, m * n * sizeof(int8_t));
+
+ int8_t Ahost[m * k] = {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17};
+ int8_t Bhost[n * k] = {5, 4, -3, -2, 1, 0};
+
+ cudaMemcpy(Adev, Ahost, m * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+ cudaMemcpy(Bdev, Bhost, n * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+
+ cublasLtMatrixLayout_t Adesc_col_major = NULL, Bdesc_col_major = NULL,
+ Cdesc_col_major = NULL;
+ cublasLtMatrixLayoutCreate(&Adesc_col_major, CUDA_R_8I, m, k, lda);
+ cublasLtMatrixLayoutCreate(&Bdesc_col_major, CUDA_R_8I, n, k, ldb);
+ cublasLtMatrixLayoutCreate(&Cdesc_col_major, CUDA_R_8I, m, n, ldc);
+
+ // Convert A and B
+ cublasLtMatrixLayout_t Adesc_col32 = NULL, Bdesc_col32_2r_4r4 = NULL,
+ Cdesc_col32 = NULL;
+ int8_t *A_col32, *B_col32_2r_4r4;
+ int8_t *C_col32;
+ cudaMalloc(&A_col32, m * 32 * sizeof(std::int8_t));
+ cudaMalloc(&B_col32_2r_4r4,
+ ((n + 32 - 1) / 32) * 32 * 32 * sizeof(std::int8_t));
+ cudaMalloc(&C_col32, m * 32 * sizeof(std::int8_t));
+ cublasLtMatrixLayoutCreate(&Adesc_col32, CUDA_R_8I, m, k, m * 32);
+ cublasLtMatrixLayoutCreate(&Bdesc_col32_2r_4r4, CUDA_R_8I, k, n,
+ ((n + 32 - 1) / 32) * 32 * 32);
+ cublasLtMatrixLayoutCreate(&Cdesc_col32, CUDA_R_8I, m, n, m * 32);
+ cublasLtOrder_t col32 = CUBLASLT_ORDER_COL32;
+ cublasLtOrder_t col32_2r_4r4 = CUBLASLT_ORDER_COL32_2R_4R4;
+ cublasLtMatrixLayoutSetAttribute(Adesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+ &col32, sizeof(col32));
+ cublasLtMatrixLayoutSetAttribute(Bdesc_col32_2r_4r4,
+ CUBLASLT_MATRIX_LAYOUT_ORDER, &col32_2r_4r4,
+ sizeof(col32_2r_4r4));
+ cublasLtMatrixLayoutSetAttribute(Cdesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+ &col32, sizeof(col32));
+
+ transform(ltHandle, Adev, lda, Adesc_col_major, A_col32, m * 32, Adesc_col32);
+ transform(ltHandle, Bdev, ldb, Bdesc_col_major, B_col32_2r_4r4, 8 * 32,
+ Bdesc_col32_2r_4r4);
+
+ // Matmul
+  igemmlt<COL_AMPERE, 8, 0>(ltHandle, m, n, k, A_col32, B_col32_2r_4r4,
+                            C_col32, nullptr, m * 32,
+                            ((n + 8 - 1) / 8) * 8 * 32, m * 32);
+
+ // Convert C
+ transform(ltHandle, C_col32, m * 32, Cdesc_col32, Cdev, ldc, Cdesc_col_major);
+ cudaStreamSynchronize(0);
+
+ // Check result
+ int8_t Chost[m * n];
+ cudaMemcpy(Chost, Cdev, m * n * sizeof(int8_t), cudaMemcpyDeviceToHost);
+
+ bool error = false;
+ int8_t C_ref[m * n] = {14, 17, 20, 23, 4, 6, 8, 10};
+ for (int i = 0; i < m * n; i++) {
+ if (Chost[i] != C_ref[i]) {
+ error = true;
+ break;
+ }
+ }
+ printf("c:\n");
+ for (int i = 0; i < m * n; i++)
+ printf("%d, ", Chost[i]);
+ printf("\n");
+
+ if (error) {
+ printf("error\n");
+ } else {
+ printf("success\n");
+ }
+
+ cublasLtDestroy(ltHandle);
+ cublasLtMatrixLayoutDestroy(Adesc_col32);
+ cublasLtMatrixLayoutDestroy(Bdesc_col32_2r_4r4);
+ cublasLtMatrixLayoutDestroy(Cdesc_col32);
+ cublasLtMatrixLayoutDestroy(Adesc_col_major);
+ cublasLtMatrixLayoutDestroy(Bdesc_col_major);
+ cublasLtMatrixLayoutDestroy(Cdesc_col_major);
+ cudaFree(Adev);
+ cudaFree(Bdev);
+ cudaFree(Cdev);
+
+ return !error;
+}
+
+// igemmlt
+bool test6() {
+ cublasLtHandle_t ltHandle;
+  cublasLtCreate(&ltHandle);
+ const constexpr int m = 4;
+ const constexpr int n = 2;
+ const constexpr int k = 3;
+ int lda = m;
+ int ldb = n;
+ int ldc = m;
+ void *Adev;
+ void *Bdev;
+ void *Cdev;
+ cudaMalloc(&Adev, m * k * sizeof(int8_t));
+ cudaMalloc(&Bdev, n * k * sizeof(int8_t));
+ cudaMalloc(&Cdev, m * n * sizeof(int8_t));
+
+ int8_t Ahost[m * k] = {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17};
+ int8_t Bhost[n * k] = {5, 4, -3, -2, 1, 0};
+
+ cudaMemcpy(Adev, Ahost, m * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+ cudaMemcpy(Bdev, Bhost, n * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+
+ cublasLtMatrixLayout_t Adesc_col_major = NULL, Bdesc_col_major = NULL,
+ Cdesc_col_major = NULL;
+ cublasLtMatrixLayoutCreate(&Adesc_col_major, CUDA_R_8I, m, k, lda);
+ cublasLtMatrixLayoutCreate(&Bdesc_col_major, CUDA_R_8I, n, k, ldb);
+ cublasLtMatrixLayoutCreate(&Cdesc_col_major, CUDA_R_8I, m, n, ldc);
+
+ // Convert A and B
+ cublasLtMatrixLayout_t Adesc_col32 = NULL, Bdesc_col32_2r_4r4 = NULL,
+ Cdesc_col32 = NULL;
+ int8_t *A_col32, *B_col32_2r_4r4;
+ int8_t *C_col32;
+ cudaMalloc(&A_col32, m * 32 * sizeof(std::int8_t));
+ cudaMalloc(&B_col32_2r_4r4,
+ ((n + 32 - 1) / 32) * 32 * 32 * sizeof(std::int8_t));
+ cudaMalloc(&C_col32, m * 32 * sizeof(std::int8_t));
+ cublasLtMatrixLayoutCreate(&Adesc_col32, CUDA_R_8I, m, k, m * 32);
+ cublasLtMatrixLayoutCreate(&Bdesc_col32_2r_4r4, CUDA_R_8I, k, n,
+ ((n + 32 - 1) / 32) * 32 * 32);
+ cublasLtMatrixLayoutCreate(&Cdesc_col32, CUDA_R_8I, m, n, m * 32);
+ cublasLtOrder_t col32 = CUBLASLT_ORDER_COL32;
+ cublasLtOrder_t col32_2r_4r4 = CUBLASLT_ORDER_COL32_2R_4R4;
+ cublasLtMatrixLayoutSetAttribute(Adesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+ &col32, sizeof(col32));
+ cublasLtMatrixLayoutSetAttribute(Bdesc_col32_2r_4r4,
+ CUBLASLT_MATRIX_LAYOUT_ORDER, &col32_2r_4r4,
+ sizeof(col32_2r_4r4));
+ cublasLtMatrixLayoutSetAttribute(Cdesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+ &col32, sizeof(col32));
+
+ transform(ltHandle, Adev, lda, Adesc_col_major, A_col32, m * 32, Adesc_col32);
+ transform(ltHandle, Bdev, ldb, Bdesc_col_major, B_col32_2r_4r4, 8 * 32,
+ Bdesc_col32_2r_4r4);
+
+ float *alpha;
+ cudaMallocManaged(&alpha, 4 * sizeof(float));
+ alpha[0] = 0;
+ alpha[1] = 1;
+ alpha[2] = 2;
+ alpha[3] = 3;
+
+ // Matmul
+  igemmlt<COL_AMPERE, 8, 1>(ltHandle, m, n, k, A_col32, B_col32_2r_4r4,
+                            C_col32, alpha, m * 32,
+                            ((n + 8 - 1) / 8) * 8 * 32, m * 32);
+
+ // Convert C
+ transform(ltHandle, C_col32, m * 32, Cdesc_col32, Cdev, ldc, Cdesc_col_major);
+ cudaStreamSynchronize(0);
+
+ // Check result
+ int8_t Chost[m * n];
+ cudaMemcpy(Chost, Cdev, m * n * sizeof(int8_t), cudaMemcpyDeviceToHost);
+
+ bool error = false;
+ int8_t C_ref[m * n] = {0, 17, 40, 69, 0, 6, 16, 30};
+ for (int i = 0; i < m * n; i++) {
+ if (Chost[i] != C_ref[i]) {
+ error = true;
+ break;
+ }
+ }
+ printf("c:\n");
+ for (int i = 0; i < m * n; i++)
+ printf("%d, ", Chost[i]);
+ printf("\n");
+
+ if (error) {
+ printf("error\n");
+ } else {
+ printf("success\n");
+ }
+
+ cublasLtDestroy(ltHandle);
+ cublasLtMatrixLayoutDestroy(Adesc_col32);
+ cublasLtMatrixLayoutDestroy(Bdesc_col32_2r_4r4);
+ cublasLtMatrixLayoutDestroy(Cdesc_col32);
+ cublasLtMatrixLayoutDestroy(Adesc_col_major);
+ cublasLtMatrixLayoutDestroy(Bdesc_col_major);
+ cublasLtMatrixLayoutDestroy(Cdesc_col_major);
+ cudaFree(Adev);
+ cudaFree(Bdev);
+ cudaFree(Cdev);
+ cudaFree(alpha);
+
+ return !error;
+}
+
+// clang-format off
+// A (4*3) B (2*3)
+// 6 10 14 5 -3 1
+// 7 11 15 4 -2 0
+// 8 12 16
+// 9 13 17
+//
+// alpha * A * op(B) = alpha * C = C
+// 0 6 10 14 5 4 0 14 4 0 0
+// 1 7 11 15 -3 -2 1 17 6 17 6
+// 2 8 12 16 1 0 2 20 8 40 16
+// 3 9 13 17 3 23 10 69 30
+//
+// alpha * A * op(B) = alpha * C = C
+// 1 6 10 14 5 4 1 14 4 14 4
+// 7 11 15 -3 -2 17 6 17 6
+// 8 12 16 1 0 20 8 20 8
+// 9 13 17 23 10 23 10
+// clang-format on
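+
+// A minimal host-side reference for the products above (an illustrative
+// sketch for readers; the tests do not call it): C = alpha * A * op(B) with
+// a col-major m x k A and an n x k B, so op(B) = B^T. row_alpha is the
+// per-row alpha vector used in test3/test6, or nullptr for alpha = 1.
+inline void ref_igemm(int m, int n, int k, const int8_t *A, const int8_t *B,
+                      int32_t *C, const float *row_alpha) {
+  for (int r = 0; r < m; r++)
+    for (int c = 0; c < n; c++) {
+      int32_t acc = 0;
+      for (int i = 0; i < k; i++)
+        acc += A[r + i * m] * B[c + i * n];
+      C[r + c * m] = row_alpha ? (int32_t)(row_alpha[r] * acc) : acc;
+    }
+}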
+
+int main() {
+ bool pass = true;
+ pass = test1() && pass;
+ pass = test2() && pass;
+ pass = test3() && pass;
+ pass = test4() && pass;
+ pass = test5() && pass;
+ pass = test6() && pass;
+ return pass ? 0 : 1;
+}
diff --git a/features/feature_case/cublasLt/transform.cu b/features/feature_case/cublasLt/transform.cu
new file mode 100644
index 000000000..3a1205add
--- /dev/null
+++ b/features/feature_case/cublasLt/transform.cu
@@ -0,0 +1,600 @@
+// ===------------ transform.cu -------------------------- *- CUDA -* ----=== //
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// ===--------------------------------------------------------------------=== //
+
+#include "cublasLt.h"
+#include <cstdio>
+#include <cstring>
+
+void transform(cublasLtHandle_t ltHandle, void *in, int ld_in,
+ cublasLtOrder_t order_in, void *out, int ld_out,
+ cublasLtOrder_t order_out, int dim1, int dim2) {
+ cublasLtMatrixLayout_t in_desc = NULL, out_desc = NULL;
+ cublasLtMatrixTransformDesc_t transform_desc = NULL;
+
+ cublasLtMatrixLayoutCreate(&in_desc, CUDA_R_8I, dim1, dim2, ld_in);
+ cublasLtMatrixLayoutCreate(&out_desc, CUDA_R_8I, dim1, dim2, ld_out);
+
+ cublasLtMatrixLayoutSetAttribute(in_desc, CUBLASLT_MATRIX_LAYOUT_ORDER,
+ &order_in, sizeof(order_in));
+ cublasLtMatrixLayoutSetAttribute(out_desc, CUBLASLT_MATRIX_LAYOUT_ORDER,
+ &order_out, sizeof(order_out));
+
+ cublasLtMatrixTransformDescCreate(&transform_desc, CUDA_R_32F);
+
+ float alpha = 1.0f, beta = 0.0f;
+ cublasLtMatrixTransform(ltHandle, transform_desc, &alpha, in, in_desc, &beta,
+ NULL, NULL, out, out_desc, 0);
+
+ cublasLtMatrixLayoutDestroy(in_desc);
+ cublasLtMatrixLayoutDestroy(out_desc);
+ cublasLtMatrixTransformDescDestroy(transform_desc);
+}
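+
+// Offset of element (r, c) in CUBLASLT_ORDER_COL32 for an m-row matrix with
+// ld = 32 * m -- an illustrative sketch inferred from the reference data in
+// test_COL32 below, not part of the cuBLASLt API:
+//   offset = (c / 32) * (32 * m) + r * 32 + (c % 32)
+inline int col32_offset(int r, int c, int m) {
+  return (c / 32) * (32 * m) + r * 32 + (c % 32);
+}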
+
+bool test_ROW() {
+ const constexpr int m = 2;
+ const constexpr int n = 33;
+ const constexpr int in_ld = 4;
+ void *in_dev;
+ cudaMalloc(&in_dev, n * in_ld * sizeof(int8_t));
+
+ int8_t in_host[n * in_ld];
+ int8_t value = 0;
+ for (int i = 0; i < n * in_ld; i++) {
+ if (i % 4 < 2) {
+ in_host[i] = value;
+ value++;
+ } else
+ in_host[i] = 99;
+ }
+ int8_t ref_2nd[n * in_ld];
+ std::memcpy(ref_2nd, in_host, n * in_ld * sizeof(int8_t));
+
+ cudaMemcpy(in_dev, in_host, n * in_ld * sizeof(int8_t),
+ cudaMemcpyHostToDevice);
+
+ cublasLtHandle_t ltHandle;
+  cublasLtCreate(&ltHandle);
+
+ void *out_dev;
+ const constexpr int out_ld = 36;
+ cudaMalloc(&out_dev, out_ld * m * sizeof(int8_t));
+ cudaMemset(out_dev, 0, out_ld * m * sizeof(int8_t));
+ transform(ltHandle, in_dev, in_ld, CUBLASLT_ORDER_COL, out_dev, out_ld,
+ CUBLASLT_ORDER_ROW, m, n);
+
+ int8_t out_host[out_ld * m];
+ cudaMemcpy(out_host, out_dev, out_ld * m * sizeof(int8_t),
+ cudaMemcpyDeviceToHost);
+
+ bool pass_1st = true;
+ int8_t ref_1st[out_ld * m] =
+ {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 0, 0, 0,
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 0, 0, 0};
+ for (int i = 0; i < out_ld * m; i++) {
+ if (i % out_ld < n) {
+ if (out_host[i] != ref_1st[i]) {
+ pass_1st = false;
+ break;
+ }
+ }
+ }
+
+ for (int i = 0; i < out_ld * m; i++) {
+ printf("%d, ", out_host[i]);
+ }
+ printf("\n");
+ if (pass_1st) {
+ printf("ROW 1st pass\n");
+ } else {
+ printf("ROW 1st fail\n");
+ }
+
+ cudaMemset(in_dev, 0, n * in_ld * sizeof(int8_t));
+ std::memset(in_host, 0, n * in_ld * sizeof(int8_t));
+ transform(ltHandle, out_dev, out_ld, CUBLASLT_ORDER_ROW, in_dev, in_ld,
+ CUBLASLT_ORDER_COL, m, n);
+ cudaMemcpy(in_host, in_dev, n * in_ld * sizeof(int8_t),
+ cudaMemcpyDeviceToHost);
+
+ bool pass_2nd = true;
+ for (int i = 0; i < n * in_ld; i++) {
+ if (i % in_ld < m) {
+ if (in_host[i] != ref_2nd[i]) {
+ pass_2nd = false;
+ break;
+ }
+ }
+ }
+
+ for (int i = 0; i < n * in_ld; i++) {
+ printf("%d, ", in_host[i]);
+ }
+ printf("\n");
+ if (pass_2nd) {
+ printf("ROW 2nd pass\n");
+ } else {
+ printf("ROW 2nd fail\n");
+ }
+
+ cublasLtDestroy(ltHandle);
+
+ return pass_1st && pass_2nd;
+}
+
+bool test_COL32() {
+ const constexpr int m = 2;
+ const constexpr int n = 33;
+ const constexpr int in_ld = 4;
+ void *in_dev;
+ cudaMalloc(&in_dev, n * in_ld * sizeof(int8_t));
+
+ int8_t in_host[n * in_ld];
+ int8_t value = 0;
+ for (int i = 0; i < n * in_ld; i++) {
+ if (i % 4 < 2) {
+ in_host[i] = value;
+ value++;
+ } else
+ in_host[i] = 99;
+ }
+ int8_t ref_2nd[n * in_ld];
+ std::memcpy(ref_2nd, in_host, n * in_ld * sizeof(int8_t));
+
+ cudaMemcpy(in_dev, in_host, n * in_ld * sizeof(int8_t),
+ cudaMemcpyHostToDevice);
+
+ cublasLtHandle_t ltHandle;
+  cublasLtCreate(&ltHandle);
+
+ void *out_dev;
+ const constexpr int out_ld = 64;
+ cudaMalloc(&out_dev, out_ld * m * sizeof(int8_t));
+ cudaMemset(out_dev, 0, out_ld * m * sizeof(int8_t));
+ transform(ltHandle, in_dev, in_ld, CUBLASLT_ORDER_COL, out_dev, out_ld,
+ CUBLASLT_ORDER_COL32, m, n);
+
+ int8_t out_host[out_ld * m];
+ cudaMemcpy(out_host, out_dev, out_ld * m * sizeof(int8_t),
+ cudaMemcpyDeviceToHost);
+
+ bool pass_1st = true;
+ int8_t ref_1st[out_ld * m] =
+ {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
+ 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 65, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ for (int i = 0; i < out_ld * m; i++) {
+ if (i % out_ld < n) {
+ if (out_host[i] != ref_1st[i]) {
+ pass_1st = false;
+ break;
+ }
+ }
+ }
+
+ for (int i = 0; i < out_ld * m; i++) {
+ printf("%d, ", out_host[i]);
+ }
+ printf("\n");
+ if (pass_1st) {
+ printf("COL32 1st pass\n");
+ } else {
+ printf("COL32 1st fail\n");
+ }
+
+ cudaMemset(in_dev, 0, n * in_ld * sizeof(int8_t));
+ std::memset(in_host, 0, n * in_ld * sizeof(int8_t));
+ transform(ltHandle, out_dev, out_ld, CUBLASLT_ORDER_COL32, in_dev, in_ld,
+ CUBLASLT_ORDER_COL, m, n);
+ cudaMemcpy(in_host, in_dev, n * in_ld * sizeof(int8_t),
+ cudaMemcpyDeviceToHost);
+
+ bool pass_2nd = true;
+ for (int i = 0; i < n * in_ld; i++) {
+ if (i % in_ld < m) {
+ if (in_host[i] != ref_2nd[i]) {
+ pass_2nd = false;
+ break;
+ }
+ }
+ }
+
+ for (int i = 0; i < n * in_ld; i++) {
+ printf("%d, ", in_host[i]);
+ }
+ printf("\n");
+ if (pass_2nd) {
+ printf("COL32 2nd pass\n");
+ } else {
+ printf("COL32 2nd fail\n");
+ }
+
+ cublasLtDestroy(ltHandle);
+
+ return pass_1st && pass_2nd;
+}
+
+bool test_COL4_4R2_8C() {
+ const constexpr int m = 2;
+ const constexpr int n = 33;
+ const constexpr int in_ld = 4;
+ void *in_dev;
+ cudaMalloc(&in_dev, n * in_ld * sizeof(int8_t));
+
+ int8_t in_host[n * in_ld];
+ int8_t value = 0;
+ for (int i = 0; i < n * in_ld; i++) {
+ if (i % 4 < 2) {
+ in_host[i] = value;
+ value++;
+ } else
+ in_host[i] = 99;
+ }
+ int8_t ref_2nd[n * in_ld];
+ std::memcpy(ref_2nd, in_host, n * in_ld * sizeof(int8_t));
+
+ cudaMemcpy(in_dev, in_host, n * in_ld * sizeof(int8_t),
+ cudaMemcpyHostToDevice);
+
+ cublasLtHandle_t ltHandle;
+  cublasLtCreate(&ltHandle);
+
+ void *out_dev;
+ const constexpr int out_ld = (32 * 8) * 2;
+ cudaMalloc(&out_dev, out_ld * m * sizeof(int8_t));
+ cudaMemset(out_dev, 0, out_ld * m * sizeof(int8_t));
+ transform(ltHandle, in_dev, in_ld, CUBLASLT_ORDER_COL, out_dev, out_ld,
+ CUBLASLT_ORDER_COL4_4R2_8C, m, n);
+
+ int8_t out_host[out_ld * m];
+ cudaMemcpy(out_host, out_dev, out_ld * m * sizeof(int8_t),
+ cudaMemcpyDeviceToHost);
+
+ bool pass_1st = true;
+ int8_t ref_1st[out_ld * m] =
+ {0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 16, 18, 20, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 24, 26, 28, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 32, 34, 36, 38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 40, 42, 44, 46, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 48, 50, 52, 54, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 56, 58, 60, 62, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 17, 19, 21, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 25, 27, 29, 31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 33, 35, 37, 39, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 41, 43, 45, 47, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 49, 51, 53, 55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 57, 59, 61, 63, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 65, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ for (int i = 0; i < out_ld * m; i++) {
+ if (i % out_ld < n) {
+ if (out_host[i] != ref_1st[i]) {
+ pass_1st = false;
+ break;
+ }
+ }
+ }
+
+ for (int i = 0; i < out_ld * m; i++) {
+ printf("%d, ", out_host[i]);
+ }
+ printf("\n");
+ if (pass_1st) {
+ printf("COL4_4R2_8C 1st pass\n");
+ } else {
+ printf("COL4_4R2_8C 1st fail\n");
+ }
+
+ cudaMemset(in_dev, 0, n * in_ld * sizeof(int8_t));
+ std::memset(in_host, 0, n * in_ld * sizeof(int8_t));
+ transform(ltHandle, out_dev, out_ld, CUBLASLT_ORDER_COL4_4R2_8C, in_dev,
+ in_ld, CUBLASLT_ORDER_COL, m, n);
+ cudaMemcpy(in_host, in_dev, n * in_ld * sizeof(int8_t),
+ cudaMemcpyDeviceToHost);
+
+ bool pass_2nd = true;
+ for (int i = 0; i < n * in_ld; i++) {
+ if (i % in_ld < m) {
+ if (in_host[i] != ref_2nd[i]) {
+ pass_2nd = false;
+ break;
+ }
+ }
+ }
+
+ for (int i = 0; i < n * in_ld; i++) {
+ printf("%d, ", in_host[i]);
+ }
+ printf("\n");
+ if (pass_2nd) {
+ printf("COL4_4R2_8C 2nd pass\n");
+ } else {
+ printf("COL4_4R2_8C 2nd fail\n");
+ }
+
+ cublasLtDestroy(ltHandle);
+
+ return pass_1st && pass_2nd;
+}
+
+bool test_COL32_2R_4R4() {
+ const constexpr int m = 2;
+ const constexpr int n = 33;
+ const constexpr int in_ld = 4;
+ void *in_dev;
+ cudaMalloc(&in_dev, n * in_ld * sizeof(int8_t));
+
+ int8_t in_host[n * in_ld];
+ int8_t value = 0;
+ for (int i = 0; i < n * in_ld; i++) {
+ if (i % 4 < 2) {
+ in_host[i] = value;
+ value++;
+ } else
+ in_host[i] = 99;
+ }
+ int8_t ref_2nd[n * in_ld];
+ std::memcpy(ref_2nd, in_host, n * in_ld * sizeof(int8_t));
+
+ cudaMemcpy(in_dev, in_host, n * in_ld * sizeof(int8_t),
+ cudaMemcpyHostToDevice);
+
+ cublasLtHandle_t ltHandle;
+  cublasLtCreate(&ltHandle);
+
+ void *out_dev;
+ const constexpr int out_ld = (32 * 32) * 2;
+ cudaMalloc(&out_dev, out_ld * m * sizeof(int8_t));
+ cudaMemset(out_dev, 0, out_ld * m * sizeof(int8_t));
+ transform(ltHandle, in_dev, in_ld, CUBLASLT_ORDER_COL, out_dev, out_ld,
+ CUBLASLT_ORDER_COL32_2R_4R4, m, n);
+
+ int8_t out_host[out_ld * m];
+ cudaMemcpy(out_host, out_dev, out_ld * m * sizeof(int8_t),
+ cudaMemcpyDeviceToHost);
+
+ bool pass_1st = true;
+ int8_t ref_1st[out_ld * m] =
+ {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
+ 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 65, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+ for (int i = 0; i < out_ld * m; i++) {
+ if (i % out_ld < n) {
+ if (out_host[i] != ref_1st[i]) {
+ pass_1st = false;
+ break;
+ }
+ }
+ }
+
+ for (int i = 0; i < out_ld * m; i++) {
+ printf("%d, ", out_host[i]);
+ }
+ printf("\n");
+ if (pass_1st) {
+ printf("COL32_2R_4R4 1st pass\n");
+ } else {
+ printf("COL32_2R_4R4 1st fail\n");
+ }
+
+ cudaMemset(in_dev, 0, n * in_ld * sizeof(int8_t));
+ std::memset(in_host, 0, n * in_ld * sizeof(int8_t));
+ transform(ltHandle, out_dev, out_ld, CUBLASLT_ORDER_COL32_2R_4R4, in_dev,
+ in_ld, CUBLASLT_ORDER_COL, m, n);
+ cudaMemcpy(in_host, in_dev, n * in_ld * sizeof(int8_t),
+ cudaMemcpyDeviceToHost);
+
+ bool pass_2nd = true;
+ for (int i = 0; i < n * in_ld; i++) {
+ if (i % in_ld < m) {
+ if (in_host[i] != ref_2nd[i]) {
+ pass_2nd = false;
+ break;
+ }
+ }
+ }
+
+ for (int i = 0; i < n * in_ld; i++) {
+ printf("%d, ", in_host[i]);
+ }
+ printf("\n");
+ if (pass_2nd) {
+ printf("COL32_2R_4R4 2nd pass\n");
+ } else {
+ printf("COL32_2R_4R4 2nd fail\n");
+ }
+
+ cublasLtDestroy(ltHandle);
+
+ return pass_1st && pass_2nd;
+}
+
+// Input col_major matrix:
+// 2 rows * 33 columns, ld is 4
+int main() {
+ bool pass = true;
+ pass = test_ROW() && pass;
+ pass = test_COL32() && pass;
+ pass = test_COL4_4R2_8C() && pass;
+ pass = test_COL32_2R_4R4() && pass;
+ return pass ? 0 : 1;
+}
diff --git a/features/feature_case/cusparse/cusparse_10.cu b/features/feature_case/cusparse/cusparse_10.cu
new file mode 100644
index 000000000..10b5abd61
--- /dev/null
+++ b/features/feature_case/cusparse/cusparse_10.cu
@@ -0,0 +1,266 @@
+// ===------- cusparse_10.cu ------------------------------- *- CUDA -* ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// ===----------------------------------------------------------------------===//
+
+#include "cusparse.h"
+
+#include <cmath>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+
+template <class d_data_t>
+struct Data {
+ float *h_data;
+ d_data_t *d_data;
+ int element_num;
+ Data(int element_num) : element_num(element_num) {
+ h_data = (float *)malloc(sizeof(float) * element_num);
+ memset(h_data, 0, sizeof(float) * element_num);
+ cudaMalloc(&d_data, sizeof(d_data_t) * element_num);
+ cudaMemset(d_data, 0, sizeof(d_data_t) * element_num);
+ }
+ Data(float *input_data, int element_num) : element_num(element_num) {
+ h_data = (float *)malloc(sizeof(float) * element_num);
+ cudaMalloc(&d_data, sizeof(d_data_t) * element_num);
+ cudaMemset(d_data, 0, sizeof(d_data_t) * element_num);
+ memcpy(h_data, input_data, sizeof(float) * element_num);
+ }
+ ~Data() {
+ free(h_data);
+ cudaFree(d_data);
+ }
+ void H2D() {
+ d_data_t *h_temp = (d_data_t *)malloc(sizeof(d_data_t) * element_num);
+ memset(h_temp, 0, sizeof(d_data_t) * element_num);
+ from_float_convert(h_data, h_temp);
+ cudaMemcpy(d_data, h_temp, sizeof(d_data_t) * element_num,
+ cudaMemcpyHostToDevice);
+ free(h_temp);
+ }
+ void D2H() {
+ d_data_t *h_temp = (d_data_t *)malloc(sizeof(d_data_t) * element_num);
+ memset(h_temp, 0, sizeof(d_data_t) * element_num);
+ cudaMemcpy(h_temp, d_data, sizeof(d_data_t) * element_num,
+ cudaMemcpyDeviceToHost);
+ to_float_convert(h_temp, h_data);
+ free(h_temp);
+ }
+
+private:
+ inline void from_float_convert(float *in, d_data_t *out) {
+ for (int i = 0; i < element_num; i++)
+ out[i] = in[i];
+ }
+ inline void to_float_convert(d_data_t *in, float *out) {
+ for (int i = 0; i < element_num; i++)
+ out[i] = in[i];
+ }
+};
+template <>
+inline void Data<float2>::from_float_convert(float *in, float2 *out) {
+ for (int i = 0; i < element_num; i++)
+ out[i].x = in[i];
+}
+template <>
+inline void Data<double2>::from_float_convert(float *in, double2 *out) {
+ for (int i = 0; i < element_num; i++)
+ out[i].x = in[i];
+}
+
+template <>
+inline void Data<float2>::to_float_convert(float2 *in, float *out) {
+ for (int i = 0; i < element_num; i++)
+ out[i] = in[i].x;
+}
+template <>
+inline void Data<double2>::to_float_convert(double2 *in, float *out) {
+ for (int i = 0; i < element_num; i++)
+ out[i] = in[i].x;
+}
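+
+// Note on the complex specializations above: H2D() stages through a zeroed
+// d_data_t buffer, so only the real part is populated and the imaginary part
+// stays 0; D2H() reads the real part back.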
+
+bool compare_result(float *expect, float *result, int element_num) {
+ for (int i = 0; i < element_num; i++) {
+ if (std::abs(result[i] - expect[i]) >= 0.05) {
+ return false;
+ }
+ }
+ return true;
+}
+
+bool compare_result(float *expect, float *result, std::vector<int> indices) {
+ for (int i = 0; i < indices.size(); i++) {
+ if (std::abs(result[indices[i]] - expect[indices[i]]) >= 0.05) {
+ return false;
+ }
+ }
+ return true;
+}
+
+bool test_passed = true;
+
+// A
+// 1 4 0 0 0
+// 0 2 3 0 0
+// 5 0 0 7 8
+// 0 0 9 0 6
+void test_cusparseSpMM_COO() {
+ std::vector<float> a_val_vec = {1, 4, 2, 3, 5, 7, 8, 9, 6};
+ Data<float> a_s_val(a_val_vec.data(), 9);
+ Data<double> a_d_val(a_val_vec.data(), 9);
+ Data<float2> a_c_val(a_val_vec.data(), 9);
+ Data<double2> a_z_val(a_val_vec.data(), 9);
+ std::vector<float> a_row_ptr_vec = {1, 1, 2, 2, 3, 3, 3, 4, 4};
+ Data<int> a_row_ptr(a_row_ptr_vec.data(), 9);
+ std::vector<float> a_col_ind_vec = {1, 2, 2, 3, 1, 4, 5, 3, 5};
+ Data<int> a_col_ind(a_col_ind_vec.data(), 9);
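+ // Row/column indices are one-based, matching CUSPARSE_INDEX_BASE_ONE in
+ // the cusparseCreateCoo() calls below.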
+
+ std::vector<float> b_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+ Data<float> b_s(b_vec.data(), 10);
+ Data<double> b_d(b_vec.data(), 10);
+ Data<float2> b_c(b_vec.data(), 10);
+ Data<double2> b_z(b_vec.data(), 10);
+
+ Data<float> c_s(8);
+ Data<double> c_d(8);
+ Data<float2> c_c(8);
+ Data<double2> c_z(8);
+
+ float alpha = 10;
+ Data<float> alpha_s(&alpha, 1);
+ Data<double> alpha_d(&alpha, 1);
+ Data<float2> alpha_c(&alpha, 1);
+ Data<double2> alpha_z(&alpha, 1);
+
+ float beta = 0;
+ Data<float> beta_s(&beta, 1);
+ Data<double> beta_d(&beta, 1);
+ Data<float2> beta_c(&beta, 1);
+ Data<double2> beta_z(&beta, 1);
+
+ cusparseHandle_t handle;
+ cusparseCreate(&handle);
+
+ cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_DEVICE);
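+ // Device pointer mode: alpha and beta are read from device memory, hence
+ // the Data wrappers and the H2D copies below.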
+
+ a_s_val.H2D();
+ a_d_val.H2D();
+ a_c_val.H2D();
+ a_z_val.H2D();
+ a_row_ptr.H2D();
+ a_col_ind.H2D();
+ b_s.H2D();
+ b_d.H2D();
+ b_c.H2D();
+ b_z.H2D();
+ alpha_s.H2D();
+ alpha_d.H2D();
+ alpha_c.H2D();
+ alpha_z.H2D();
+ beta_s.H2D();
+ beta_d.H2D();
+ beta_c.H2D();
+ beta_z.H2D();
+
+ cusparseSpMatDescr_t a_descr_s;
+ cusparseSpMatDescr_t a_descr_d;
+ cusparseSpMatDescr_t a_descr_c;
+ cusparseSpMatDescr_t a_descr_z;
+ cusparseCreateCoo(&a_descr_s, 4, 5, 9, a_row_ptr.d_data, a_col_ind.d_data, a_s_val.d_data, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ONE, CUDA_R_32F);
+ cusparseCreateCoo(&a_descr_d, 4, 5, 9, a_row_ptr.d_data, a_col_ind.d_data, a_d_val.d_data, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ONE, CUDA_R_64F);
+ cusparseCreateCoo(&a_descr_c, 4, 5, 9, a_row_ptr.d_data, a_col_ind.d_data, a_c_val.d_data, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ONE, CUDA_C_32F);
+ cusparseCreateCoo(&a_descr_z, 4, 5, 9, a_row_ptr.d_data, a_col_ind.d_data, a_z_val.d_data, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ONE, CUDA_C_64F);
+
+ cusparseDnMatDescr_t b_descr_s;
+ cusparseDnMatDescr_t b_descr_d;
+ cusparseDnMatDescr_t b_descr_c;
+ cusparseDnMatDescr_t b_descr_z;
+ cusparseCreateDnMat(&b_descr_s, 5, 2, 5, b_s.d_data, CUDA_R_32F, CUSPARSE_ORDER_COL);
+ cusparseCreateDnMat(&b_descr_d, 5, 2, 5, b_d.d_data, CUDA_R_64F, CUSPARSE_ORDER_COL);
+ cusparseCreateDnMat(&b_descr_c, 5, 2, 5, b_c.d_data, CUDA_C_32F, CUSPARSE_ORDER_COL);
+ cusparseCreateDnMat(&b_descr_z, 5, 2, 5, b_z.d_data, CUDA_C_64F, CUSPARSE_ORDER_COL);
+
+ cusparseDnMatDescr_t c_descr_s;
+ cusparseDnMatDescr_t c_descr_d;
+ cusparseDnMatDescr_t c_descr_c;
+ cusparseDnMatDescr_t c_descr_z;
+ cusparseCreateDnMat(&c_descr_s, 4, 2, 4, c_s.d_data, CUDA_R_32F, CUSPARSE_ORDER_COL);
+ cusparseCreateDnMat(&c_descr_d, 4, 2, 4, c_d.d_data, CUDA_R_64F, CUSPARSE_ORDER_COL);
+ cusparseCreateDnMat(&c_descr_c, 4, 2, 4, c_c.d_data, CUDA_C_32F, CUSPARSE_ORDER_COL);
+ cusparseCreateDnMat(&c_descr_z, 4, 2, 4, c_z.d_data, CUDA_C_64F, CUSPARSE_ORDER_COL);
+
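+ // Generic SpMM workflow, once per data type: query the workspace size,
+ // allocate it, run the optional preprocess step, then execute SpMM.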
+ size_t ws_size_s;
+ size_t ws_size_d;
+ size_t ws_size_c;
+ size_t ws_size_z;
+ cusparseSpMM_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_s.d_data, a_descr_s, b_descr_s, beta_s.d_data, c_descr_s, CUDA_R_32F, CUSPARSE_SPMM_ALG_DEFAULT, &ws_size_s);
+ cusparseSpMM_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_d.d_data, a_descr_d, b_descr_d, beta_d.d_data, c_descr_d, CUDA_R_64F, CUSPARSE_SPMM_ALG_DEFAULT, &ws_size_d);
+ cusparseSpMM_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_c.d_data, a_descr_c, b_descr_c, beta_c.d_data, c_descr_c, CUDA_C_32F, CUSPARSE_SPMM_ALG_DEFAULT, &ws_size_c);
+ cusparseSpMM_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_z.d_data, a_descr_z, b_descr_z, beta_z.d_data, c_descr_z, CUDA_C_64F, CUSPARSE_SPMM_ALG_DEFAULT, &ws_size_z);
+
+ void *ws_s = nullptr;
+ void *ws_d = nullptr;
+ void *ws_c = nullptr;
+ void *ws_z = nullptr;
+ cudaMalloc(&ws_s, ws_size_s);
+ cudaMalloc(&ws_d, ws_size_d);
+ cudaMalloc(&ws_c, ws_size_c);
+ cudaMalloc(&ws_z, ws_size_z);
+
+ cusparseSpMM_preprocess(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_s.d_data, a_descr_s, b_descr_s, beta_s.d_data, c_descr_s, CUDA_R_32F, CUSPARSE_SPMM_ALG_DEFAULT, ws_s);
+ cusparseSpMM_preprocess(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_d.d_data, a_descr_d, b_descr_d, beta_d.d_data, c_descr_d, CUDA_R_64F, CUSPARSE_SPMM_ALG_DEFAULT, ws_d);
+ cusparseSpMM_preprocess(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_c.d_data, a_descr_c, b_descr_c, beta_c.d_data, c_descr_c, CUDA_C_32F, CUSPARSE_SPMM_ALG_DEFAULT, ws_c);
+ cusparseSpMM_preprocess(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_z.d_data, a_descr_z, b_descr_z, beta_z.d_data, c_descr_z, CUDA_C_64F, CUSPARSE_SPMM_ALG_DEFAULT, ws_z);
+ cusparseSpMM(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_s.d_data, a_descr_s, b_descr_s, beta_s.d_data, c_descr_s, CUDA_R_32F, CUSPARSE_SPMM_ALG_DEFAULT, ws_s);
+ cusparseSpMM(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_d.d_data, a_descr_d, b_descr_d, beta_d.d_data, c_descr_d, CUDA_R_64F, CUSPARSE_SPMM_ALG_DEFAULT, ws_d);
+ cusparseSpMM(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_c.d_data, a_descr_c, b_descr_c, beta_c.d_data, c_descr_c, CUDA_C_32F, CUSPARSE_SPMM_ALG_DEFAULT, ws_c);
+ cusparseSpMM(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_z.d_data, a_descr_z, b_descr_z, beta_z.d_data, c_descr_z, CUDA_C_64F, CUSPARSE_SPMM_ALG_DEFAULT, ws_z);
+
+ c_s.D2H();
+ c_d.D2H();
+ c_c.D2H();
+ c_z.D2H();
+
+ cudaStreamSynchronize(0);
+
+ cudaFree(ws_s);
+ cudaFree(ws_d);
+ cudaFree(ws_c);
+ cudaFree(ws_z);
+ cusparseDestroySpMat(a_descr_s);
+ cusparseDestroySpMat(a_descr_d);
+ cusparseDestroySpMat(a_descr_c);
+ cusparseDestroySpMat(a_descr_z);
+ cusparseDestroyDnMat(b_descr_s);
+ cusparseDestroyDnMat(b_descr_d);
+ cusparseDestroyDnMat(b_descr_c);
+ cusparseDestroyDnMat(b_descr_z);
+ cusparseDestroyDnMat(c_descr_s);
+ cusparseDestroyDnMat(c_descr_d);
+ cusparseDestroyDnMat(c_descr_c);
+ cusparseDestroyDnMat(c_descr_z);
+ cusparseDestroy(handle);
+
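+ // C = alpha * A * B with col-major B (5x2, ld=5) and C (4x2, ld=4), e.g.
+ // c[0] = 10 * (1*1 + 4*2) = 90 and c[1] = 10 * (2*2 + 3*3) = 130.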
+ float expect_c[8] = {90, 130, 730, 570, 340, 380, 1730, 1320};
+ if (compare_result(expect_c, c_s.h_data, 8) &&
+ compare_result(expect_c, c_d.h_data, 8) &&
+ compare_result(expect_c, c_c.h_data, 8) &&
+ compare_result(expect_c, c_z.h_data, 8))
+ printf("SpMM_COO pass\n");
+ else {
+ printf("SpMM_COO fail\n");
+ test_passed = false;
+ }
+}
+
+int main() {
+ test_cusparseSpMM_COO();
+
+ if (test_passed)
+ return 0;
+ return -1;
+}
diff --git a/features/feature_case/device_info/device_info.cu b/features/feature_case/device_info/device_info.cu
index d8e5abaf7..cd5a6f8fc 100644
--- a/features/feature_case/device_info/device_info.cu
+++ b/features/feature_case/device_info/device_info.cu
@@ -28,6 +28,7 @@ void test1() {
auto maxTexture1D = properties.maxTexture1D;
auto maxTexture2D = properties.maxTexture2D;
auto maxTexture3D = properties.maxTexture3D;
+ auto regsPerBlock = properties.regsPerBlock;
std::cout << "Device ID: " << id << std::endl;
std::cout << "Device UUID: ";
@@ -43,6 +44,7 @@ void test1() {
std::cout << "Device maxTexture3D: " << maxTexture3D[0] << " "
<< properties.maxTexture3D[1] << " " << maxTexture3D[2]
<< std::endl;
+ std::cout << "Device regsPerBlock: " << regsPerBlock << std::endl;
}
int main() {
diff --git a/features/feature_case/math/math-int.cu b/features/feature_case/math/math-int.cu
new file mode 100644
index 000000000..5c09e092b
--- /dev/null
+++ b/features/feature_case/math/math-int.cu
@@ -0,0 +1,196 @@
+// ===--------------- math-int.cu ---------- *- CUDA -* -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//
+// ===---------------------------------------------------------------------===//
+
+#include <iostream>
+#include <vector>
+
+using namespace std;
+
+int passed = 0;
+int failed = 0;
+
+void check(bool IsPassed) {
+ if (IsPassed) {
+ cout << " ---- passed" << endl;
+ passed++;
+ } else {
+ cout << " ---- failed" << endl;
+ failed++;
+ }
+}
+
+template <class T> void printFunc(const string &FuncName, const T &Input) {
+ cout << FuncName << "(" << Input << ") ";
+}
+
+template <class T1, class T2>
+void printFunc(const string &FuncName, const pair<T1, T2> &Input) {
+ cout << FuncName << "(" << Input.first << ", " << Input.second << ")";
+}
+
+template <class T> void checkResult(const T &Expect, const T &Result) {
+ cout << " = " << Result << " (expect " << Expect << ")";
+ check(Result == Expect);
+}
+
+// Integer Mathematical Functions
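+// llmax/llmin, ullmax/ullmin and umax/umin are device-side integer math
+// functions, so each is wrapped in a single-thread kernel and driven by a
+// host helper that checks the managed-memory result after
+// cudaDeviceSynchronize().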
+
+__global__ void _llmax(long long *const Result, long long Input1,
+ long long Input2) {
+ *Result = llmax(Input1, Input2);
+}
+
+void testLlmaxCases(
+ const vector<pair<pair<long long, long long>, long long>> &TestCases) {
+ long long *Result;
+ cudaMallocManaged(&Result, sizeof(*Result));
+ for (const auto &TestCase : TestCases) {
+ _llmax<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+ cudaDeviceSynchronize();
+ printFunc("llmax", TestCase.first);
+ checkResult(TestCase.second, *Result);
+ }
+}
+
+__global__ void _llmin(long long *const Result, long long Input1,
+ long long Input2) {
+ *Result = llmin(Input1, Input2);
+}
+
+void testLlminCases(
+ const vector<pair<pair<long long, long long>, long long>> &TestCases) {
+ long long *Result;
+ cudaMallocManaged(&Result, sizeof(*Result));
+ for (const auto &TestCase : TestCases) {
+ _llmin<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+ cudaDeviceSynchronize();
+ printFunc("llmin", TestCase.first);
+ checkResult(TestCase.second, *Result);
+ }
+}
+
+__global__ void _ullmax(unsigned long long *const Result,
+ unsigned long long Input1, unsigned long long Input2) {
+ *Result = ullmax(Input1, Input2);
+}
+
+void testUllmaxCases(
+ const vector<pair<pair<unsigned long long, unsigned long long>,
+ unsigned long long>> &TestCases) {
+ unsigned long long *Result;
+ cudaMallocManaged(&Result, sizeof(*Result));
+ for (const auto &TestCase : TestCases) {
+ _ullmax<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+ cudaDeviceSynchronize();
+ printFunc("ullmax", TestCase.first);
+ checkResult(TestCase.second, *Result);
+ }
+}
+
+__global__ void _ullmin(unsigned long long *const Result,
+ unsigned long long Input1, unsigned long long Input2) {
+ *Result = ullmin(Input1, Input2);
+}
+
+void testUllminCases(
+ const vector<pair<pair<unsigned long long, unsigned long long>,
+ unsigned long long>> &TestCases) {
+ unsigned long long *Result;
+ cudaMallocManaged(&Result, sizeof(*Result));
+ for (const auto &TestCase : TestCases) {
+ _ullmin<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+ cudaDeviceSynchronize();
+ printFunc("ullmin", TestCase.first);
+ checkResult(TestCase.second, *Result);
+ }
+}
+
+__global__ void _umax(unsigned *const Result, unsigned Input1,
+ unsigned Input2) {
+ *Result = umax(Input1, Input2);
+}
+
+void testUmaxCases(
+ const vector<pair<pair<unsigned, unsigned>, unsigned>> &TestCases) {
+ unsigned *Result;
+ cudaMallocManaged(&Result, sizeof(*Result));
+ for (const auto &TestCase : TestCases) {
+ _umax<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+ cudaDeviceSynchronize();
+ printFunc("umax", TestCase.first);
+ checkResult(TestCase.second, *Result);
+ }
+}
+
+__global__ void _umin(unsigned *const Result, unsigned Input1,
+ unsigned Input2) {
+ *Result = umin(Input1, Input2);
+}
+
+void testUminCases(
+ const vector<pair<pair<unsigned, unsigned>, unsigned>> &TestCases) {
+ unsigned *Result;
+ cudaMallocManaged(&Result, sizeof(*Result));
+ for (const auto &TestCase : TestCases) {
+ _umin<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+ cudaDeviceSynchronize();
+ printFunc("umin", TestCase.first);
+ checkResult(TestCase.second, *Result);
+ }
+}
+
+int main() {
+ testLlmaxCases({
+ {{1, 2}, 2},
+ {{-1, -2}, -1},
+ {{1, -2}, 1},
+ {{-1, 2}, 2},
+ {{45212221678, 221332142421}, 221332142421},
+ });
+ testLlminCases({
+ {{1, 2}, 1},
+ {{-1, -2}, -2},
+ {{1, -2}, -2},
+ {{-1, 2}, -1},
+ {{45212221678, 221332142421}, 45212221678},
+ });
+ testUllmaxCases({
+ {{1, 2}, 2},
+ {{18446744073709551615ULL, 18446744073709551614ULL}, 18446744073709551615ULL},
+ {{1, 18446744073709551614ULL}, 18446744073709551614ULL},
+ {{18446744073709551615ULL, 2}, 18446744073709551615ULL},
+ {{45212221678, 221332142421}, 221332142421},
+ });
+ testUllminCases({
+ {{1, 2}, 1},
+ {{18446744073709551615ULL, 18446744073709551614ULL}, 18446744073709551614ULL},
+ {{1, 18446744073709551614ULL}, 1},
+ {{18446744073709551615ULL, 2}, 2},
+ {{45212221678, 221332142421}, 45212221678},
+ });
+ testUmaxCases({
+ {{1, 2}, 2},
+ {{4294967295, 4294967294}, 4294967295},
+ {{1, 4294967294}, 4294967294},
+ {{4294967295, 2}, 4294967295},
+ {{2262548718, 2288810325}, 2288810325},
+ });
+ testUminCases({
+ {{1, 2}, 1},
+ {{4294967295, 4294967294}, 4294967294},
+ {{1, 4294967294}, 1},
+ {{4294967295, 2}, 2},
+ {{2262548718, 2288810325}, 2262548718},
+ });
+ cout << "passed " << passed << "/" << passed + failed << " cases!" << endl;
+ if (failed) {
+ cout << "failed!" << endl;
+ }
+ return failed;
+}
diff --git a/features/features.xml b/features/features.xml
index 971570894..ff7838f2e 100644
--- a/features/features.xml
+++ b/features/features.xml
@@ -147,6 +147,7 @@
+
@@ -282,6 +283,7 @@
+
@@ -318,10 +320,10 @@
-
-
-
-
+
+
+
+
@@ -340,5 +342,7 @@
+
+
diff --git a/features/test_feature.py b/features/test_feature.py
index 2ef527bf3..bbd6e08c9 100644
--- a/features/test_feature.py
+++ b/features/test_feature.py
@@ -36,10 +36,10 @@
'math-ext-bf16-conv', 'math-ext-double', 'math-ext-float', 'math-ext-half', 'math-ext-half-after11', 'math-ext-half-conv', 'math-ext-half2', 'math-ext-half2-after11', 'math-ext-simd', 'cudnn-activation',
'cudnn-fill', 'cudnn-lrn', 'cudnn-memory', 'cudnn-pooling', 'cudnn-reorder', 'cudnn-scale', 'cudnn-softmax',
'cudnn-sum', 'math-funnelshift', 'thrust-sort_by_key', 'thrust-find', 'thrust-inner_product', 'thrust-reduce_by_key',
- 'math-bf16-conv', 'math-emu-bf16-conv-double', 'math-ext-bf16-conv-double', 'math-half-conv',
+ 'math-bf16-conv', 'math-emu-bf16-conv-double', 'math-ext-bf16-conv-double', 'math-half-conv', 'math-int',
'math-bfloat16', 'libcu_atomic', 'test_shared_memory', 'cudnn-reduction', 'cudnn-binary', 'cudnn-bnp1', 'cudnn-bnp2', 'cudnn-bnp3',
'cudnn-normp1', 'cudnn-normp2', 'cudnn-normp3', 'cudnn-convp1', 'cudnn-convp2', 'cudnn-convp3', 'cudnn-convp4', 'cudnn-convp5', 'cudnn-convp6', 'cudnn-convp7',
- 'cudnn_mutilple_files', "cusparse_1", "cusparse_2", "cusparse_3", "cusparse_4", "cusparse_5", "cusparse_6", "cusparse_7", "cusparse_8", "cusparse_9",
+ 'cudnn_mutilple_files', "cusparse_1", "cusparse_2", "cusparse_3", "cusparse_4", "cusparse_5", "cusparse_6", "cusparse_7", "cusparse_8", "cusparse_9", "cusparse_10",
'cudnn-GetErrorString', 'cub_device_histgram', 'peer_access', 'driver_err_handle',
'cudnn-types', 'cudnn-version', 'cudnn-dropout', 'const_opt',
'constant_attr', 'sync_warp_p2', 'occupancy_calculation',
@@ -60,7 +60,7 @@
'thrust_swap_ranges', 'thrust_uninitialized_fill_n', 'thrust_equal', 'system_atomic', 'thrust_detail_types',
'operator_eq', 'operator_neq', 'operator_lege', 'thrust_system', 'thrust_reverse_copy',
'thrust_device_new_delete', 'thrust_temporary_buffer', 'thrust_malloc_free', 'codepin', 'thrust_unique_count',
- 'thrust_advance_trans_op_itr', 'cuda_stream_query']
+ 'thrust_advance_trans_op_itr', 'cuda_stream_query', "matmul", "transform"]
occupancy_calculation_exper = ['occupancy_calculation']
@@ -166,7 +166,7 @@ def build_test():
'cudnn-binary', 'cudnn-bnp1', 'cudnn-bnp2', 'cudnn-bnp3', 'cudnn-normp1', 'cudnn-normp2', 'cudnn-normp3',
'cudnn-convp1', 'cudnn-convp2', 'cudnn-convp3', 'cudnn-convp4', 'cudnn-convp5', 'cudnn-convp6', 'cudnn-rnn',
'cudnn-GetErrorString', 'cudnn-convp7',
- 'cudnn-types', 'cudnn-version', 'cudnn-dropout'
+ 'cudnn-types', 'cudnn-version', 'cudnn-dropout', 'matmul'
]
no_fast_math_tests = ['math-emu-half-after11', 'math-emu-half2-after11', 'math-ext-half-after11', 'math-ext-half2-after11',
diff --git a/help_function/help_function.xml b/help_function/help_function.xml
index d35b308c9..fc047e53e 100644
--- a/help_function/help_function.xml
+++ b/help_function/help_function.xml
@@ -87,7 +87,7 @@
-
+
@@ -215,5 +215,6 @@
+
diff --git a/help_function/src/blas_gemm_utils_interface.cpp b/help_function/src/blas_gemm_utils_interface.cpp
new file mode 100644
index 000000000..551021b31
--- /dev/null
+++ b/help_function/src/blas_gemm_utils_interface.cpp
@@ -0,0 +1,152 @@
+// ===------ blas_gemm_utils_interface.cpp ----------------- *- C++ -* ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// ===--------------------------------------------------------------------=== //
+
+#include <sycl/sycl.hpp>
+#include <dpct/dpct.hpp>
+#include <dpct/blas_gemm_utils.hpp>
+#include <cstdint>
+
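+// Interface-only smoke test: main() never calls the foo*() functions, so the
+// file just has to compile against the dpct::blas_gemm::experimental API.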
+void foo1 () {
+ dpct::blas_gemm::experimental::descriptor_ptr ltHandle;
+ ltHandle = new dpct::blas_gemm::experimental::descriptor();
+ delete (ltHandle);
+
+ dpct::blas_gemm::experimental::matrix_layout_ptr matLayout;
+ dpct::library_data_t type;
+ uint64_t rows;
+ uint64_t cols;
+ int64_t ld;
+ matLayout =
+ new dpct::blas_gemm::experimental::matrix_layout_t(type, rows, cols, ld);
+
+ dpct::blas_gemm::experimental::matrix_layout_t::attribute attr1;
+ void *buf1;
+ size_t sizeInBytes1;
+ size_t *sizeWritten1;
+ matLayout->get_attribute(attr1, buf1);
+ matLayout->set_attribute(attr1, buf1);
+ delete (matLayout);
+
+ dpct::blas_gemm::experimental::matmul_desc_ptr matmulDesc;
+ dpct::compute_type computeType;
+ dpct::library_data_t scaleType;
+ matmulDesc =
+ new dpct::blas_gemm::experimental::matmul_desc_t(computeType, scaleType);
+
+ dpct::blas_gemm::experimental::matmul_desc_t::attribute attr2;
+ void *buf2;
+ size_t sizeInBytes2;
+ size_t *sizeWritten2;
+ matmulDesc->get_attribute(attr2, buf2);
+ matmulDesc->set_attribute(attr2, buf2);
+ delete (matmulDesc);
+
+ int matmulPreference;
+ void *buf3;
+ size_t sizeInBytes3;
+ size_t *sizeWritten3;
+
+ dpct::blas_gemm::experimental::matrix_layout_ptr Adesc;
+ dpct::blas_gemm::experimental::matrix_layout_ptr Bdesc;
+ dpct::blas_gemm::experimental::matrix_layout_ptr Cdesc;
+ dpct::blas_gemm::experimental::matrix_layout_ptr Ddesc;
+
+ int requestedAlgoCount = 1;
+ int heuristicResultsArray;
+ int returnAlgoCount;
+ returnAlgoCount = 1;
+}
+
+void foo2() {
+ dpct::blas_gemm::experimental::descriptor_ptr lightHandle;
+ dpct::blas_gemm::experimental::matmul_desc_ptr computeDesc;
+ const void *alpha;
+ const void *A;
+ dpct::blas_gemm::experimental::matrix_layout_ptr Adesc;
+ const void *B;
+ dpct::blas_gemm::experimental::matrix_layout_ptr Bdesc;
+ const void *beta;
+ const void *C;
+ dpct::blas_gemm::experimental::matrix_layout_ptr Cdesc;
+ void *D;
+ dpct::blas_gemm::experimental::matrix_layout_ptr Ddesc;
+ const int *algo;
+ void *workspace;
+ size_t workspaceSizeInBytes;
+ dpct::queue_ptr stream;
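+ // algo/workspace mirror the cublasLtMatmul argument list but are not part
+ // of the dpct matmul interface, so they are intentionally unused here.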
+ dpct::blas_gemm::experimental::matmul(lightHandle, computeDesc, alpha, A,
+ Adesc, B, Bdesc, beta, C, Cdesc, D,
+ Ddesc, stream);
+}
+
+void foo3() {
+ dpct::blas_gemm::experimental::order_t a;
+ a = dpct::blas_gemm::experimental::order_t::col;
+ a = dpct::blas_gemm::experimental::order_t::row;
+ a = dpct::blas_gemm::experimental::order_t::col32;
+ a = dpct::blas_gemm::experimental::order_t::col4_4r2_8c;
+ a = dpct::blas_gemm::experimental::order_t::col32_2r_4r4;
+
+ dpct::blas_gemm::experimental::pointer_mode_t b;
+ b = dpct::blas_gemm::experimental::pointer_mode_t::host;
+ b = dpct::blas_gemm::experimental::pointer_mode_t::device;
+ b = dpct::blas_gemm::experimental::pointer_mode_t::device_vector;
+ b = dpct::blas_gemm::experimental::pointer_mode_t::
+ alpha_device_vector_beta_zero;
+ b = dpct::blas_gemm::experimental::pointer_mode_t::
+ alpha_device_vector_beta_host;
+
+ dpct::blas_gemm::experimental::matrix_layout_t::attribute c;
+ c = dpct::blas_gemm::experimental::matrix_layout_t::attribute::type;
+ c = dpct::blas_gemm::experimental::matrix_layout_t::attribute::order;
+ c = dpct::blas_gemm::experimental::matrix_layout_t::attribute::rows;
+ c = dpct::blas_gemm::experimental::matrix_layout_t::attribute::cols;
+ c = dpct::blas_gemm::experimental::matrix_layout_t::attribute::ld;
+
+ dpct::blas_gemm::experimental::matmul_desc_t::attribute d;
+ d = dpct::blas_gemm::experimental::matmul_desc_t::attribute::compute_type;
+ d = dpct::blas_gemm::experimental::matmul_desc_t::attribute::scale_type;
+ d = dpct::blas_gemm::experimental::matmul_desc_t::attribute::pointer_mode;
+ d = dpct::blas_gemm::experimental::matmul_desc_t::attribute::trans_a;
+ d = dpct::blas_gemm::experimental::matmul_desc_t::attribute::trans_b;
+ d = dpct::blas_gemm::experimental::matmul_desc_t::attribute::trans_c;
+ d = dpct::blas_gemm::experimental::matmul_desc_t::attribute::epilogue;
+}
+
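+// foo4 covers the matrix-transform path: create a transform descriptor, set
+// and read back its trans_a attribute, then call matrix_transform().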
+void foo4() {
+ dpct::blas_gemm::experimental::transform_desc_ptr transformDesc;
+ dpct::library_data_t scaleType;
+ transformDesc =
+ new dpct::blas_gemm::experimental::transform_desc_t(scaleType);
+ oneapi::mkl::transpose opT = oneapi::mkl::transpose::trans;
+ size_t sizeWritten;
+ transformDesc->set_attribute(
+ dpct::blas_gemm::experimental::transform_desc_t::attribute::trans_a,
+ &opT);
+ transformDesc->get_attribute(
+ dpct::blas_gemm::experimental::transform_desc_t::attribute::trans_a,
+ &opT);
+ delete (transformDesc);
+
+ dpct::blas_gemm::experimental::descriptor_ptr lightHandle;
+ const void *alpha;
+ const void *A;
+ dpct::blas_gemm::experimental::matrix_layout_ptr Adesc;
+ const void *beta;
+ const void *B;
+ dpct::blas_gemm::experimental::matrix_layout_ptr Bdesc;
+ void *C;
+ dpct::blas_gemm::experimental::matrix_layout_ptr Cdesc;
+ dpct::queue_ptr stream;
+ dpct::blas_gemm::experimental::matrix_transform(
+ transformDesc, alpha, A, Adesc, beta, B, Bdesc, C, Cdesc, stream);
+}
+
+int main() {
+ return 0;
+}
diff --git a/help_function/src/onedpl_test_reduce.cpp b/help_function/src/onedpl_test_reduce.cpp
index f12b5ff5e..2cf06eea5 100644
--- a/help_function/src/onedpl_test_reduce.cpp
+++ b/help_function/src/onedpl_test_reduce.cpp
@@ -18,7 +18,7 @@
#include
#include
-
+#include <limits>
@@ -378,6 +378,72 @@ int main() {
}
+ // Testing calls to dpct::argmin and dpct::argmax functors with unique and equivalent values
+ {
+ auto queue = dpct::get_default_queue();
+ test_name = "oneapi::dpl::reduce with dpct::argmin functor - All values are unique";
+ std::size_t n = 10;
+ sycl::buffer<dpct::key_value_pair<int, int>> input(n);
+ {
+ auto host_acc = input.get_host_access();
+ for (std::size_t i = 0; i < n; ++i)
+ host_acc[i].key = i + 10, host_acc[i].value = i + 20;
+ // inject min and max
+ host_acc[4].key = 9;
+ host_acc[4].value = 8;
+ host_acc[6].key = 101;
+ host_acc[6].value = 99;
+ }
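+ // Base values are 20..29, so the injected entries (value 8 at index 4,
+ // value 99 at index 6) are strict min/max winners.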
+ auto argmin_res = oneapi::dpl::reduce(dpl::execution::make_device_policy(queue),
+ oneapi::dpl::begin(input),
+ oneapi::dpl::end(input),
+ dpct::key_value_pair<int, int>(std::numeric_limits<int>::max(),
+ std::numeric_limits<int>::max()),
+ dpct::argmin());
+
+ failed_tests += ASSERT_EQUAL(test_name, argmin_res.key, 9);
+ failed_tests += ASSERT_EQUAL(test_name, argmin_res.value, 8);
+
+ test_name = "oneapi::dpl::reduce with dpct::argmax functor - All values are unique";
+ auto argmax_res = oneapi::dpl::reduce(dpl::execution::make_device_policy(queue),
+ oneapi::dpl::begin(input),
+ oneapi::dpl::end(input),
+ dpct::key_value_pair<int, int>(std::numeric_limits<int>::min(),
+ std::numeric_limits<int>::min()),
+ dpct::argmax());
+
+ failed_tests += ASSERT_EQUAL(test_name, argmax_res.key, 101);
+ failed_tests += ASSERT_EQUAL(test_name, argmax_res.value, 99);
+
+ test_name = "oneapi::dpl::reduce with dpct::argmin functor - All values are the same";
+ {
+ auto host_acc = input.get_host_access();
+ for (std::size_t i = 0; i < n; ++i)
+ host_acc[i].key = i + 30, host_acc[i].value = 2;
+ }
+ // Expect the key_value_pair with the lower key to be returned when value compares equal
+ argmin_res = oneapi::dpl::reduce(dpl::execution::make_device_policy(queue),
+ oneapi::dpl::begin(input),
+ oneapi::dpl::end(input),
+ dpct::key_value_pair<int, int>(std::numeric_limits<int>::max(),
+ std::numeric_limits<int>::max()),
+ dpct::argmin());
+
+ failed_tests += ASSERT_EQUAL(test_name, argmin_res.key, 30);
+ failed_tests += ASSERT_EQUAL(test_name, argmin_res.value, 2);
+
+ argmax_res = oneapi::dpl::reduce(dpl::execution::make_device_policy(queue),
+ oneapi::dpl::begin(input),
+ oneapi::dpl::end(input),
+ dpct::key_value_pair<int, int>(std::numeric_limits<int>::min(),
+ std::numeric_limits<int>::min()),
+ dpct::argmax());
+
+ // Expect the key_value_pair with the lower key to be returned when value compares equal
+ failed_tests += ASSERT_EQUAL(test_name, argmax_res.key, 30);
+ failed_tests += ASSERT_EQUAL(test_name, argmax_res.value, 2);
+ }
+
std::cout << std::endl << failed_tests << " failing test(s) detected." << std::endl;
if (failed_tests == 0) {
return 0;
diff --git a/help_function/src/sparse_utils_1.cpp b/help_function/src/sparse_utils_1.cpp
index 7279dac85..a9debf19c 100644
--- a/help_function/src/sparse_utils_1.cpp
+++ b/help_function/src/sparse_utils_1.cpp
@@ -71,6 +71,10 @@ bool test1() {
if (ib != oneapi::mkl::index_base::one)
return false;
+ dpct::sparse::matrix_format mf;
+ mf = dpct::sparse::matrix_format::csr;
+ mf = dpct::sparse::matrix_format::coo;
+
return true;
}
diff --git a/help_function/test_help.py b/help_function/test_help.py
index b6c585270..52588464a 100644
--- a/help_function/test_help.py
+++ b/help_function/test_help.py
@@ -45,7 +45,7 @@ def build_test():
"dnnl_utils_batch_normalization_2", "dnnl_utils_batch_normalization_3", "dnnl_utils_convolution_1",
"dnnl_utils_convolution_2", "dnnl_utils_convolution_3", "dnnl_utils_convolution_4", "dnnl_utils_convolution_5",
"dnnl_utils_normalization_1", "dnnl_utils_normalization_2", "dnnl_utils_normalization_3", "dnnl_utils_rnn",
- "dnnl_utils_version", "dnnl_utils_dropout"]
+ "dnnl_utils_version", "dnnl_utils_dropout", "blas_gemm_utils_interface"]
fft_cases = ["fft_utils_engine_buffer", "fft_utils_engine_usm", "fft_workspace_interface", "fft_set_workspace"]
lapack_cases = ["lapack_utils_buffer", "lapack_utils_usm"]
rng_cases = ["rng_generator", "rng_generator_vec_size_1", "rng_host"]