diff --git a/behavior_tests/src/query-api-mapping/do_test.py b/behavior_tests/src/query-api-mapping/do_test.py
index fea3ef5c7..712b99375 100644
--- a/behavior_tests/src/query-api-mapping/do_test.py
+++ b/behavior_tests/src/query-api-mapping/do_test.py
@@ -182,7 +182,7 @@ def migrate_test():
         ],
         [],
         [
-            "dpct::sparse::spmm(*handle, transa, transb, alpha, a, b, beta, c, computetype);"
+            "dpct::sparse::spmm(handle->get_queue(), transa, transb, alpha, a, b, beta, c, computetype);"
         ],
     ],
     [  # NCCL
diff --git a/features/config/TEMPLATE_cublasLt.xml b/features/config/TEMPLATE_cublasLt.xml
new file mode 100644
index 000000000..cb15eeed1
--- /dev/null
+++ b/features/config/TEMPLATE_cublasLt.xml
@@ -0,0 +1,13 @@
+
+
+
+  test
+
+
+
+
+
+
+
+
+
diff --git a/features/config/TEMPLATE_cusparse_after_112_cpu.xml b/features/config/TEMPLATE_cusparse_after_112_cpu.xml
new file mode 100644
index 000000000..5776a3d74
--- /dev/null
+++ b/features/config/TEMPLATE_cusparse_after_112_cpu.xml
@@ -0,0 +1,14 @@
+
+
+
+  test
+
+
+
+
+
+
+
+
+
+
diff --git a/features/feature_case/cublasLt/matmul.cu b/features/feature_case/cublasLt/matmul.cu
new file mode 100644
index 000000000..0cf382f7a
--- /dev/null
+++ b/features/feature_case/cublasLt/matmul.cu
@@ -0,0 +1,754 @@
+// ===------------ matmul.cu ----------------------------- *- CUDA -* ----=== //
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// ===--------------------------------------------------------------------=== //
+
+#include <cublasLt.h>
+
+#include <cstdint>
+#include <cstdio>
+
+const constexpr int COL_TURING = 0;
+const constexpr int COL_AMPERE = 1;
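+
+// COL_TURING / COL_AMPERE tag the tiled layout used for the B matrix in
+// igemmlt below: COL_TURING selects CUBLASLT_ORDER_COL4_4R2_8C and
+// COL_AMPERE selects CUBLASLT_ORDER_COL32_2R_4R4. The A and C matrices
+// always use CUBLASLT_ORDER_COL32 in these tests.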
+
+// The original source of below two functions was under the license below:
+// Copyright (c) Facebook, Inc. and its affiliates.
+//
+// This source code is licensed under the MIT license found in the
+// LICENSE file in the root directory of this source tree.
+//
+// Repo: https://github.com/TimDettmers/bitsandbytes.git
+inline int checkCublasStatus(cublasStatus_t status) {
+  if (status != CUBLAS_STATUS_SUCCESS) {
+    printf("cuBLAS API failed with status %d\n", status);
+    //throw std::logic_error("cuBLAS API failed");
+    return 1;
+  }
+  return 0;
+}
+
+template <int FORMATB, int DTYPE_OUT, int SCALE_ROWS>
+int igemmlt(cublasLtHandle_t ltHandle, int m, int n, int k, const int8_t *A, const int8_t *B, void *C, float *row_scale, int lda, int ldb, int ldc)
+{
+  int has_error = 0;
+  cublasLtMatmulDesc_t matmulDesc = NULL;
+  cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL;
+  cublasOperation_t opT = CUBLAS_OP_T;
+  cublasLtPointerMode_t alphaVec = CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO;
+  cublasLtOrder_t col32 = CUBLASLT_ORDER_COL32;
+  cublasLtOrder_t col_turing = CUBLASLT_ORDER_COL4_4R2_8C;
+  cublasLtOrder_t col_ampere = CUBLASLT_ORDER_COL32_2R_4R4;
+
+  has_error |= checkCublasStatus(cublasLtMatrixLayoutCreate(&Adesc, CUDA_R_8I, m, k, lda));
+  has_error |= checkCublasStatus(cublasLtMatrixLayoutCreate(&Bdesc, CUDA_R_8I, n, k, ldb));
+
+  has_error |= checkCublasStatus(cublasLtMatrixLayoutSetAttribute(Adesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
+  if (FORMATB == COL_TURING)
+    has_error |= checkCublasStatus(cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &col_turing, sizeof(col_turing)));
+  else
+    has_error |= checkCublasStatus(cublasLtMatrixLayoutSetAttribute(Bdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &col_ampere, sizeof(col_ampere)));
+
+  if (DTYPE_OUT == 32)
+  {
+    has_error |= checkCublasStatus(cublasLtMatmulDescCreate(&matmulDesc, CUBLAS_COMPUTE_32I, CUDA_R_32I));
+    has_error |= checkCublasStatus(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opT, sizeof(opT)));
+    has_error |= checkCublasStatus(cublasLtMatrixLayoutCreate(&Cdesc, CUDA_R_32I, m, n, ldc));
+    has_error |= checkCublasStatus(cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
+    int alpha = 1, beta = 0;
+    has_error |= checkCublasStatus(cublasLtMatmul(ltHandle, matmulDesc, &alpha, A, Adesc, B, Bdesc, &beta, (int32_t*)C, Cdesc, (int32_t*)C, Cdesc, NULL, NULL, 0, 0));
+  }
+  else
+  {
+    has_error |= checkCublasStatus(cublasLtMatmulDescCreate(&matmulDesc, CUBLAS_COMPUTE_32I, CUDA_R_32F));
+    has_error |= checkCublasStatus(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_TRANSB, &opT, sizeof(opT)));
+    has_error |= checkCublasStatus(cublasLtMatrixLayoutCreate(&Cdesc, CUDA_R_8I, m, n, ldc));
+    has_error |= checkCublasStatus(cublasLtMatrixLayoutSetAttribute(Cdesc, CUBLASLT_MATRIX_LAYOUT_ORDER, &col32, sizeof(col32)));
+    if (!SCALE_ROWS)
+    {
+      float alpha = 1.0f, beta = 0.0f;
+      has_error |= checkCublasStatus(cublasLtMatmul(ltHandle, matmulDesc, &alpha, A, Adesc, B, Bdesc, &beta, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, NULL, NULL, 0, 0));
+    }
+    else
+    {
+      has_error |= checkCublasStatus(cublasLtMatmulDescSetAttribute(matmulDesc, CUBLASLT_MATMUL_DESC_POINTER_MODE, &alphaVec, sizeof(alphaVec)));
+      has_error |= checkCublasStatus(cublasLtMatmul(ltHandle, matmulDesc, row_scale, A, Adesc, B, Bdesc, NULL, (int8_t*)C, Cdesc, (int8_t*)C, Cdesc, NULL, NULL, 0, 0));
+    }
+  }
+
+  cudaStreamSynchronize(0);
+
+  if (Cdesc) has_error |= checkCublasStatus(cublasLtMatrixLayoutDestroy(Cdesc));
+  if (Bdesc) has_error |= checkCublasStatus(cublasLtMatrixLayoutDestroy(Bdesc));
+  if (Adesc) has_error |= checkCublasStatus(cublasLtMatrixLayoutDestroy(Adesc));
+  if (matmulDesc) has_error |= checkCublasStatus(cublasLtMatmulDescDestroy(matmulDesc));
+  if (has_error == 1)
+    printf("error detected");
+
+  return has_error;
+}
+void transform(cublasLtHandle_t ltHandle, const void *in, int ld_in,
+               cublasLtMatrixLayout_t layout_in, void *out, int ld_out,
+               cublasLtMatrixLayout_t layout_out) {
+  cublasLtMatrixTransformDesc_t transform_desc = NULL;
+  cublasLtMatrixTransformDescCreate(&transform_desc, CUDA_R_32F);
+  float alpha = 1.0f, beta = 0.0f;
+  cublasLtMatrixTransform(ltHandle, transform_desc, &alpha, in, layout_in,
+                          &beta, NULL, NULL, out, layout_out, 0);
+  cublasLtMatrixTransformDescDestroy(transform_desc);
+}
+
+// igemmlt
+bool test1() {
+  cublasLtHandle_t ltHandle;
+  cublasLtCreate(&ltHandle);
+  const constexpr int m = 4;
+  const constexpr int n = 2;
+  const constexpr int k = 3;
+  int lda = m;
+  int ldb = n;
+  int ldc = m;
+  void *Adev;
+  void *Bdev;
+  void *Cdev;
+  cudaMalloc(&Adev, m * k * sizeof(int8_t));
+  cudaMalloc(&Bdev, n * k * sizeof(int8_t));
+  cudaMalloc(&Cdev, m * n * sizeof(int32_t));
+
+  int8_t Ahost[m * k] = {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17};
+  int8_t Bhost[n * k] = {5, 4, -3, -2, 1, 0};
+
+  cudaMemcpy(Adev, Ahost, m * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+  cudaMemcpy(Bdev, Bhost, n * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+
+  cublasLtMatrixLayout_t Adesc_col_major = NULL, Bdesc_col_major = NULL,
+                         Cdesc_col_major = NULL;
+  cublasLtMatrixLayoutCreate(&Adesc_col_major, CUDA_R_8I, m, k, lda);
+  cublasLtMatrixLayoutCreate(&Bdesc_col_major, CUDA_R_8I, n, k, ldb);
+  cublasLtMatrixLayoutCreate(&Cdesc_col_major, CUDA_R_32I, m, n, ldc);
+
+  // Convert A and B
+  cublasLtMatrixLayout_t Adesc_col32 = NULL, Bdesc_col4_4r2_8c = NULL,
+                         Cdesc_col32 = NULL;
+  int8_t *A_col32, *B_col4_4r2_8c;
+  int32_t *C_col32;
+  cudaMalloc(&A_col32, m * 32 * sizeof(std::int8_t));
+  cudaMalloc(&B_col4_4r2_8c, ((n + 8 - 1) / 8) * 8 * 32 * sizeof(std::int8_t));
+  cudaMalloc(&C_col32, m * 32 * sizeof(std::int32_t));
+  cublasLtMatrixLayoutCreate(&Adesc_col32, CUDA_R_8I, m, k, m * 32);
+  cublasLtMatrixLayoutCreate(&Bdesc_col4_4r2_8c, CUDA_R_8I, k, n,
+                             ((n + 8 - 1) / 8) * 8 * 32);
+  cublasLtMatrixLayoutCreate(&Cdesc_col32, CUDA_R_32I, m, n, m * 32);
+  cublasLtOrder_t col32 = CUBLASLT_ORDER_COL32;
+  cublasLtOrder_t col4_4r2_8c = CUBLASLT_ORDER_COL4_4R2_8C;
+  cublasLtMatrixLayoutSetAttribute(Adesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+                                   &col32, sizeof(col32));
+  cublasLtMatrixLayoutSetAttribute(Bdesc_col4_4r2_8c,
+                                   CUBLASLT_MATRIX_LAYOUT_ORDER, &col4_4r2_8c,
+                                   sizeof(col4_4r2_8c));
+  cublasLtMatrixLayoutSetAttribute(Cdesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+                                   &col32, sizeof(col32));
+
+  transform(ltHandle, Adev, lda, Adesc_col_major, A_col32, m * 32, Adesc_col32);
+  transform(ltHandle, Bdev, ldb, Bdesc_col_major, B_col4_4r2_8c, 8 * 32,
+            Bdesc_col4_4r2_8c);
+
+  // Matmul
+  igemmlt<COL_TURING, 32, 0>(ltHandle, m, n, k, A_col32, B_col4_4r2_8c,
+                             C_col32, nullptr, m * 32,
+                             ((n + 8 - 1) / 8) * 8 * 32, m * 32);
+
+  // Convert C
+  transform(ltHandle, C_col32, m * 32, Cdesc_col32, Cdev, ldc, Cdesc_col_major);
+  cudaStreamSynchronize(0);
+
+  // Check result
+  int32_t Chost[m * n];
+  cudaMemcpy(Chost, Cdev, m * n * sizeof(int32_t), cudaMemcpyDeviceToHost);
+
+  bool error = false;
+  int32_t C_ref[m * n] = {14, 17, 20, 23, 4, 6, 8, 10};
+  for (int i = 0; i < m * n; i++) {
+    if (Chost[i] != C_ref[i]) {
+      error = true;
+      break;
+    }
+  }
+  printf("c:\n");
+  for (int i = 0; i < m * n; i++)
+    printf("%d, ", Chost[i]);
+  printf("\n");
+
+  if (error) {
+    printf("error\n");
+  } else {
+    printf("success\n");
+  }
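+
+  // Clean up: release the cuBLASLt handle, the layout descriptors, and the
+  // device buffers used by this test.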
+  cublasLtDestroy(ltHandle);
+  cublasLtMatrixLayoutDestroy(Adesc_col32);
+  cublasLtMatrixLayoutDestroy(Bdesc_col4_4r2_8c);
+  cublasLtMatrixLayoutDestroy(Cdesc_col32);
+  cublasLtMatrixLayoutDestroy(Adesc_col_major);
+  cublasLtMatrixLayoutDestroy(Bdesc_col_major);
+  cublasLtMatrixLayoutDestroy(Cdesc_col_major);
+  cudaFree(Adev);
+  cudaFree(Bdev);
+  cudaFree(Cdev);
+
+  return !error;
+}
+
+// igemmlt
+bool test2() {
+  cublasLtHandle_t ltHandle;
+  cublasLtCreate(&ltHandle);
+  const constexpr int m = 4;
+  const constexpr int n = 2;
+  const constexpr int k = 3;
+  int lda = m;
+  int ldb = n;
+  int ldc = m;
+  void *Adev;
+  void *Bdev;
+  void *Cdev;
+  cudaMalloc(&Adev, m * k * sizeof(int8_t));
+  cudaMalloc(&Bdev, n * k * sizeof(int8_t));
+  cudaMalloc(&Cdev, m * n * sizeof(int8_t));
+
+  int8_t Ahost[m * k] = {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17};
+  int8_t Bhost[n * k] = {5, 4, -3, -2, 1, 0};
+
+  cudaMemcpy(Adev, Ahost, m * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+  cudaMemcpy(Bdev, Bhost, n * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+
+  cublasLtMatrixLayout_t Adesc_col_major = NULL, Bdesc_col_major = NULL,
+                         Cdesc_col_major = NULL;
+  cublasLtMatrixLayoutCreate(&Adesc_col_major, CUDA_R_8I, m, k, lda);
+  cublasLtMatrixLayoutCreate(&Bdesc_col_major, CUDA_R_8I, n, k, ldb);
+  cublasLtMatrixLayoutCreate(&Cdesc_col_major, CUDA_R_8I, m, n, ldc);
+
+  // Convert A and B
+  cublasLtMatrixLayout_t Adesc_col32 = NULL, Bdesc_col4_4r2_8c = NULL,
+                         Cdesc_col32 = NULL;
+  int8_t *A_col32, *B_col4_4r2_8c;
+  int8_t *C_col32;
+  cudaMalloc(&A_col32, m * 32 * sizeof(std::int8_t));
+  cudaMalloc(&B_col4_4r2_8c, ((n + 8 - 1) / 8) * 8 * 32 * sizeof(std::int8_t));
+  cudaMalloc(&C_col32, m * 32 * sizeof(std::int8_t));
+  cublasLtMatrixLayoutCreate(&Adesc_col32, CUDA_R_8I, m, k, m * 32);
+  cublasLtMatrixLayoutCreate(&Bdesc_col4_4r2_8c, CUDA_R_8I, k, n,
+                             ((n + 8 - 1) / 8) * 8 * 32);
+  cublasLtMatrixLayoutCreate(&Cdesc_col32, CUDA_R_8I, m, n, m * 32);
+  cublasLtOrder_t col32 = CUBLASLT_ORDER_COL32;
+  cublasLtOrder_t col4_4r2_8c = CUBLASLT_ORDER_COL4_4R2_8C;
+  cublasLtMatrixLayoutSetAttribute(Adesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+                                   &col32, sizeof(col32));
+  cublasLtMatrixLayoutSetAttribute(Bdesc_col4_4r2_8c,
+                                   CUBLASLT_MATRIX_LAYOUT_ORDER, &col4_4r2_8c,
+                                   sizeof(col4_4r2_8c));
+  cublasLtMatrixLayoutSetAttribute(Cdesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+                                   &col32, sizeof(col32));
+
+  transform(ltHandle, Adev, lda, Adesc_col_major, A_col32, m * 32, Adesc_col32);
+  transform(ltHandle, Bdev, ldb, Bdesc_col_major, B_col4_4r2_8c, 8 * 32,
+            Bdesc_col4_4r2_8c);
+
+  // Matmul
+  igemmlt<COL_TURING, 8, 0>(ltHandle, m, n, k, A_col32, B_col4_4r2_8c,
+                            C_col32, nullptr, m * 32,
+                            ((n + 8 - 1) / 8) * 8 * 32, m * 32);
+
+  // Convert C
+  transform(ltHandle, C_col32, m * 32, Cdesc_col32, Cdev, ldc, Cdesc_col_major);
+  cudaStreamSynchronize(0);
+
+  // Check result
+  int8_t Chost[m * n];
+  cudaMemcpy(Chost, Cdev, m * n * sizeof(int8_t), cudaMemcpyDeviceToHost);
+
+  bool error = false;
+  int8_t C_ref[m * n] = {14, 17, 20, 23, 4, 6, 8, 10};
+  for (int i = 0; i < m * n; i++) {
+    if (Chost[i] != C_ref[i]) {
+      error = true;
+      break;
+    }
+  }
+  printf("c:\n");
+  for (int i = 0; i < m * n; i++)
+    printf("%d, ", Chost[i]);
+  printf("\n");
+
+  if (error) {
+    printf("error\n");
+  } else {
+    printf("success\n");
+  }
+
+  cublasLtDestroy(ltHandle);
+  cublasLtMatrixLayoutDestroy(Adesc_col32);
+  cublasLtMatrixLayoutDestroy(Bdesc_col4_4r2_8c);
+  cublasLtMatrixLayoutDestroy(Cdesc_col32);
+  cublasLtMatrixLayoutDestroy(Adesc_col_major);
+  cublasLtMatrixLayoutDestroy(Bdesc_col_major);
+  cublasLtMatrixLayoutDestroy(Cdesc_col_major);
+  cudaFree(Adev);
+  cudaFree(Bdev);
+  cudaFree(Cdev);
+
+  return !error;
+}
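+
+// test3 (and test6 below) exercise the SCALE_ROWS path: alpha is a per-row
+// scale vector in device-accessible memory, selected via
+// CUBLASLT_POINTER_MODE_ALPHA_DEVICE_VECTOR_BETA_ZERO, so beta is
+// implicitly zero.
+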
+// igemmlt
+bool test3() {
+  cublasLtHandle_t ltHandle;
+  cublasLtCreate(&ltHandle);
+  const constexpr int m = 4;
+  const constexpr int n = 2;
+  const constexpr int k = 3;
+  int lda = m;
+  int ldb = n;
+  int ldc = m;
+  void *Adev;
+  void *Bdev;
+  void *Cdev;
+  cudaMalloc(&Adev, m * k * sizeof(int8_t));
+  cudaMalloc(&Bdev, n * k * sizeof(int8_t));
+  cudaMalloc(&Cdev, m * n * sizeof(int8_t));
+
+  int8_t Ahost[m * k] = {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17};
+  int8_t Bhost[n * k] = {5, 4, -3, -2, 1, 0};
+
+  cudaMemcpy(Adev, Ahost, m * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+  cudaMemcpy(Bdev, Bhost, n * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+
+  cublasLtMatrixLayout_t Adesc_col_major = NULL, Bdesc_col_major = NULL,
+                         Cdesc_col_major = NULL;
+  cublasLtMatrixLayoutCreate(&Adesc_col_major, CUDA_R_8I, m, k, lda);
+  cublasLtMatrixLayoutCreate(&Bdesc_col_major, CUDA_R_8I, n, k, ldb);
+  cublasLtMatrixLayoutCreate(&Cdesc_col_major, CUDA_R_8I, m, n, ldc);
+
+  // Convert A and B
+  cublasLtMatrixLayout_t Adesc_col32 = NULL, Bdesc_col4_4r2_8c = NULL,
+                         Cdesc_col32 = NULL;
+  int8_t *A_col32, *B_col4_4r2_8c;
+  int8_t *C_col32;
+  cudaMalloc(&A_col32, m * 32 * sizeof(std::int8_t));
+  cudaMalloc(&B_col4_4r2_8c, ((n + 8 - 1) / 8) * 8 * 32 * sizeof(std::int8_t));
+  cudaMalloc(&C_col32, m * 32 * sizeof(std::int8_t));
+  cublasLtMatrixLayoutCreate(&Adesc_col32, CUDA_R_8I, m, k, m * 32);
+  cublasLtMatrixLayoutCreate(&Bdesc_col4_4r2_8c, CUDA_R_8I, k, n,
+                             ((n + 8 - 1) / 8) * 8 * 32);
+  cublasLtMatrixLayoutCreate(&Cdesc_col32, CUDA_R_8I, m, n, m * 32);
+  cublasLtOrder_t col32 = CUBLASLT_ORDER_COL32;
+  cublasLtOrder_t col4_4r2_8c = CUBLASLT_ORDER_COL4_4R2_8C;
+  cublasLtMatrixLayoutSetAttribute(Adesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+                                   &col32, sizeof(col32));
+  cublasLtMatrixLayoutSetAttribute(Bdesc_col4_4r2_8c,
+                                   CUBLASLT_MATRIX_LAYOUT_ORDER, &col4_4r2_8c,
+                                   sizeof(col4_4r2_8c));
+  cublasLtMatrixLayoutSetAttribute(Cdesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+                                   &col32, sizeof(col32));
+
+  transform(ltHandle, Adev, lda, Adesc_col_major, A_col32, m * 32, Adesc_col32);
+  transform(ltHandle, Bdev, ldb, Bdesc_col_major, B_col4_4r2_8c, 8 * 32,
+            Bdesc_col4_4r2_8c);
+
+  float *alpha;
+  cudaMallocManaged(&alpha, 4 * sizeof(float));
+  alpha[0] = 0;
+  alpha[1] = 1;
+  alpha[2] = 2;
+  alpha[3] = 3;
+
+  // Matmul
+  igemmlt<COL_TURING, 8, 1>(ltHandle, m, n, k, A_col32, B_col4_4r2_8c,
+                            C_col32, alpha, m * 32,
+                            ((n + 8 - 1) / 8) * 8 * 32, m * 32);
+
+  // Convert C
+  transform(ltHandle, C_col32, m * 32, Cdesc_col32, Cdev, ldc, Cdesc_col_major);
+  cudaStreamSynchronize(0);
+
+  // Check result
+  int8_t Chost[m * n];
+  cudaMemcpy(Chost, Cdev, m * n * sizeof(int8_t), cudaMemcpyDeviceToHost);
+
+  bool error = false;
+  int8_t C_ref[m * n] = {0, 17, 40, 69, 0, 6, 16, 30};
+  for (int i = 0; i < m * n; i++) {
+    if (Chost[i] != C_ref[i]) {
+      error = true;
+      break;
+    }
+  }
+  printf("c:\n");
+  for (int i = 0; i < m * n; i++)
+    printf("%d, ", Chost[i]);
+  printf("\n");
+
+  if (error) {
+    printf("error\n");
+  } else {
+    printf("success\n");
+  }
+
+  cublasLtDestroy(ltHandle);
+  cublasLtMatrixLayoutDestroy(Adesc_col32);
+  cublasLtMatrixLayoutDestroy(Bdesc_col4_4r2_8c);
+  cublasLtMatrixLayoutDestroy(Cdesc_col32);
+  cublasLtMatrixLayoutDestroy(Adesc_col_major);
+  cublasLtMatrixLayoutDestroy(Bdesc_col_major);
+  cublasLtMatrixLayoutDestroy(Cdesc_col_major);
+  cudaFree(Adev);
+  cudaFree(Bdev);
+  cudaFree(Cdev);
+  cudaFree(alpha);
+
+  return !error;
+}
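+
+// test4-test6 repeat the same three checks for the Ampere B layout
+// (CUBLASLT_ORDER_COL32_2R_4R4): int32 output, int8 output, and int8
+// output with per-row alpha scaling.
+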
+// igemmlt
+bool test4() {
+  cublasLtHandle_t ltHandle;
+  cublasLtCreate(&ltHandle);
+  const constexpr int m = 4;
+  const constexpr int n = 2;
+  const constexpr int k = 3;
+  int lda = m;
+  int ldb = n;
+  int ldc = m;
+  void *Adev;
+  void *Bdev;
+  void *Cdev;
+  cudaMalloc(&Adev, m * k * sizeof(int8_t));
+  cudaMalloc(&Bdev, n * k * sizeof(int8_t));
+  cudaMalloc(&Cdev, m * n * sizeof(int32_t));
+
+  int8_t Ahost[m * k] = {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17};
+  int8_t Bhost[n * k] = {5, 4, -3, -2, 1, 0};
+
+  cudaMemcpy(Adev, Ahost, m * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+  cudaMemcpy(Bdev, Bhost, n * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+
+  cublasLtMatrixLayout_t Adesc_col_major = NULL, Bdesc_col_major = NULL,
+                         Cdesc_col_major = NULL;
+  cublasLtMatrixLayoutCreate(&Adesc_col_major, CUDA_R_8I, m, k, lda);
+  cublasLtMatrixLayoutCreate(&Bdesc_col_major, CUDA_R_8I, n, k, ldb);
+  cublasLtMatrixLayoutCreate(&Cdesc_col_major, CUDA_R_32I, m, n, ldc);
+
+  // Convert A and B
+  cublasLtMatrixLayout_t Adesc_col32 = NULL, Bdesc_col32_2r_4r4 = NULL,
+                         Cdesc_col32 = NULL;
+  int8_t *A_col32, *B_col32_2r_4r4;
+  int32_t *C_col32;
+  cudaMalloc(&A_col32, m * 32 * sizeof(std::int8_t));
+  cudaMalloc(&B_col32_2r_4r4,
+             ((n + 32 - 1) / 32) * 32 * 32 * sizeof(std::int8_t));
+  cudaMalloc(&C_col32, m * 32 * sizeof(std::int32_t));
+  cublasLtMatrixLayoutCreate(&Adesc_col32, CUDA_R_8I, m, k, m * 32);
+  cublasLtMatrixLayoutCreate(&Bdesc_col32_2r_4r4, CUDA_R_8I, k, n,
+                             ((n + 32 - 1) / 32) * 32 * 32);
+  cublasLtMatrixLayoutCreate(&Cdesc_col32, CUDA_R_32I, m, n, m * 32);
+  cublasLtOrder_t col32 = CUBLASLT_ORDER_COL32;
+  cublasLtOrder_t col32_2r_4r4 = CUBLASLT_ORDER_COL32_2R_4R4;
+  cublasLtMatrixLayoutSetAttribute(Adesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+                                   &col32, sizeof(col32));
+  cublasLtMatrixLayoutSetAttribute(Bdesc_col32_2r_4r4,
+                                   CUBLASLT_MATRIX_LAYOUT_ORDER, &col32_2r_4r4,
+                                   sizeof(col32_2r_4r4));
+  cublasLtMatrixLayoutSetAttribute(Cdesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+                                   &col32, sizeof(col32));
+
+  transform(ltHandle, Adev, lda, Adesc_col_major, A_col32, m * 32, Adesc_col32);
+  transform(ltHandle, Bdev, ldb, Bdesc_col_major, B_col32_2r_4r4, 8 * 32,
+            Bdesc_col32_2r_4r4);
+
+  // Matmul
+  igemmlt<COL_AMPERE, 32, 0>(ltHandle, m, n, k, A_col32, B_col32_2r_4r4,
+                             C_col32, nullptr, m * 32,
+                             ((n + 8 - 1) / 8) * 8 * 32, m * 32);
+
+  // Convert C
+  transform(ltHandle, C_col32, m * 32, Cdesc_col32, Cdev, ldc, Cdesc_col_major);
+  cudaStreamSynchronize(0);
+
+  // Check result
+  int32_t Chost[m * n];
+  cudaMemcpy(Chost, Cdev, m * n * sizeof(int32_t), cudaMemcpyDeviceToHost);
+
+  bool error = false;
+  int32_t C_ref[m * n] = {14, 17, 20, 23, 4, 6, 8, 10};
+  for (int i = 0; i < m * n; i++) {
+    if (Chost[i] != C_ref[i]) {
+      error = true;
+      break;
+    }
+  }
+  printf("c:\n");
+  for (int i = 0; i < m * n; i++)
+    printf("%d, ", Chost[i]);
+  printf("\n");
+
+  if (error) {
+    printf("error\n");
+  } else {
+    printf("success\n");
+  }
+
+  cublasLtDestroy(ltHandle);
+  cublasLtMatrixLayoutDestroy(Adesc_col32);
+  cublasLtMatrixLayoutDestroy(Bdesc_col32_2r_4r4);
+  cublasLtMatrixLayoutDestroy(Cdesc_col32);
+  cublasLtMatrixLayoutDestroy(Adesc_col_major);
+  cublasLtMatrixLayoutDestroy(Bdesc_col_major);
+  cublasLtMatrixLayoutDestroy(Cdesc_col_major);
+  cudaFree(Adev);
+  cudaFree(Bdev);
+  cudaFree(Cdev);
+
+  return !error;
+}
+
+// igemmlt
+bool test5() {
+  cublasLtHandle_t ltHandle;
+  cublasLtCreate(&ltHandle);
+  const constexpr int m = 4;
+  const constexpr int n = 2;
+  const constexpr int k = 3;
+  int lda = m;
+  int ldb = n;
+  int ldc = m;
+  void *Adev;
+  void *Bdev;
+  void *Cdev;
+  cudaMalloc(&Adev, m * k * sizeof(int8_t));
+  cudaMalloc(&Bdev, n * k * sizeof(int8_t));
+  cudaMalloc(&Cdev, m * n * sizeof(int8_t));
+
+  int8_t Ahost[m * k] = {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17};
+  int8_t Bhost[n * k] = {5, 4, -3, -2, 1, 0};
+
+  cudaMemcpy(Adev, Ahost, m * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+  cudaMemcpy(Bdev, Bhost, n * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+
+  cublasLtMatrixLayout_t Adesc_col_major = NULL, Bdesc_col_major = NULL,
+                         Cdesc_col_major = NULL;
+  cublasLtMatrixLayoutCreate(&Adesc_col_major, CUDA_R_8I, m, k, lda);
+  cublasLtMatrixLayoutCreate(&Bdesc_col_major, CUDA_R_8I, n, k, ldb);
+  cublasLtMatrixLayoutCreate(&Cdesc_col_major, CUDA_R_8I, m, n, ldc);
+
+  // Convert A and B
+  cublasLtMatrixLayout_t Adesc_col32 = NULL, Bdesc_col32_2r_4r4 = NULL,
+                         Cdesc_col32 = NULL;
+  int8_t *A_col32, *B_col32_2r_4r4;
+  int8_t *C_col32;
+  cudaMalloc(&A_col32, m * 32 * sizeof(std::int8_t));
+  cudaMalloc(&B_col32_2r_4r4,
+             ((n + 32 - 1) / 32) * 32 * 32 * sizeof(std::int8_t));
+  cudaMalloc(&C_col32, m * 32 * sizeof(std::int8_t));
+  cublasLtMatrixLayoutCreate(&Adesc_col32, CUDA_R_8I, m, k, m * 32);
+  cublasLtMatrixLayoutCreate(&Bdesc_col32_2r_4r4, CUDA_R_8I, k, n,
+                             ((n + 32 - 1) / 32) * 32 * 32);
+  cublasLtMatrixLayoutCreate(&Cdesc_col32, CUDA_R_8I, m, n, m * 32);
+  cublasLtOrder_t col32 = CUBLASLT_ORDER_COL32;
+  cublasLtOrder_t col32_2r_4r4 = CUBLASLT_ORDER_COL32_2R_4R4;
+  cublasLtMatrixLayoutSetAttribute(Adesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+                                   &col32, sizeof(col32));
+  cublasLtMatrixLayoutSetAttribute(Bdesc_col32_2r_4r4,
+                                   CUBLASLT_MATRIX_LAYOUT_ORDER, &col32_2r_4r4,
+                                   sizeof(col32_2r_4r4));
+  cublasLtMatrixLayoutSetAttribute(Cdesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+                                   &col32, sizeof(col32));
+
+  transform(ltHandle, Adev, lda, Adesc_col_major, A_col32, m * 32, Adesc_col32);
+  transform(ltHandle, Bdev, ldb, Bdesc_col_major, B_col32_2r_4r4, 8 * 32,
+            Bdesc_col32_2r_4r4);
+
+  // Matmul
+  igemmlt<COL_AMPERE, 8, 0>(ltHandle, m, n, k, A_col32, B_col32_2r_4r4,
+                            C_col32, nullptr, m * 32,
+                            ((n + 8 - 1) / 8) * 8 * 32, m * 32);
+
+  // Convert C
+  transform(ltHandle, C_col32, m * 32, Cdesc_col32, Cdev, ldc, Cdesc_col_major);
+  cudaStreamSynchronize(0);
+
+  // Check result
+  int8_t Chost[m * n];
+  cudaMemcpy(Chost, Cdev, m * n * sizeof(int8_t), cudaMemcpyDeviceToHost);
+
+  bool error = false;
+  int8_t C_ref[m * n] = {14, 17, 20, 23, 4, 6, 8, 10};
+  for (int i = 0; i < m * n; i++) {
+    if (Chost[i] != C_ref[i]) {
+      error = true;
+      break;
+    }
+  }
+  printf("c:\n");
+  for (int i = 0; i < m * n; i++)
+    printf("%d, ", Chost[i]);
+  printf("\n");
+
+  if (error) {
+    printf("error\n");
+  } else {
+    printf("success\n");
+  }
+
+  cublasLtDestroy(ltHandle);
+  cublasLtMatrixLayoutDestroy(Adesc_col32);
+  cublasLtMatrixLayoutDestroy(Bdesc_col32_2r_4r4);
+  cublasLtMatrixLayoutDestroy(Cdesc_col32);
+  cublasLtMatrixLayoutDestroy(Adesc_col_major);
+  cublasLtMatrixLayoutDestroy(Bdesc_col_major);
+  cublasLtMatrixLayoutDestroy(Cdesc_col_major);
+  cudaFree(Adev);
+  cudaFree(Bdev);
+  cudaFree(Cdev);
+
+  return !error;
+}
+
+// igemmlt
+bool test6() {
+  cublasLtHandle_t ltHandle;
+  cublasLtCreate(&ltHandle);
+  const constexpr int m = 4;
+  const constexpr int n = 2;
+  const constexpr int k = 3;
+  int lda = m;
+  int ldb = n;
+  int ldc = m;
+  void *Adev;
+  void *Bdev;
+  void *Cdev;
+  cudaMalloc(&Adev, m * k * sizeof(int8_t));
+  cudaMalloc(&Bdev, n * k * sizeof(int8_t));
+  cudaMalloc(&Cdev, m * n * sizeof(int8_t));
+
+  int8_t Ahost[m * k] = {6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17};
+  int8_t Bhost[n * k] = {5, 4, -3, -2, 1, 0};
+
+  cudaMemcpy(Adev, Ahost, m * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+  cudaMemcpy(Bdev, Bhost, n * k * sizeof(int8_t), cudaMemcpyHostToDevice);
+
+  cublasLtMatrixLayout_t Adesc_col_major = NULL, Bdesc_col_major = NULL,
+                         Cdesc_col_major = NULL;
+  cublasLtMatrixLayoutCreate(&Adesc_col_major, CUDA_R_8I, m, k, lda);
+  cublasLtMatrixLayoutCreate(&Bdesc_col_major, CUDA_R_8I, n, k, ldb);
+  cublasLtMatrixLayoutCreate(&Cdesc_col_major, CUDA_R_8I, m, n, ldc);
+
+  // Convert A and B
+  cublasLtMatrixLayout_t Adesc_col32 = NULL, Bdesc_col32_2r_4r4 = NULL,
+                         Cdesc_col32 = NULL;
+  int8_t *A_col32, *B_col32_2r_4r4;
+  int8_t *C_col32;
+  cudaMalloc(&A_col32, m * 32 * sizeof(std::int8_t));
+  cudaMalloc(&B_col32_2r_4r4,
+             ((n + 32 - 1) / 32) * 32 * 32 * sizeof(std::int8_t));
+  cudaMalloc(&C_col32, m * 32 * sizeof(std::int8_t));
+  cublasLtMatrixLayoutCreate(&Adesc_col32, CUDA_R_8I, m, k, m * 32);
+  cublasLtMatrixLayoutCreate(&Bdesc_col32_2r_4r4, CUDA_R_8I, k, n,
+                             ((n + 32 - 1) / 32) * 32 * 32);
+  cublasLtMatrixLayoutCreate(&Cdesc_col32, CUDA_R_8I, m, n, m * 32);
+  cublasLtOrder_t col32 = CUBLASLT_ORDER_COL32;
+  cublasLtOrder_t col32_2r_4r4 = CUBLASLT_ORDER_COL32_2R_4R4;
+  cublasLtMatrixLayoutSetAttribute(Adesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+                                   &col32, sizeof(col32));
+  cublasLtMatrixLayoutSetAttribute(Bdesc_col32_2r_4r4,
+                                   CUBLASLT_MATRIX_LAYOUT_ORDER, &col32_2r_4r4,
+                                   sizeof(col32_2r_4r4));
+  cublasLtMatrixLayoutSetAttribute(Cdesc_col32, CUBLASLT_MATRIX_LAYOUT_ORDER,
+                                   &col32, sizeof(col32));
+
+  transform(ltHandle, Adev, lda, Adesc_col_major, A_col32, m * 32, Adesc_col32);
+  transform(ltHandle, Bdev, ldb, Bdesc_col_major, B_col32_2r_4r4, 8 * 32,
+            Bdesc_col32_2r_4r4);
+
+  float *alpha;
+  cudaMallocManaged(&alpha, 4 * sizeof(float));
+  alpha[0] = 0;
+  alpha[1] = 1;
+  alpha[2] = 2;
+  alpha[3] = 3;
+
+  // Matmul
+  igemmlt<COL_AMPERE, 8, 1>(ltHandle, m, n, k, A_col32, B_col32_2r_4r4,
+                            C_col32, alpha, m * 32,
+                            ((n + 8 - 1) / 8) * 8 * 32, m * 32);
+
+  // Convert C
+  transform(ltHandle, C_col32, m * 32, Cdesc_col32, Cdev, ldc, Cdesc_col_major);
+  cudaStreamSynchronize(0);
+
+  // Check result
+  int8_t Chost[m * n];
+  cudaMemcpy(Chost, Cdev, m * n * sizeof(int8_t), cudaMemcpyDeviceToHost);
+
+  bool error = false;
+  int8_t C_ref[m * n] = {0, 17, 40, 69, 0, 6, 16, 30};
+  for (int i = 0; i < m * n; i++) {
+    if (Chost[i] != C_ref[i]) {
+      error = true;
+      break;
+    }
+  }
+  printf("c:\n");
+  for (int i = 0; i < m * n; i++)
+    printf("%d, ", Chost[i]);
+  printf("\n");
+
+  if (error) {
+    printf("error\n");
+  } else {
+    printf("success\n");
+  }
+
+  cublasLtDestroy(ltHandle);
+  cublasLtMatrixLayoutDestroy(Adesc_col32);
+  cublasLtMatrixLayoutDestroy(Bdesc_col32_2r_4r4);
+  cublasLtMatrixLayoutDestroy(Cdesc_col32);
+  cublasLtMatrixLayoutDestroy(Adesc_col_major);
+  cublasLtMatrixLayoutDestroy(Bdesc_col_major);
+  cublasLtMatrixLayoutDestroy(Cdesc_col_major);
+  cudaFree(Adev);
+  cudaFree(Bdev);
+  cudaFree(Cdev);
+  cudaFree(alpha);
+
+  return !error;
+}
+
+// clang-format off
+// A (4*3)     B (2*3)
+// 6 10 14     5 -3 1
+// 7 11 15     4 -2 0
+// 8 12 16
+// 9 13 17
+//
+// alpha  *  A        *  op(B)  =  alpha  *  C      =  C
+// 0         6 10 14     5  4      0         14  4     0  0
+// 1         7 11 15    -3 -2      1         17  6    17  6
+// 2         8 12 16     1  0      2         20  8    40 16
+// 3         9 13 17               3         23 10    69 30
+//
+// alpha  *  A        *  op(B)  =  alpha  *  C      =  C
+// 1         6 10 14     5  4      1         14  4    14  4
+//           7 11 15    -3 -2                17  6    17  6
+//           8 12 16     1  0                20  8    20  8
+//           9 13 17                         23 10    23 10
+// clang-format on
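+
+// The six variants map to igemmlt template arguments as used above:
+//   test1 igemmlt<COL_TURING, 32, 0>    test4 igemmlt<COL_AMPERE, 32, 0>
+//   test2 igemmlt<COL_TURING, 8, 0>     test5 igemmlt<COL_AMPERE, 8, 0>
+//   test3 igemmlt<COL_TURING, 8, 1>     test6 igemmlt<COL_AMPERE, 8, 1>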
+int main() {
+  bool pass = true;
+  pass = test1() && pass;
+  pass = test2() && pass;
+  pass = test3() && pass;
+  pass = test4() && pass;
+  pass = test5() && pass;
+  pass = test6() && pass;
+  return pass ? 0 : 1;
+}
diff --git a/features/feature_case/cublasLt/transform.cu b/features/feature_case/cublasLt/transform.cu
new file mode 100644
index 000000000..3a1205add
--- /dev/null
+++ b/features/feature_case/cublasLt/transform.cu
@@ -0,0 +1,600 @@
+// ===------------ transform.cu -------------------------- *- CUDA -* ----=== //
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// ===--------------------------------------------------------------------=== //
+
+#include "cublasLt.h"
+
+#include <cstdio>
+#include <cstring>
+
+void transform(cublasLtHandle_t ltHandle, void *in, int ld_in,
+               cublasLtOrder_t order_in, void *out, int ld_out,
+               cublasLtOrder_t order_out, int dim1, int dim2) {
+  cublasLtMatrixLayout_t in_desc = NULL, out_desc = NULL;
+  cublasLtMatrixTransformDesc_t transform_desc = NULL;
+
+  cublasLtMatrixLayoutCreate(&in_desc, CUDA_R_8I, dim1, dim2, ld_in);
+  cublasLtMatrixLayoutCreate(&out_desc, CUDA_R_8I, dim1, dim2, ld_out);
+
+  cublasLtMatrixLayoutSetAttribute(in_desc, CUBLASLT_MATRIX_LAYOUT_ORDER,
+                                   &order_in, sizeof(order_in));
+  cublasLtMatrixLayoutSetAttribute(out_desc, CUBLASLT_MATRIX_LAYOUT_ORDER,
+                                   &order_out, sizeof(order_out));
+
+  cublasLtMatrixTransformDescCreate(&transform_desc, CUDA_R_32F);
+
+  float alpha = 1.0f, beta = 0.0f;
+  cublasLtMatrixTransform(ltHandle, transform_desc, &alpha, in, in_desc, &beta,
+                          NULL, NULL, out, out_desc, 0);
+
+  cublasLtMatrixLayoutDestroy(in_desc);
+  cublasLtMatrixLayoutDestroy(out_desc);
+  cublasLtMatrixTransformDescDestroy(transform_desc);
+}
+
+bool test_ROW() {
+  const constexpr int m = 2;
+  const constexpr int n = 33;
+  const constexpr int in_ld = 4;
+  void *in_dev;
+  cudaMalloc(&in_dev, n * in_ld * sizeof(int8_t));
+
+  int8_t in_host[n * in_ld];
+  int8_t value = 0;
+  for (int i = 0; i < n * in_ld; i++) {
+    if (i % 4 < 2) {
+      in_host[i] = value;
+      value++;
+    } else
+      in_host[i] = 99;
+  }
+  int8_t ref_2nd[n * in_ld];
+  std::memcpy(ref_2nd, in_host, n * in_ld * sizeof(int8_t));
+
+  cudaMemcpy(in_dev, in_host, n * in_ld * sizeof(int8_t),
+             cudaMemcpyHostToDevice);
+
+  cublasLtHandle_t ltHandle;
+  cublasLtCreate(&ltHandle);
+
+  void *out_dev;
+  const constexpr int out_ld = 36;
+  cudaMalloc(&out_dev, out_ld * m * sizeof(int8_t));
+  cudaMemset(out_dev, 0, out_ld * m * sizeof(int8_t));
+  transform(ltHandle, in_dev, in_ld, CUBLASLT_ORDER_COL, out_dev, out_ld,
+            CUBLASLT_ORDER_ROW, m, n);
+
+  int8_t out_host[out_ld * m];
+  cudaMemcpy(out_host, out_dev, out_ld * m * sizeof(int8_t),
+             cudaMemcpyDeviceToHost);
+
+  bool pass_1st = true;
+  int8_t ref_1st[out_ld * m] =
+      {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, 64, 0, 0, 0,
+       1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, 65, 0, 0, 0};
+  for (int i = 0; i < out_ld * m; i++) {
+    if (i % out_ld < n) {
+      if (out_host[i] != ref_1st[i]) {
+        pass_1st = false;
+        break;
+      }
+    }
+  }
+
+  for (int i = 0; i < out_ld * m; i++) {
+    printf("%d, ", out_host[i]);
+  }
+  printf("\n");
+  if (pass_1st) {
+    printf("ROW 1st pass\n");
+  } else {
+    printf("ROW 1st fail\n");
+  }
+
+  cudaMemset(in_dev, 0, n * in_ld * sizeof(int8_t));
+  std::memset(in_host, 0,
n * in_ld * sizeof(int8_t)); + transform(ltHandle, out_dev, out_ld, CUBLASLT_ORDER_ROW, in_dev, in_ld, + CUBLASLT_ORDER_COL, m, n); + cudaMemcpy(in_host, in_dev, n * in_ld * sizeof(int8_t), + cudaMemcpyDeviceToHost); + + bool pass_2nd = true; + for (int i = 0; i < n * in_ld; i++) { + if (i % in_ld < m) { + if (in_host[i] != ref_2nd[i]) { + pass_2nd = false; + break; + } + } + } + + for (int i = 0; i < n * in_ld; i++) { + printf("%d, ", in_host[i]); + } + printf("\n"); + if (pass_2nd) { + printf("ROW 2nd pass\n"); + } else { + printf("ROW 2nd fail\n"); + } + + cublasLtDestroy(ltHandle); + + return pass_1st && pass_2nd; +} + +bool test_COL32() { + const constexpr int m = 2; + const constexpr int n = 33; + const constexpr int in_ld = 4; + void *in_dev; + cudaMalloc(&in_dev, n * in_ld * sizeof(int8_t)); + + int8_t in_host[n * in_ld]; + int8_t value = 0; + for (int i = 0; i < n * in_ld; i++) { + if (i % 4 < 2) { + in_host[i] = value; + value++; + } else + in_host[i] = 99; + } + int8_t ref_2nd[n * in_ld]; + std::memcpy(ref_2nd, in_host, n * in_ld * sizeof(int8_t)); + + cudaMemcpy(in_dev, in_host, n * in_ld * sizeof(int8_t), + cudaMemcpyHostToDevice); + + cublasLtHandle_t ltHandle; + cublasLtCreate(<Handle); + + void *out_dev; + const constexpr int out_ld = 64; + cudaMalloc(&out_dev, out_ld * m * sizeof(int8_t)); + cudaMemset(out_dev, 0, out_ld * m * sizeof(int8_t)); + transform(ltHandle, in_dev, in_ld, CUBLASLT_ORDER_COL, out_dev, out_ld, + CUBLASLT_ORDER_COL32, m, n); + + int8_t out_host[out_ld * m]; + cudaMemcpy(out_host, out_dev, out_ld * m * sizeof(int8_t), + cudaMemcpyDeviceToHost); + + bool pass_1st = true; + int8_t ref_1st[out_ld * m] = + {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, + 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 65, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + for (int i = 0; i < out_ld * m; i++) { + if (i % out_ld < n) { + if (out_host[i] != ref_1st[i]) { + pass_1st = false; + break; + } + } + } + + for (int i = 0; i < out_ld * m; i++) { + printf("%d, ", out_host[i]); + } + printf("\n"); + if (pass_1st) { + printf("COL32 1st pass\n"); + } else { + printf("COL32 1st fail\n"); + } + + cudaMemset(in_dev, 0, n * in_ld * sizeof(int8_t)); + std::memset(in_host, 0, n * in_ld * sizeof(int8_t)); + transform(ltHandle, out_dev, out_ld, CUBLASLT_ORDER_COL32, in_dev, in_ld, + CUBLASLT_ORDER_COL, m, n); + cudaMemcpy(in_host, in_dev, n * in_ld * sizeof(int8_t), + cudaMemcpyDeviceToHost); + + bool pass_2nd = true; + for (int i = 0; i < n * in_ld; i++) { + if (i % in_ld < m) { + if (in_host[i] != ref_2nd[i]) { + pass_2nd = false; + break; + } + } + } + + for (int i = 0; i < n * in_ld; i++) { + printf("%d, ", in_host[i]); + } + printf("\n"); + if (pass_2nd) { + printf("COL32 2nd pass\n"); + } else { + printf("COL32 2nd fail\n"); + } + + cublasLtDestroy(ltHandle); + + return pass_1st && pass_2nd; +} + +bool test_COL4_4R2_8C() { + const constexpr int m = 2; + const constexpr int n = 33; + const constexpr int in_ld = 4; + void *in_dev; + cudaMalloc(&in_dev, n * in_ld * sizeof(int8_t)); + + int8_t in_host[n * in_ld]; + int8_t value = 0; + for (int i = 0; i < n * in_ld; i++) { + if (i % 4 < 2) { + in_host[i] = value; + value++; + } else + in_host[i] = 99; + } + int8_t ref_2nd[n * in_ld]; + 
std::memcpy(ref_2nd, in_host, n * in_ld * sizeof(int8_t)); + + cudaMemcpy(in_dev, in_host, n * in_ld * sizeof(int8_t), + cudaMemcpyHostToDevice); + + cublasLtHandle_t ltHandle; + cublasLtCreate(<Handle); + + void *out_dev; + const constexpr int out_ld = (32 * 8) * 2; + cudaMalloc(&out_dev, out_ld * m * sizeof(int8_t)); + cudaMemset(out_dev, 0, out_ld * m * sizeof(int8_t)); + transform(ltHandle, in_dev, in_ld, CUBLASLT_ORDER_COL, out_dev, out_ld, + CUBLASLT_ORDER_COL4_4R2_8C, m, n); + + int8_t out_host[out_ld * m]; + cudaMemcpy(out_host, out_dev, out_ld * m * sizeof(int8_t), + cudaMemcpyDeviceToHost); + + bool pass_1st = true; + int8_t ref_1st[out_ld * m] = + {0, 2, 4, 6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 8, 10, 12, 14, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 16, 18, 20, 22, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 24, 26, 28, 30, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 32, 34, 36, 38, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 40, 42, 44, 46, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 48, 50, 52, 54, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 56, 58, 60, 62, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 1, 3, 5, 7, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 9, 11, 13, 15, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 17, 19, 21, 23, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 25, 27, 29, 31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 33, 35, 37, 39, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 41, 43, 45, 47, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 49, 51, 53, 55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 57, 59, 61, 63, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 65, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + for (int i = 0; i < out_ld * m; i++) { + if (i % out_ld < n) { + if (out_host[i] != ref_1st[i]) { + pass_1st = false; + break; + } + } + } + + for (int i = 0; i < out_ld * m; i++) { + printf("%d, ", out_host[i]); + } + printf("\n"); + if (pass_1st) { + printf("COL4_4R2_8C 1st pass\n"); + } else { + printf("COL4_4R2_8C 1st fail\n"); + } + + cudaMemset(in_dev, 0, n * in_ld * sizeof(int8_t)); + std::memset(in_host, 0, n * in_ld * sizeof(int8_t)); + transform(ltHandle, out_dev, out_ld, CUBLASLT_ORDER_COL4_4R2_8C, in_dev, + in_ld, CUBLASLT_ORDER_COL, m, n); + cudaMemcpy(in_host, in_dev, n * in_ld * sizeof(int8_t), + cudaMemcpyDeviceToHost); + + bool pass_2nd = true; + for (int i = 0; i < n * in_ld; i++) { + if (i % in_ld < m) { + if (in_host[i] != ref_2nd[i]) { + pass_2nd = false; + break; + } + } + } + + for (int i = 0; i < n * in_ld; i++) { + printf("%d, ", in_host[i]); + } + printf("\n"); + if (pass_2nd) { + printf("COL4_4R2_8C 2nd pass\n"); + } else { + printf("COL4_4R2_8C 2nd fail\n"); + } + + cublasLtDestroy(ltHandle); + + return pass_1st && pass_2nd; +} + +bool test_COL32_2R_4R4() { + const constexpr int m = 2; + const constexpr int n = 33; + const constexpr int in_ld = 4; + void *in_dev; + cudaMalloc(&in_dev, n * in_ld * sizeof(int8_t)); + + int8_t in_host[n * in_ld]; + int8_t value = 0; + for (int i = 0; i < n * in_ld; i++) { + if (i % 4 < 2) { + in_host[i] = value; + value++; + } else + in_host[i] = 99; + } + int8_t ref_2nd[n * in_ld]; + std::memcpy(ref_2nd, in_host, n * in_ld * sizeof(int8_t)); + + cudaMemcpy(in_dev, in_host, n * in_ld * sizeof(int8_t), + cudaMemcpyHostToDevice); + + cublasLtHandle_t ltHandle; + cublasLtCreate(<Handle); + + void *out_dev; + const constexpr int out_ld = (32 * 32) * 2; + cudaMalloc(&out_dev, out_ld * m * sizeof(int8_t)); + cudaMemset(out_dev, 0, out_ld * m * sizeof(int8_t)); + transform(ltHandle, in_dev, in_ld, CUBLASLT_ORDER_COL, out_dev, out_ld, + CUBLASLT_ORDER_COL32_2R_4R4, m, n); + + int8_t out_host[out_ld * m]; + cudaMemcpy(out_host, out_dev, out_ld * m * sizeof(int8_t), + cudaMemcpyDeviceToHost); + + bool pass_1st = true; + int8_t ref_1st[out_ld * m] = + {0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62, + 1, 3, 5, 7, 9, 11, 13, 15, 17, 19, 21, 23, 25, 27, 29, 31, 33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 64, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 65, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 
0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + for (int i = 0; i < out_ld * m; i++) { + if (i % out_ld < n) { + if (out_host[i] != ref_1st[i]) { + pass_1st = false; + break; + } + } + } + + for (int i = 0; i < out_ld * m; i++) { + printf("%d, ", out_host[i]); + } + printf("\n"); + if (pass_1st) { + printf("COL32_2R_4R4 1st pass\n"); + } else { + printf("COL32_2R_4R4 1st fail\n"); + } + + cudaMemset(in_dev, 0, n * in_ld * sizeof(int8_t)); + std::memset(in_host, 0, n * in_ld * sizeof(int8_t)); + transform(ltHandle, out_dev, out_ld, CUBLASLT_ORDER_COL32_2R_4R4, in_dev, + in_ld, CUBLASLT_ORDER_COL, m, n); + cudaMemcpy(in_host, in_dev, n * in_ld * sizeof(int8_t), + cudaMemcpyDeviceToHost); + + bool pass_2nd = true; + for (int i = 0; i < n * in_ld; i++) { + if (i % in_ld < m) { + if (in_host[i] != ref_2nd[i]) { + pass_2nd = false; + break; + } + } + } + + for (int i = 0; i < n * in_ld; i++) { + printf("%d, ", in_host[i]); + } + printf("\n"); + if (pass_2nd) { + printf("COL32_2R_4R4 2nd pass\n"); + } else { + printf("COL32_2R_4R4 2nd fail\n"); + } + + cublasLtDestroy(ltHandle); + + return pass_1st && pass_2nd; +} + +// Input col_major matrix: +// 2 rows * 33 columns, ld is 4 +int main() { + bool pass = true; + pass = test_ROW() && pass; + pass = test_COL32() && pass; + pass = test_COL4_4R2_8C() && pass; + pass = test_COL32_2R_4R4() && pass; + return pass ? 0 : 1; +} diff --git a/features/feature_case/cusparse/cusparse_10.cu b/features/feature_case/cusparse/cusparse_10.cu new file mode 100644 index 000000000..10b5abd61 --- /dev/null +++ b/features/feature_case/cusparse/cusparse_10.cu @@ -0,0 +1,266 @@ +// ===------- cusparse_10.cu ------------------------------- *- CUDA -* ----===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// ===----------------------------------------------------------------------===//
+
+#include "cusparse.h"
+
+#include <cmath>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <vector>
+
+template <class d_data_t>
+struct Data {
+  float *h_data;
+  d_data_t *d_data;
+  int element_num;
+  Data(int element_num) : element_num(element_num) {
+    h_data = (float *)malloc(sizeof(float) * element_num);
+    memset(h_data, 0, sizeof(float) * element_num);
+    cudaMalloc(&d_data, sizeof(d_data_t) * element_num);
+    cudaMemset(d_data, 0, sizeof(d_data_t) * element_num);
+  }
+  Data(float *input_data, int element_num) : element_num(element_num) {
+    h_data = (float *)malloc(sizeof(float) * element_num);
+    cudaMalloc(&d_data, sizeof(d_data_t) * element_num);
+    cudaMemset(d_data, 0, sizeof(d_data_t) * element_num);
+    memcpy(h_data, input_data, sizeof(float) * element_num);
+  }
+  ~Data() {
+    free(h_data);
+    cudaFree(d_data);
+  }
+  void H2D() {
+    d_data_t *h_temp = (d_data_t *)malloc(sizeof(d_data_t) * element_num);
+    memset(h_temp, 0, sizeof(d_data_t) * element_num);
+    from_float_convert(h_data, h_temp);
+    cudaMemcpy(d_data, h_temp, sizeof(d_data_t) * element_num,
+               cudaMemcpyHostToDevice);
+    free(h_temp);
+  }
+  void D2H() {
+    d_data_t *h_temp = (d_data_t *)malloc(sizeof(d_data_t) * element_num);
+    memset(h_temp, 0, sizeof(d_data_t) * element_num);
+    cudaMemcpy(h_temp, d_data, sizeof(d_data_t) * element_num,
+               cudaMemcpyDeviceToHost);
+    to_float_convert(h_temp, h_data);
+    free(h_temp);
+  }
+
+private:
+  inline void from_float_convert(float *in, d_data_t *out) {
+    for (int i = 0; i < element_num; i++)
+      out[i] = in[i];
+  }
+  inline void to_float_convert(d_data_t *in, float *out) {
+    for (int i = 0; i < element_num; i++)
+      out[i] = in[i];
+  }
+};
+template <>
+inline void Data<float2>::from_float_convert(float *in, float2 *out) {
+  for (int i = 0; i < element_num; i++)
+    out[i].x = in[i];
+}
+template <>
+inline void Data<double2>::from_float_convert(float *in, double2 *out) {
+  for (int i = 0; i < element_num; i++)
+    out[i].x = in[i];
+}
+
+template <>
+inline void Data<float2>::to_float_convert(float2 *in, float *out) {
+  for (int i = 0; i < element_num; i++)
+    out[i] = in[i].x;
+}
+template <>
+inline void Data<double2>::to_float_convert(double2 *in, float *out) {
+  for (int i = 0; i < element_num; i++)
+    out[i] = in[i].x;
+}
+
+bool compare_result(float *expect, float *result, int element_num) {
+  for (int i = 0; i < element_num; i++) {
+    if (std::abs(result[i] - expect[i]) >= 0.05) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool compare_result(float *expect, float *result, std::vector<int> indices) {
+  for (int i = 0; i < indices.size(); i++) {
+    if (std::abs(result[indices[i]] - expect[indices[i]]) >= 0.05) {
+      return false;
+    }
+  }
+  return true;
+}
+
+bool test_passed = true;
+
+// A
+// 1 4 0 0 0
+// 0 2 3 0 0
+// 5 0 7 8 0
+// 0 0 9 0 6
+void test_cusparseSpMM_COO() {
+  std::vector<float> a_val_vec = {1, 4, 2, 3, 5, 7, 8, 9, 6};
+  Data<float> a_s_val(a_val_vec.data(), 9);
+  Data<double> a_d_val(a_val_vec.data(), 9);
+  Data<float2> a_c_val(a_val_vec.data(), 9);
+  Data<double2> a_z_val(a_val_vec.data(), 9);
+  std::vector<float> a_row_ptr_vec = {1, 1, 2, 2, 3, 3, 3, 4, 4};
+  Data<int> a_row_ptr(a_row_ptr_vec.data(), 9);
+  std::vector<float> a_col_ind_vec = {1, 2, 2, 3, 1, 4, 5, 3, 5};
+  Data<int> a_col_ind(a_col_ind_vec.data(), 9);
+
+  std::vector<float> b_vec = {1, 2, 3, 4, 5, 6, 7, 8, 9, 10};
+  Data<float> b_s(b_vec.data(), 10);
+  Data<double> b_d(b_vec.data(), 10);
+  Data<float2> b_c(b_vec.data(), 10);
+  Data<double2> b_z(b_vec.data(), 10);
+
+  Data<float> c_s(8);
+  Data<double> c_d(8);
+  Data<float2> c_c(8);
+  Data<double2> c_z(8);
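+
+  // The handle is switched to CUSPARSE_POINTER_MODE_DEVICE below, so alpha
+  // and beta must live in device memory; they are wrapped in Data objects
+  // and copied over with H2D() like the matrix values.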
+  float alpha = 10;
+  Data<float> alpha_s(&alpha, 1);
+  Data<double> alpha_d(&alpha, 1);
+  Data<float2> alpha_c(&alpha, 1);
+  Data<double2> alpha_z(&alpha, 1);
+
+  float beta = 0;
+  Data<float> beta_s(&beta, 1);
+  Data<double> beta_d(&beta, 1);
+  Data<float2> beta_c(&beta, 1);
+  Data<double2> beta_z(&beta, 1);
+
+  cusparseHandle_t handle;
+  cusparseCreate(&handle);
+
+  cusparseSetPointerMode(handle, CUSPARSE_POINTER_MODE_DEVICE);
+
+  a_s_val.H2D();
+  a_d_val.H2D();
+  a_c_val.H2D();
+  a_z_val.H2D();
+  a_row_ptr.H2D();
+  a_col_ind.H2D();
+  b_s.H2D();
+  b_d.H2D();
+  b_c.H2D();
+  b_z.H2D();
+  alpha_s.H2D();
+  alpha_d.H2D();
+  alpha_c.H2D();
+  alpha_z.H2D();
+  beta_s.H2D();
+  beta_d.H2D();
+  beta_c.H2D();
+  beta_z.H2D();
+
+  cusparseSpMatDescr_t a_descr_s;
+  cusparseSpMatDescr_t a_descr_d;
+  cusparseSpMatDescr_t a_descr_c;
+  cusparseSpMatDescr_t a_descr_z;
+  cusparseCreateCoo(&a_descr_s, 4, 5, 9, a_row_ptr.d_data, a_col_ind.d_data, a_s_val.d_data, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ONE, CUDA_R_32F);
+  cusparseCreateCoo(&a_descr_d, 4, 5, 9, a_row_ptr.d_data, a_col_ind.d_data, a_d_val.d_data, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ONE, CUDA_R_64F);
+  cusparseCreateCoo(&a_descr_c, 4, 5, 9, a_row_ptr.d_data, a_col_ind.d_data, a_c_val.d_data, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ONE, CUDA_C_32F);
+  cusparseCreateCoo(&a_descr_z, 4, 5, 9, a_row_ptr.d_data, a_col_ind.d_data, a_z_val.d_data, CUSPARSE_INDEX_32I, CUSPARSE_INDEX_BASE_ONE, CUDA_C_64F);
+
+  cusparseDnMatDescr_t b_descr_s;
+  cusparseDnMatDescr_t b_descr_d;
+  cusparseDnMatDescr_t b_descr_c;
+  cusparseDnMatDescr_t b_descr_z;
+  cusparseCreateDnMat(&b_descr_s, 5, 2, 5, b_s.d_data, CUDA_R_32F, CUSPARSE_ORDER_COL);
+  cusparseCreateDnMat(&b_descr_d, 5, 2, 5, b_d.d_data, CUDA_R_64F, CUSPARSE_ORDER_COL);
+  cusparseCreateDnMat(&b_descr_c, 5, 2, 5, b_c.d_data, CUDA_C_32F, CUSPARSE_ORDER_COL);
+  cusparseCreateDnMat(&b_descr_z, 5, 2, 5, b_z.d_data, CUDA_C_64F, CUSPARSE_ORDER_COL);
+
+  cusparseDnMatDescr_t c_descr_s;
+  cusparseDnMatDescr_t c_descr_d;
+  cusparseDnMatDescr_t c_descr_c;
+  cusparseDnMatDescr_t c_descr_z;
+  cusparseCreateDnMat(&c_descr_s, 4, 2, 4, c_s.d_data, CUDA_R_32F, CUSPARSE_ORDER_COL);
+  cusparseCreateDnMat(&c_descr_d, 4, 2, 4, c_d.d_data, CUDA_R_64F, CUSPARSE_ORDER_COL);
+  cusparseCreateDnMat(&c_descr_c, 4, 2, 4, c_c.d_data, CUDA_C_32F, CUSPARSE_ORDER_COL);
+  cusparseCreateDnMat(&c_descr_z, 4, 2, 4, c_z.d_data, CUDA_C_64F, CUSPARSE_ORDER_COL);
+
+  size_t ws_size_s;
+  size_t ws_size_d;
+  size_t ws_size_c;
+  size_t ws_size_z;
+  cusparseSpMM_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_s.d_data, a_descr_s, b_descr_s, beta_s.d_data, c_descr_s, CUDA_R_32F, CUSPARSE_SPMM_ALG_DEFAULT, &ws_size_s);
+  cusparseSpMM_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_d.d_data, a_descr_d, b_descr_d, beta_d.d_data, c_descr_d, CUDA_R_64F, CUSPARSE_SPMM_ALG_DEFAULT, &ws_size_d);
+  cusparseSpMM_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_c.d_data, a_descr_c, b_descr_c, beta_c.d_data, c_descr_c, CUDA_C_32F, CUSPARSE_SPMM_ALG_DEFAULT, &ws_size_c);
+  cusparseSpMM_bufferSize(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_z.d_data, a_descr_z, b_descr_z, beta_z.d_data, c_descr_z, CUDA_C_64F, CUSPARSE_SPMM_ALG_DEFAULT, &ws_size_z);
+
+  void *ws_s = nullptr;
+  void *ws_d = nullptr;
+  void *ws_c = nullptr;
+  void *ws_z = nullptr;
+  cudaMalloc(&ws_s, ws_size_s);
+  cudaMalloc(&ws_d, ws_size_d);
+  cudaMalloc(&ws_c, ws_size_c);
+  cudaMalloc(&ws_z, ws_size_z);
+
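+  // cusparseSpMM_preprocess (available since CUDA 11.2) is an optional step:
+  // it lets the library analyze the operands once so that subsequent
+  // cusparseSpMM calls with the same arguments and workspace can run faster.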
+  cusparseSpMM_preprocess(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_s.d_data, a_descr_s, b_descr_s, beta_s.d_data, c_descr_s, CUDA_R_32F, CUSPARSE_SPMM_ALG_DEFAULT, ws_s);
+  cusparseSpMM_preprocess(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_d.d_data, a_descr_d, b_descr_d, beta_d.d_data, c_descr_d, CUDA_R_64F, CUSPARSE_SPMM_ALG_DEFAULT, ws_d);
+  cusparseSpMM_preprocess(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_c.d_data, a_descr_c, b_descr_c, beta_c.d_data, c_descr_c, CUDA_C_32F, CUSPARSE_SPMM_ALG_DEFAULT, ws_c);
+  cusparseSpMM_preprocess(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_z.d_data, a_descr_z, b_descr_z, beta_z.d_data, c_descr_z, CUDA_C_64F, CUSPARSE_SPMM_ALG_DEFAULT, ws_z);
+  cusparseSpMM(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_s.d_data, a_descr_s, b_descr_s, beta_s.d_data, c_descr_s, CUDA_R_32F, CUSPARSE_SPMM_ALG_DEFAULT, ws_s);
+  cusparseSpMM(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_d.d_data, a_descr_d, b_descr_d, beta_d.d_data, c_descr_d, CUDA_R_64F, CUSPARSE_SPMM_ALG_DEFAULT, ws_d);
+  cusparseSpMM(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_c.d_data, a_descr_c, b_descr_c, beta_c.d_data, c_descr_c, CUDA_C_32F, CUSPARSE_SPMM_ALG_DEFAULT, ws_c);
+  cusparseSpMM(handle, CUSPARSE_OPERATION_NON_TRANSPOSE, CUSPARSE_OPERATION_NON_TRANSPOSE, alpha_z.d_data, a_descr_z, b_descr_z, beta_z.d_data, c_descr_z, CUDA_C_64F, CUSPARSE_SPMM_ALG_DEFAULT, ws_z);
+
+  c_s.D2H();
+  c_d.D2H();
+  c_c.D2H();
+  c_z.D2H();
+
+  cudaStreamSynchronize(0);
+
+  cudaFree(ws_s);
+  cudaFree(ws_d);
+  cudaFree(ws_c);
+  cudaFree(ws_z);
+  cusparseDestroySpMat(a_descr_s);
+  cusparseDestroySpMat(a_descr_d);
+  cusparseDestroySpMat(a_descr_c);
+  cusparseDestroySpMat(a_descr_z);
+  cusparseDestroyDnMat(b_descr_s);
+  cusparseDestroyDnMat(b_descr_d);
+  cusparseDestroyDnMat(b_descr_c);
+  cusparseDestroyDnMat(b_descr_z);
+  cusparseDestroyDnMat(c_descr_s);
+  cusparseDestroyDnMat(c_descr_d);
+  cusparseDestroyDnMat(c_descr_c);
+  cusparseDestroyDnMat(c_descr_z);
+  cusparseDestroy(handle);
+
+  float expect_c[8] = {90, 130, 730, 570, 340, 380, 1730, 1320};
+  if (compare_result(expect_c, c_s.h_data, 8) &&
+      compare_result(expect_c, c_d.h_data, 8) &&
+      compare_result(expect_c, c_c.h_data, 8) &&
+      compare_result(expect_c, c_z.h_data, 8))
+    printf("SpMM_COO pass\n");
+  else {
+    printf("SpMM_COO fail\n");
+    test_passed = false;
+  }
+}
+
+int main() {
+  test_cusparseSpMM_COO();
+
+  if (test_passed)
+    return 0;
+  return -1;
+}
diff --git a/features/feature_case/device_info/device_info.cu b/features/feature_case/device_info/device_info.cu
index d8e5abaf7..cd5a6f8fc 100644
--- a/features/feature_case/device_info/device_info.cu
+++ b/features/feature_case/device_info/device_info.cu
@@ -28,6 +28,7 @@ void test1() {
   auto maxTexture1D = properties.maxTexture1D;
   auto maxTexture2D = properties.maxTexture2D;
   auto maxTexture3D = properties.maxTexture3D;
+  auto regsPerBlock = properties.regsPerBlock;
 
   std::cout << "Device ID: " << id << std::endl;
   std::cout << "Device UUID: ";
@@ -43,6 +44,7 @@ void test1() {
   std::cout << "Device maxTexture3D: " << maxTexture3D[0] << " "
             << properties.maxTexture3D[1] << " " << maxTexture3D[2]
             << std::endl;
+  std::cout << "Device regsPerBlock: " << regsPerBlock << std::endl;
 }
 
 int main() {
diff --git a/features/feature_case/math/math-int.cu b/features/feature_case/math/math-int.cu
new file mode 100644
index 000000000..5c09e092b
--- /dev/null
+++ b/features/feature_case/math/math-int.cu
@@ -0,0 +1,196 @@
+// ===--------------- math-int.cu ---------- *- CUDA -* -------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//
+// ===---------------------------------------------------------------------===//
+
+#include <iostream>
+#include <vector>
+
+using namespace std;
+
+int passed = 0;
+int failed = 0;
+
+void check(bool IsPassed) {
+  if (IsPassed) {
+    cout << " ---- passed" << endl;
+    passed++;
+  } else {
+    cout << " ---- failed" << endl;
+    failed++;
+  }
+}
+
+template <class T> void printFunc(const string &FuncName, const T &Input) {
+  cout << FuncName << "(" << Input << ") ";
+}
+
+template <class T1, class T2>
+void printFunc(const string &FuncName, const pair<T1, T2> &Input) {
+  cout << FuncName << "(" << Input.first << ", " << Input.second << ")";
+}
+
+template <class T> void checkResult(const T &Expect, const T &Result) {
+  cout << " = " << Result << " (expect " << Expect << ")";
+  check(Result == Expect);
+}
+
+// Integer Mathematical Functions
+
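+// Test pattern used below: a one-thread kernel evaluates a device-side
+// integer intrinsic (llmax/llmin, ullmax/ullmin, umax/umin) and stores the
+// result in managed memory; after cudaDeviceSynchronize() the host compares
+// it against the expected value.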
+__global__ void _llmax(long long *const Result, long long Input1,
+                       long long Input2) {
+  *Result = llmax(Input1, Input2);
+}
+
+void testLlmaxCases(
+    const vector<pair<pair<long long, long long>, long long>> &TestCases) {
+  long long *Result;
+  cudaMallocManaged(&Result, sizeof(*Result));
+  for (const auto &TestCase : TestCases) {
+    _llmax<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize();
+    printFunc("llmax", TestCase.first);
+    checkResult(TestCase.second, *Result);
+  }
+}
+
+__global__ void _llmin(long long *const Result, long long Input1,
+                       long long Input2) {
+  *Result = llmin(Input1, Input2);
+}
+
+void testLlminCases(
+    const vector<pair<pair<long long, long long>, long long>> &TestCases) {
+  long long *Result;
+  cudaMallocManaged(&Result, sizeof(*Result));
+  for (const auto &TestCase : TestCases) {
+    _llmin<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize();
+    printFunc("llmin", TestCase.first);
+    checkResult(TestCase.second, *Result);
+  }
+}
+
+__global__ void _ullmax(unsigned long long *const Result,
+                        unsigned long long Input1, unsigned long long Input2) {
+  *Result = ullmax(Input1, Input2);
+}
+
+void testUllmaxCases(
+    const vector<pair<pair<unsigned long long, unsigned long long>,
+                      unsigned long long>> &TestCases) {
+  unsigned long long *Result;
+  cudaMallocManaged(&Result, sizeof(*Result));
+  for (const auto &TestCase : TestCases) {
+    _ullmax<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize();
+    printFunc("ullmax", TestCase.first);
+    checkResult(TestCase.second, *Result);
+  }
+}
+
+__global__ void _ullmin(unsigned long long *const Result,
+                        unsigned long long Input1, unsigned long long Input2) {
+  *Result = ullmin(Input1, Input2);
+}
+
+void testUllminCases(
+    const vector<pair<pair<unsigned long long, unsigned long long>,
+                      unsigned long long>> &TestCases) {
+  unsigned long long *Result;
+  cudaMallocManaged(&Result, sizeof(*Result));
+  for (const auto &TestCase : TestCases) {
+    _ullmin<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize();
+    printFunc("ullmin", TestCase.first);
+    checkResult(TestCase.second, *Result);
+  }
+}
+
+__global__ void _umax(unsigned *const Result, unsigned Input1,
+                      unsigned Input2) {
+  *Result = umax(Input1, Input2);
+}
+
+void testUmaxCases(
+    const vector<pair<pair<unsigned, unsigned>, unsigned>> &TestCases) {
+  unsigned *Result;
+  cudaMallocManaged(&Result, sizeof(*Result));
+  for (const auto &TestCase : TestCases) {
+    _umax<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize();
+    printFunc("umax", TestCase.first);
+    checkResult(TestCase.second, *Result);
+  }
+}
+
+__global__ void _umin(unsigned *const Result, unsigned Input1,
+                      unsigned Input2) {
+  *Result = umin(Input1, Input2);
+}
+
+void testUminCases(
+    const vector<pair<pair<unsigned, unsigned>, unsigned>> &TestCases) {
+  unsigned *Result;
+  cudaMallocManaged(&Result, sizeof(*Result));
+  for (const auto &TestCase : TestCases) {
+    _umin<<<1, 1>>>(Result, TestCase.first.first, TestCase.first.second);
+    cudaDeviceSynchronize();
+    printFunc("umin", TestCase.first);
+    checkResult(TestCase.second, *Result);
+  }
+}
+
+int main() {
+  testLlmaxCases({
+      {{1, 2}, 2},
+      {{-1, -2}, -1},
+      {{1, -2}, 1},
+      {{-1, 2}, 2},
+      {{45212221678, 221332142421}, 221332142421},
+  });
+  testLlminCases({
+      {{1, 2}, 1},
+      {{-1, -2}, -2},
+      {{1, -2}, -2},
+      {{-1, 2}, -1},
+      {{45212221678, 221332142421}, 45212221678},
+  });
+  testUllmaxCases({
+      {{1, 2}, 2},
+      {{18446744073709551615, 18446744073709551614}, 18446744073709551615},
+      {{1, 18446744073709551614}, 18446744073709551614},
+      {{18446744073709551615, 2}, 18446744073709551615},
+      {{45212221678, 221332142421}, 221332142421},
+  });
+  testUllminCases({
+      {{1, 2}, 1},
+      {{18446744073709551615, 18446744073709551614}, 18446744073709551614},
+      {{1, 18446744073709551614}, 1},
+      {{18446744073709551615, 2}, 2},
+      {{45212221678, 221332142421}, 45212221678},
+  });
+  testUmaxCases({
+      {{1, 2}, 2},
+      {{4294967295, 4294967294}, 4294967295},
+      {{1, 4294967294}, 4294967294},
+      {{4294967295, 2}, 4294967295},
+      {{2262548718, 2288810325}, 2288810325},
+  });
+  testUminCases({
+      {{1, 2}, 1},
+      {{4294967295, 4294967294}, 4294967294},
+      {{1, 4294967294}, 1},
+      {{4294967295, 2}, 2},
+      {{2262548718, 2288810325}, 2262548718},
+  });
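+  // The failure count doubles as the process exit code below, so any single
+  // failing case marks the whole binary as failed for the test harness.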
+  cout << "passed " << passed << "/" << passed + failed << " cases!" << endl;
+  if (failed) {
+    cout << "failed!" << endl;
+  }
+  return failed;
+}
diff --git a/features/features.xml b/features/features.xml
index 971570894..ff7838f2e 100644
--- a/features/features.xml
+++ b/features/features.xml
@@ -147,6 +147,7 @@
+
@@ -282,6 +283,7 @@
+
@@ -318,10 +320,10 @@
-
-
-
-
+
+
+
+
@@ -340,5 +342,7 @@
+
+
diff --git a/features/test_feature.py b/features/test_feature.py
index 2ef527bf3..bbd6e08c9 100644
--- a/features/test_feature.py
+++ b/features/test_feature.py
@@ -36,10 +36,10 @@
     'math-ext-bf16-conv', 'math-ext-double', 'math-ext-float', 'math-ext-half', 'math-ext-half-after11', 'math-ext-half-conv', 'math-ext-half2', 'math-ext-half2-after11', 'math-ext-simd', 'cudnn-activation', 'cudnn-fill',
     'cudnn-lrn', 'cudnn-memory', 'cudnn-pooling', 'cudnn-reorder', 'cudnn-scale', 'cudnn-softmax', 'cudnn-sum', 'math-funnelshift',
     'thrust-sort_by_key', 'thrust-find', 'thrust-inner_product', 'thrust-reduce_by_key',
-    'math-bf16-conv', 'math-emu-bf16-conv-double', 'math-ext-bf16-conv-double', 'math-half-conv',
+    'math-bf16-conv', 'math-emu-bf16-conv-double', 'math-ext-bf16-conv-double', 'math-half-conv', 'math-int',
     'math-bfloat16', 'libcu_atomic', 'test_shared_memory', 'cudnn-reduction', 'cudnn-binary', 'cudnn-bnp1', 'cudnn-bnp2', 'cudnn-bnp3',
     'cudnn-normp1', 'cudnn-normp2', 'cudnn-normp3', 'cudnn-convp1', 'cudnn-convp2', 'cudnn-convp3', 'cudnn-convp4', 'cudnn-convp5',
     'cudnn-convp6', 'cudnn-convp7',
-    'cudnn_mutilple_files', "cusparse_1", "cusparse_2", "cusparse_3", "cusparse_4", "cusparse_5", "cusparse_6", "cusparse_7", "cusparse_8", "cusparse_9",
+    'cudnn_mutilple_files', "cusparse_1", "cusparse_2", "cusparse_3", "cusparse_4", "cusparse_5", "cusparse_6", "cusparse_7", "cusparse_8", "cusparse_9", "cusparse_10",
     'cudnn-GetErrorString', 'cub_device_histgram', 'peer_access', 'driver_err_handle', 'cudnn-types', 'cudnn-version', 'cudnn-dropout',
     'const_opt', 'constant_attr', 'sync_warp_p2', 'occupancy_calculation',
@@ -60,7 +60,7 @@
     'thrust_swap_ranges', 'thrust_uninitialized_fill_n', 'thrust_equal', 'system_atomic', 'thrust_detail_types',
     'operator_eq', 'operator_neq', 'operator_lege', 'thrust_system', 'thrust_reverse_copy',
     'thrust_device_new_delete', 'thrust_temporary_buffer', 'thrust_malloc_free', 'codepin', 'thrust_unique_count',
-    'thrust_advance_trans_op_itr', 'cuda_stream_query']
+    'thrust_advance_trans_op_itr', 'cuda_stream_query', "matmul", "transform"]
 
 occupancy_calculation_exper = ['occupancy_calculation']
@@ -166,7 +166,7 @@ def build_test():
                 'cudnn-binary', 'cudnn-bnp1', 'cudnn-bnp2', 'cudnn-bnp3', 'cudnn-normp1', 'cudnn-normp2', 'cudnn-normp3',
                 'cudnn-convp1', 'cudnn-convp2', 'cudnn-convp3', 'cudnn-convp4', 'cudnn-convp5', 'cudnn-convp6',
                 'cudnn-rnn', 'cudnn-GetErrorString', 'cudnn-convp7',
-                'cudnn-types', 'cudnn-version', 'cudnn-dropout'
+                'cudnn-types', 'cudnn-version', 'cudnn-dropout', 'matmul'
                 ]
 
 no_fast_math_tests = ['math-emu-half-after11', 'math-emu-half2-after11', 'math-ext-half-after11', 'math-ext-half2-after11',
diff --git a/help_function/help_function.xml b/help_function/help_function.xml
index d35b308c9..fc047e53e 100644
--- a/help_function/help_function.xml
+++ b/help_function/help_function.xml
@@ -87,7 +87,7 @@
-
+
@@ -215,5 +215,6 @@
+
diff --git a/help_function/src/blas_gemm_utils_interface.cpp b/help_function/src/blas_gemm_utils_interface.cpp
new file mode 100644
index 000000000..551021b31
--- /dev/null
+++ b/help_function/src/blas_gemm_utils_interface.cpp
@@ -0,0 +1,152 @@
+// ===------ blas_gemm_utils_interface.cpp ----------------- *- C++ -* ----===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// ===--------------------------------------------------------------------=== //
+
+#include <sycl/sycl.hpp>
+#include <dpct/dpct.hpp>
+#include <dpct/blas_gemm_utils.hpp>
+#include <cstdint>
+
+void foo1() {
+  dpct::blas_gemm::experimental::descriptor_ptr ltHandle;
+  ltHandle = new dpct::blas_gemm::experimental::descriptor();
+  delete (ltHandle);
+
+  dpct::blas_gemm::experimental::matrix_layout_ptr matLayout;
+  dpct::library_data_t type;
+  uint64_t rows;
+  uint64_t cols;
+  int64_t ld;
+  matLayout =
+      new dpct::blas_gemm::experimental::matrix_layout_t(type, rows, cols, ld);
+
+  dpct::blas_gemm::experimental::matrix_layout_t::attribute attr1;
+  void *buf1;
+  size_t sizeInBytes1;
+  size_t *sizeWritten1;
+  matLayout->get_attribute(attr1, buf1);
+  matLayout->set_attribute(attr1, buf1);
+  delete (matLayout);
+
+  dpct::blas_gemm::experimental::matmul_desc_ptr matmulDesc;
+  dpct::compute_type computeType;
+  dpct::library_data_t scaleType;
+  matmulDesc =
+      new dpct::blas_gemm::experimental::matmul_desc_t(computeType, scaleType);
+
+  dpct::blas_gemm::experimental::matmul_desc_t::attribute attr2;
+  void *buf2;
+  size_t sizeInBytes2;
+  size_t *sizeWritten2;
+  matmulDesc->get_attribute(attr2, buf2);
+  matmulDesc->set_attribute(attr2, buf2);
+  delete (matmulDesc);
+
+  int matmulPreference;
+  void *buf3;
+  size_t sizeInBytes3;
+  size_t *sizeWritten3;
+
+  dpct::blas_gemm::experimental::matrix_layout_ptr Adesc;
+  dpct::blas_gemm::experimental::matrix_layout_ptr Bdesc;
+  dpct::blas_gemm::experimental::matrix_layout_ptr Cdesc;
+  dpct::blas_gemm::experimental::matrix_layout_ptr Ddesc;
+
+  int requestedAlgoCount = 1;
+  int heuristicResultsArray;
+  int returnAlgoCount;
+  returnAlgoCount = 1;
+}
+
+void foo2() {
+  dpct::blas_gemm::experimental::descriptor_ptr lightHandle;
+  dpct::blas_gemm::experimental::matmul_desc_ptr computeDesc;
+  const void *alpha;
+  const void *A;
+  dpct::blas_gemm::experimental::matrix_layout_ptr Adesc;
+  const void *B;
+  dpct::blas_gemm::experimental::matrix_layout_ptr Bdesc;
+  const void *beta;
+  const void *C;
+  dpct::blas_gemm::experimental::matrix_layout_ptr Cdesc;
+  void *D;
+  dpct::blas_gemm::experimental::matrix_layout_ptr Ddesc;
+  const int *algo;
+  void *workspace;
+  size_t workspaceSizeInBytes;
+  dpct::queue_ptr stream;
+  dpct::blas_gemm::experimental::matmul(lightHandle, computeDesc, alpha, A,
+                                        Adesc, B, Bdesc, beta, C, Cdesc, D,
+                                        Ddesc, stream);
+}
+
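+// foo1()-foo4() are interface-only coverage: main() never calls them, so the
+// uninitialized handles and pointers are never dereferenced at run time.
+// Note that foo2() declares cublasLt-style algo/workspace variables but does
+// not forward them; the matmul wrapper above is invoked without them.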
+void foo3() {
+  dpct::blas_gemm::experimental::order_t a;
+  a = dpct::blas_gemm::experimental::order_t::col;
+  a = dpct::blas_gemm::experimental::order_t::row;
+  a = dpct::blas_gemm::experimental::order_t::col32;
+  a = dpct::blas_gemm::experimental::order_t::col4_4r2_8c;
+  a = dpct::blas_gemm::experimental::order_t::col32_2r_4r4;
+
+  dpct::blas_gemm::experimental::pointer_mode_t b;
+  b = dpct::blas_gemm::experimental::pointer_mode_t::host;
+  b = dpct::blas_gemm::experimental::pointer_mode_t::device;
+  b = dpct::blas_gemm::experimental::pointer_mode_t::device_vector;
+  b = dpct::blas_gemm::experimental::pointer_mode_t::
+      alpha_device_vector_beta_zero;
+  b = dpct::blas_gemm::experimental::pointer_mode_t::
+      alpha_device_vector_beta_host;
+
+  dpct::blas_gemm::experimental::matrix_layout_t::attribute c;
+  c = dpct::blas_gemm::experimental::matrix_layout_t::attribute::type;
+  c = dpct::blas_gemm::experimental::matrix_layout_t::attribute::order;
+  c = dpct::blas_gemm::experimental::matrix_layout_t::attribute::rows;
+  c = dpct::blas_gemm::experimental::matrix_layout_t::attribute::cols;
+  c = dpct::blas_gemm::experimental::matrix_layout_t::attribute::ld;
+
+  dpct::blas_gemm::experimental::matmul_desc_t::attribute d;
+  d = dpct::blas_gemm::experimental::matmul_desc_t::attribute::compute_type;
+  d = dpct::blas_gemm::experimental::matmul_desc_t::attribute::scale_type;
+  d = dpct::blas_gemm::experimental::matmul_desc_t::attribute::pointer_mode;
+  d = dpct::blas_gemm::experimental::matmul_desc_t::attribute::trans_a;
+  d = dpct::blas_gemm::experimental::matmul_desc_t::attribute::trans_b;
+  d = dpct::blas_gemm::experimental::matmul_desc_t::attribute::trans_c;
+  d = dpct::blas_gemm::experimental::matmul_desc_t::attribute::epilogue;
+}
+
+void foo4() {
+  dpct::blas_gemm::experimental::transform_desc_ptr transformDesc;
+  dpct::library_data_t scaleType;
+  transformDesc =
+      new dpct::blas_gemm::experimental::transform_desc_t(scaleType);
+  oneapi::mkl::transpose opT = oneapi::mkl::transpose::trans;
+  size_t sizeWritten;
+  transformDesc->set_attribute(
+      dpct::blas_gemm::experimental::transform_desc_t::attribute::trans_a,
+      &opT);
+  transformDesc->get_attribute(
+      dpct::blas_gemm::experimental::transform_desc_t::attribute::trans_a,
+      &opT);
+  delete (transformDesc);
+
+  dpct::blas_gemm::experimental::descriptor_ptr lightHandle;
+  const void *alpha;
+  const void *A;
+  dpct::blas_gemm::experimental::matrix_layout_ptr Adesc;
+  const void *beta;
+  const void *B;
+  dpct::blas_gemm::experimental::matrix_layout_ptr Bdesc;
+  void *C;
+  dpct::blas_gemm::experimental::matrix_layout_ptr Cdesc;
+  dpct::queue_ptr stream;
+  dpct::blas_gemm::experimental::matrix_transform(
+      transformDesc, alpha, A, Adesc, beta, B, Bdesc, C, Cdesc, stream);
+}
+
+int main() {
+  return 0;
+}
diff --git a/help_function/src/onedpl_test_reduce.cpp b/help_function/src/onedpl_test_reduce.cpp
index f12b5ff5e..2cf06eea5 100644
--- a/help_function/src/onedpl_test_reduce.cpp
+++ b/help_function/src/onedpl_test_reduce.cpp
@@ -18,7 +18,7 @@
 #include
 #include
-
+#include <limits>
@@ -378,6 +378,72 @@ int main() {
     }
 
+    // Testing calls to dpct::argmin and dpct::argmax functors with unique and equivalent values
+    {
+        auto queue = dpct::get_default_queue();
+        test_name = "oneapi::dpl::reduce with dpct::argmin functor - All values are unique";
+        std::size_t n = 10;
+        sycl::buffer<dpct::key_value_pair<int, int>> input(n);
+        {
+            auto host_acc = input.get_host_access();
+            for (std::size_t i = 0; i < n; ++i)
+                host_acc[i].key = i + 10, host_acc[i].value = i + 20;
+            // inject min and max
+            host_acc[4].key = 9;
+            host_acc[4].value = 8;
+            host_acc[6].key = 101;
+            host_acc[6].value = 99;
+        }
+        auto argmin_res = oneapi::dpl::reduce(dpl::execution::make_device_policy(queue),
+                                              oneapi::dpl::begin(input),
+                                              oneapi::dpl::end(input),
+                                              dpct::key_value_pair<int, int>(std::numeric_limits<int>::max(),
+                                                                             std::numeric_limits<int>::max()),
+                                              dpct::argmin());
+
+        failed_tests += ASSERT_EQUAL(test_name, argmin_res.key, 9);
+        failed_tests += ASSERT_EQUAL(test_name, argmin_res.value, 8);
+
+        test_name = "oneapi::dpl::reduce with dpct::argmax functor - All values are unique";
+        auto argmax_res = oneapi::dpl::reduce(dpl::execution::make_device_policy(queue),
+                                              oneapi::dpl::begin(input),
+                                              oneapi::dpl::end(input),
+                                              dpct::key_value_pair<int, int>(std::numeric_limits<int>::min(),
+                                                                             std::numeric_limits<int>::min()),
+                                              dpct::argmax());
+
+        failed_tests += ASSERT_EQUAL(test_name, argmax_res.key, 101);
+        failed_tests += ASSERT_EQUAL(test_name, argmax_res.value, 99);
+
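+        // Re-key the buffer (keys 30..39) with one shared value to exercise
+        // the equal-value tie-breaking path of both functors.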
+        test_name = "oneapi::dpl::reduce with dpct::argmin functor - All values are the same";
+        {
+            auto host_acc = input.get_host_access();
+            for (std::size_t i = 0; i < n; ++i)
+                host_acc[i].key = i + 30, host_acc[i].value = 2;
+        }
+        // Expect the key_value_pair with the lower key to be returned when value compares equal
+        argmin_res = oneapi::dpl::reduce(dpl::execution::make_device_policy(queue),
+                                         oneapi::dpl::begin(input),
+                                         oneapi::dpl::end(input),
+                                         dpct::key_value_pair<int, int>(std::numeric_limits<int>::max(),
+                                                                        std::numeric_limits<int>::max()),
+                                         dpct::argmin());
+
+        failed_tests += ASSERT_EQUAL(test_name, argmin_res.key, 30);
+        failed_tests += ASSERT_EQUAL(test_name, argmin_res.value, 2);
+
+        argmax_res = oneapi::dpl::reduce(dpl::execution::make_device_policy(queue),
+                                         oneapi::dpl::begin(input),
+                                         oneapi::dpl::end(input),
+                                         dpct::key_value_pair<int, int>(std::numeric_limits<int>::min(),
+                                                                        std::numeric_limits<int>::min()),
+                                         dpct::argmax());
+
+        // Expect the key_value_pair with the lower key to be returned when value compares equal
+        failed_tests += ASSERT_EQUAL(test_name, argmax_res.key, 30);
+        failed_tests += ASSERT_EQUAL(test_name, argmax_res.value, 2);
+    }
+
     std::cout << std::endl << failed_tests << " failing test(s) detected." << std::endl;
     if (failed_tests == 0) {
         return 0;
diff --git a/help_function/src/sparse_utils_1.cpp b/help_function/src/sparse_utils_1.cpp
index 7279dac85..a9debf19c 100644
--- a/help_function/src/sparse_utils_1.cpp
+++ b/help_function/src/sparse_utils_1.cpp
@@ -71,6 +71,10 @@ bool test1() {
   if (ib != oneapi::mkl::index_base::one)
     return false;
 
+  dpct::sparse::matrix_format mf;
+  mf = dpct::sparse::matrix_format::csr;
+  mf = dpct::sparse::matrix_format::coo;
+
   return true;
 }
diff --git a/help_function/test_help.py b/help_function/test_help.py
index b6c585270..52588464a 100644
--- a/help_function/test_help.py
+++ b/help_function/test_help.py
@@ -45,7 +45,7 @@ def build_test():
         "dnnl_utils_batch_normalization_2", "dnnl_utils_batch_normalization_3", "dnnl_utils_convolution_1",
         "dnnl_utils_convolution_2", "dnnl_utils_convolution_3", "dnnl_utils_convolution_4", "dnnl_utils_convolution_5",
         "dnnl_utils_normalization_1", "dnnl_utils_normalization_2", "dnnl_utils_normalization_3", "dnnl_utils_rnn",
-        "dnnl_utils_version", "dnnl_utils_dropout"]
+        "dnnl_utils_version", "dnnl_utils_dropout", "blas_gemm_utils_interface"]
     fft_cases = ["fft_utils_engine_buffer", "fft_utils_engine_usm", "fft_workspace_interface", "fft_set_workspace"]
     lapack_cases = ["lapack_utils_buffer", "lapack_utils_usm"]
     rng_cases = ["rng_generator", "rng_generator_vec_size_1", "rng_host"]