Skip to content
Permalink

Comparing changes

This is a direct comparison between two commits made in this repository or its related repositories. View the default comparison for this range or learn more about diff comparisons.

Open a pull request

Create a new pull request by comparing changes across two branches. If you need to, you can also compare across forks. Learn more about diff comparisons here.
base repository: kokkos/kokkos-kernels
Failed to load repositories. Confirm that selected base ref is valid, then try again.
Loading
base: 9c6a166c19830701fbb05c3ea2e2436bbc5e5390
Choose a base ref
..
head repository: kokkos/kokkos-kernels
Failed to load repositories. Confirm that selected head ref is valid, then try again.
Loading
compare: b075ed4b0c234fde1d0a116b2bb083e3f3f3c773
Choose a head ref
6 changes: 6 additions & 0 deletions blas/tpls/KokkosBlas1_dot_tpl_spec_avail.hpp
Original file line number Diff line number Diff line change
@@ -89,9 +89,15 @@ KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL_BLAS(Kokkos::complex<float>, Kokkos::LayoutLeft,
KOKKOSBLAS1_DOT_TPL_SPEC(Kokkos::complex<double>, LAYOUT, EXECSPACE, MEMSPACE)

#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS
// Note BMK: CUBLAS dot is consistently slower than our native dot
// (measured 11.2, 11.8, 12.0 using perf test, and all are similar)
// If a future version improves performance, re-enable it here and
// in the tpl_spec_decl file.
#if 0
KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::Cuda,
Kokkos::CudaSpace)
#endif
#endif

#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS
KOKKOSBLAS1_DOT_TPL_SPEC_AVAIL(Kokkos::LayoutLeft, Kokkos::HIP,
4 changes: 4 additions & 0 deletions blas/tpls/KokkosBlas1_dot_tpl_spec_decl.hpp
Original file line number Diff line number Diff line change
@@ -101,6 +101,9 @@ KOKKOSBLAS1_DOT_TPL_SPEC_DECL_BLAS_EXT(false)

// cuBLAS
#ifdef KOKKOSKERNELS_ENABLE_TPL_CUBLAS
// Disabled because native has better performance.
// See tpl_spec_avail file for more details
#if 0
#include <KokkosBlas_tpl_spec.hpp>

namespace KokkosBlas {
@@ -174,6 +177,7 @@ KOKKOSBLAS1_DOT_TPL_SPEC_DECL_CUBLAS_EXT(false)
} // namespace Impl
} // namespace KokkosBlas
#endif
#endif

// rocBLAS
#ifdef KOKKOSKERNELS_ENABLE_TPL_ROCBLAS
44 changes: 44 additions & 0 deletions common/impl/KokkosKernels_NaN.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,44 @@
//@HEADER
// ************************************************************************
//
// Kokkos v. 4.0
// Copyright (2022) National Technology & Engineering
// Solutions of Sandia, LLC (NTESS).
//
// Under the terms of Contract DE-NA0003525 with NTESS,
// the U.S. Government retains certain rights in this software.
//
// Part of Kokkos, under the Apache License v2.0 with LLVM Exceptions.
// See https://kokkos.org/LICENSE for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
//@HEADER

#ifndef KOKKOSKERNELS_NAN_HPP
#define KOKKOSKERNELS_NAN_HPP

#include <Kokkos_ArithTraits.hpp>
#include <Kokkos_NumericTraits.hpp>

namespace KokkosKernels::Impl {

// Returns a quiet NaN value of type T.
//
// For complex T, both components (of type T::value_type) are set to a quiet
// NaN. This function could be constexpr if the Kokkos::complex constructor
// were constexpr.
template <typename T>
KOKKOS_INLINE_FUNCTION T quiet_NaN() {
  if constexpr (Kokkos::ArithTraits<T>::is_complex) {
    // Build the complex NaN from a NaN of the underlying component type.
    using component_type = typename T::value_type;
    const component_type nan_component = quiet_NaN<component_type>();
    return T(nan_component, nan_component);
  } else if constexpr (std::is_same_v<T, double>) {
    // Kokkos::Experimental::quiet_NaN_v<double> is undefined in device code,
    // so widen the float quiet NaN to double instead.
    return static_cast<double>(Kokkos::Experimental::quiet_NaN_v<float>);
  } else {
    return Kokkos::Experimental::quiet_NaN_v<T>;
  }
}

} // namespace KokkosKernels::Impl

#endif // KOKKOSKERNELS_NAN_HPP
18 changes: 12 additions & 6 deletions sparse/impl/KokkosSparse_spmv_impl.hpp
Original file line number Diff line number Diff line change
@@ -450,16 +450,19 @@ static void spmv_beta_transpose(const execution_space& exec,
const AMatrix& A, const XVector& x,
typename YVector::const_value_type& beta,
const YVector& y) {
using ordinal_type = typename AMatrix::non_const_ordinal_type;
using size_type = typename AMatrix::non_const_size_type;
using ordinal_type = typename AMatrix::non_const_ordinal_type;
using size_type = typename AMatrix::non_const_size_type;
using y_scalar_type = typename YVector::non_const_value_type;

if (A.numRows() <= static_cast<ordinal_type>(0)) {
return;
}

// We need to scale y first ("scaling" by zero just means filling
// with zeros), since the functor works by atomic-adding into y.
if (dobeta != 1) {
if (0 == dobeta || y_scalar_type(0) == beta) {
Kokkos::deep_copy(exec, y, y_scalar_type(0));
} else if (dobeta != 1) {
KokkosBlas::scal(exec, y, beta, y);
}

@@ -540,16 +543,19 @@ static void spmv_beta_transpose(const execution_space& exec,
const AMatrix& A, const XVector& x,
typename YVector::const_value_type& beta,
const YVector& y) {
using ordinal_type = typename AMatrix::non_const_ordinal_type;
using size_type = typename AMatrix::non_const_size_type;
using ordinal_type = typename AMatrix::non_const_ordinal_type;
using size_type = typename AMatrix::non_const_size_type;
using y_scalar_type = typename YVector::non_const_value_type;

if (A.numRows() <= static_cast<ordinal_type>(0)) {
return;
}

// We need to scale y first ("scaling" by zero just means filling
// with zeros), since the functor works by atomic-adding into y.
if (dobeta != 1) {
if (0 == dobeta || y_scalar_type(0) == beta) {
Kokkos::deep_copy(exec, y, y_scalar_type(0));
} else if (dobeta != 1) {
KokkosBlas::scal(exec, y, beta, y);
}

6 changes: 5 additions & 1 deletion sparse/impl/KokkosSparse_spmv_impl_merge.hpp
Original file line number Diff line number Diff line change
@@ -309,7 +309,11 @@ struct SpmvMergeHierarchical {
static_assert(XVector::rank == 1, "");
static_assert(YVector::rank == 1, "");

KokkosBlas::scal(y, beta, y);
if (y_value_type(0) == beta) {
Kokkos::deep_copy(space, y, y_value_type(0));
} else {
KokkosBlas::scal(space, y, beta, y);
}

/* determine launch parameters for different architectures
On architectures where there is a natural execution hierarchy with true
56 changes: 13 additions & 43 deletions sparse/impl/KokkosSparse_spmv_spec.hpp
Original file line number Diff line number Diff line change
@@ -203,54 +203,24 @@ struct SPMV_MV<ExecutionSpace, Handle, AMatrix, XVector, YVector, false, false,
KOKKOSKERNELS_IMPL_COMPILE_LIBRARY> {
typedef typename YVector::non_const_value_type coefficient_type;

static void spmv_mv(const ExecutionSpace& space, Handle* handle,
// TODO: pass handle through to implementation and use tuning parameters
static void spmv_mv(const ExecutionSpace& space, Handle* /* handle */,
const char mode[], const coefficient_type& alpha,
const AMatrix& A, const XVector& x,
const coefficient_type& beta, const YVector& y) {
typedef Kokkos::ArithTraits<coefficient_type> KAT;
// Intercept special case: if x/y have only 1 column and both are
// contiguous, use the more efficient single-vector impl.
//
// We cannot do this if x or y is noncontiguous, because the column subview
// must be LayoutStride which is not ETI'd.
//
// Do not use a TPL even if one is available for the types:
// we don't want the same handle being used in both TPL and non-TPL versions
if (x.extent(1) == size_t(1) && x.span_is_contiguous() &&
y.span_is_contiguous()) {
Kokkos::View<typename XVector::const_value_type*, default_layout,
typename XVector::device_type>
x0(x.data(), x.extent(0));
Kokkos::View<typename YVector::non_const_value_type*, default_layout,
typename YVector::device_type>
y0(y.data(), y.extent(0));
if (beta == KAT::zero()) {
spmv_beta<ExecutionSpace, Handle, AMatrix, decltype(x0), decltype(y0),
0>(space, handle, mode, alpha, A, x0, beta, y0);
} else if (beta == KAT::one()) {
spmv_beta<ExecutionSpace, Handle, AMatrix, decltype(x0), decltype(y0),
1>(space, handle, mode, alpha, A, x0, beta, y0);
} else if (beta == -KAT::one()) {
spmv_beta<ExecutionSpace, Handle, AMatrix, decltype(x0), decltype(y0),
-1>(space, handle, mode, alpha, A, x0, beta, y0);
} else {
spmv_beta<ExecutionSpace, Handle, AMatrix, decltype(x0), decltype(y0),
2>(space, handle, mode, alpha, A, x0, beta, y0);
}
if (alpha == KAT::zero()) {
spmv_alpha_mv<ExecutionSpace, AMatrix, XVector, YVector, 0>(
space, mode, alpha, A, x, beta, y);
} else if (alpha == KAT::one()) {
spmv_alpha_mv<ExecutionSpace, AMatrix, XVector, YVector, 1>(
space, mode, alpha, A, x, beta, y);
} else if (alpha == -KAT::one()) {
spmv_alpha_mv<ExecutionSpace, AMatrix, XVector, YVector, -1>(
space, mode, alpha, A, x, beta, y);
} else {
if (alpha == KAT::zero()) {
spmv_alpha_mv<ExecutionSpace, AMatrix, XVector, YVector, 0>(
space, mode, alpha, A, x, beta, y);
} else if (alpha == KAT::one()) {
spmv_alpha_mv<ExecutionSpace, AMatrix, XVector, YVector, 1>(
space, mode, alpha, A, x, beta, y);
} else if (alpha == -KAT::one()) {
spmv_alpha_mv<ExecutionSpace, AMatrix, XVector, YVector, -1>(
space, mode, alpha, A, x, beta, y);
} else {
spmv_alpha_mv<ExecutionSpace, AMatrix, XVector, YVector, 2>(
space, mode, alpha, A, x, beta, y);
}
spmv_alpha_mv<ExecutionSpace, AMatrix, XVector, YVector, 2>(
space, mode, alpha, A, x, beta, y);
}
}
};
48 changes: 8 additions & 40 deletions sparse/src/KokkosSparse_spmv.hpp
Original file line number Diff line number Diff line change
@@ -40,31 +40,6 @@ struct RANK_ONE {};
struct RANK_TWO {};
} // namespace

namespace Impl {
template <typename ExecutionSpace, typename Handle, typename AMatrix,
typename XVector, class YVector>
inline constexpr bool spmv_general_tpl_avail() {
constexpr bool isBSR = ::KokkosSparse::Experimental::is_bsr_matrix_v<AMatrix>;
if constexpr (!isBSR) {
// CRS
if constexpr (XVector::rank() == 1)
return spmv_tpl_spec_avail<ExecutionSpace, Handle, AMatrix, XVector,
YVector>::value;
else
return spmv_mv_tpl_spec_avail<ExecutionSpace, Handle, AMatrix, XVector,
YVector>::value;
} else {
// BSR
if constexpr (XVector::rank() == 1)
return spmv_bsrmatrix_tpl_spec_avail<ExecutionSpace, Handle, AMatrix,
XVector, YVector>::value;
else
return spmv_mv_bsrmatrix_tpl_spec_avail<ExecutionSpace, Handle, AMatrix,
XVector, YVector>::value;
}
}
} // namespace Impl

// clang-format off
/// \brief Kokkos sparse matrix-vector multiply.
/// Computes y := alpha*Op(A)*x + beta*y, where Op(A) is
@@ -247,8 +222,8 @@ void spmv(const ExecutionSpace& space, Handle* handle, const char mode[],
typename YVector::device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged>>;

// Special case: XVector/YVector are rank-2 but x,y both have one column and
// are contiguous. If a TPL is available for rank-1 vectors but not rank-2,
// take rank-1 subviews of x,y and call the rank-1 version.
// are contiguous. In this case take rank-1 subviews of x,y and call the
// rank-1 version.
if constexpr (XVector::rank() == 2) {
using XVector_SubInternal = Kokkos::View<
typename XVector::const_value_type*,
@@ -259,19 +234,12 @@ void spmv(const ExecutionSpace& space, Handle* handle, const char mode[],
typename YVector::non_const_value_type*,
typename KokkosKernels::Impl::GetUnifiedLayout<YVector>::array_layout,
typename YVector::device_type, Kokkos::MemoryTraits<Kokkos::Unmanaged>>;
if constexpr (!Impl::spmv_general_tpl_avail<
ExecutionSpace, HandleImpl, AMatrix_Internal,
XVector_Internal, YVector_Internal>() &&
Impl::spmv_general_tpl_avail<
ExecutionSpace, HandleImpl, AMatrix_Internal,
XVector_SubInternal, YVector_SubInternal>()) {
if (x.extent(1) == size_t(1) && x.span_is_contiguous() &&
y.span_is_contiguous()) {
XVector_SubInternal xsub(x.data(), x.extent(0));
YVector_SubInternal ysub(y.data(), y.extent(0));
spmv(space, handle->get_impl(), mode, alpha, A, xsub, beta, ysub);
return;
}
if (x.extent(1) == size_t(1) && x.span_is_contiguous() &&
y.span_is_contiguous()) {
XVector_SubInternal xsub(x.data(), x.extent(0));
YVector_SubInternal ysub(y.data(), y.extent(0));
spmv(space, handle->get_impl(), mode, alpha, A, xsub, beta, ysub);
return;
}
}

9 changes: 5 additions & 4 deletions sparse/src/KokkosSparse_spmv_handle.hpp
Original file line number Diff line number Diff line change
@@ -234,17 +234,18 @@ struct SPMVHandleImpl {
"SPMVHandleImpl: Ordinal must not be a const type");
SPMVHandleImpl(SPMVAlgorithm algo_) : algo(algo_) {}
~SPMVHandleImpl() {
if (tpl) delete tpl;
if (tpl_rank1) delete tpl_rank1;
if (tpl_rank2) delete tpl_rank2;
}

ImplType* get_impl() { return this; }

/// Get the SPMVAlgorithm used by this handle
SPMVAlgorithm get_algorithm() const { return this->algo; }

bool is_set_up = false;
const SPMVAlgorithm algo = SPMV_DEFAULT;
TPL_SpMV_Data<ExecutionSpace>* tpl = nullptr;
const SPMVAlgorithm algo = SPMV_DEFAULT;
TPL_SpMV_Data<ExecutionSpace>* tpl_rank1 = nullptr;
TPL_SpMV_Data<ExecutionSpace>* tpl_rank2 = nullptr;
// Expert tuning parameters for native SpMV
// TODO: expose a proper Experimental interface to set these. Currently they
// can be assigned directly in the SPMVHandle as they are public members.
Loading